In [132]:
import pandas as pd
import numpy as np

### 전처리!

In [133]:
df = pd.read_csv('D:/DSL/EDA/Data/Original/Overall_002.csv', encoding = 'cp949')

df_h = df.loc[df['학제유형명'].str.contains('고등학교')]

df_h = df_h[['조사년도', '개방ID', '학교명', '시도명', '유초중등학교개황_학생수']]

df_h.head()

Unnamed: 0,조사년도,개방ID,학교명,시도명,유초중등학교개황_학생수
2,2009,1001306772,인천소방고등학교,인천,1305
9,2009,1003973424,왕신여자고등학교,전북,304
23,2009,1012180447,춘천한샘고등학교,강원,880
40,2009,1018631696,서운고등학교,인천,1275
43,2009,1020258724,순천미래과학고등학교,전남,623


In [134]:
df_h['학교_식별자'] = df_h['학교명'] + ' (' + df_h['시도명'] + ')'

df_h = df_h.drop(['학교명', '시도명'], axis = 1)
df_h.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35039 entries, 2 to 313566
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   조사년도          35039 non-null  int64 
 1   개방ID          35039 non-null  int64 
 2   유초중등학교개황_학생수  35039 non-null  int64 
 3   학교_식별자        35039 non-null  object
dtypes: int64(3), object(1)
memory usage: 1.3+ MB


In [135]:
location = pd.read_csv('D:/DSL/EDA/Data/location.csv', encoding = 'utf-8-sig')

location = location.loc[location['학교급구분'] == '고등학교']

def suffix(region):
    pattern = region[:4]
    if pattern not in ['경상북도', '경상남도', '전라북도', '전라남도', '충청북도', '충청남도']:
        return region[:2]
    else:
        return pattern[0] + pattern[2]
    
location['학교_식별자'] = location['학교명'] + ' (' + location['시도교육청명'].apply(suffix) + ')'

location = location[['학교_식별자', '소재지지번주소', '위도', '경도']]

location.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2385 entries, 0 to 11982
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   학교_식별자   2385 non-null   object 
 1   소재지지번주소  2385 non-null   object 
 2   위도       2385 non-null   float64
 3   경도       2385 non-null   float64
dtypes: float64(2), object(2)
memory usage: 93.2+ KB


### 위치에 병합, pivot table 형태로 표현

In [136]:
df_merged = df_h.merge(location, on = '학교_식별자', how = 'inner')

df_merged = df_merged.rename(columns = {'유초중등학교개황_학생수': '학생수'})

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34220 entries, 0 to 34219
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   조사년도     34220 non-null  int64  
 1   개방ID     34220 non-null  int64  
 2   학생수      34220 non-null  int64  
 3   학교_식별자   34220 non-null  object 
 4   소재지지번주소  34220 non-null  object 
 5   위도       34220 non-null  float64
 6   경도       34220 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 1.8+ MB


In [137]:
df_table = df_merged.pivot_table(index = ['학교_식별자', '소재지지번주소', '개방ID', '위도', '경도'], columns = '조사년도', values = '학생수')

df_table = df_table.fillna(0).astype(int)

df_table.to_csv('D:/DSL/EDA/Data/high_table.csv', sep = ',', encoding = 'cp949')

### 학생 수 예측

In [138]:
df_birth = pd.read_csv('D:/DSL/EDA/Data/지역별_출생률.csv', encoding = 'utf-8-sig')
df_birth['시군구별'] = df_birth['시군구별'].str.strip()

tmp = df_birth[df_birth.columns[1:]]
tmp.columns = tmp.columns.astype(int)
tmp = tmp.fillna(0).astype(int)
df_birth = pd.concat([df_birth['시군구별'], tmp], axis = 1)

df_birth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   시군구별    250 non-null    object
 1   1997    250 non-null    int64 
 2   1998    250 non-null    int64 
 3   1999    250 non-null    int64 
 4   2000    250 non-null    int64 
 5   2001    250 non-null    int64 
 6   2002    250 non-null    int64 
 7   2003    250 non-null    int64 
 8   2004    250 non-null    int64 
 9   2005    250 non-null    int64 
 10  2006    250 non-null    int64 
 11  2007    250 non-null    int64 
 12  2008    250 non-null    int64 
 13  2009    250 non-null    int64 
 14  2010    250 non-null    int64 
 15  2011    250 non-null    int64 
 16  2012    250 non-null    int64 
 17  2013    250 non-null    int64 
 18  2014    250 non-null    int64 
 19  2015    250 non-null    int64 
 20  2016    250 non-null    int64 
 21  2017    250 non-null    int64 
 22  2018    250 non-null    in

In [139]:
df_est = pd.DataFrame({'시군구별': df_birth['시군구별']})

for year in range(2015, 2040):  # 2012년부터 2036년까지 추정
    required_years = [year - 16, year - 17, year - 18]  # 해당 연도에 중학생이 되는 출생연도
    valid_years = [y for y in required_years if y in df_birth.columns]  # 실제 데이터에 존재하는 연도만 사용
    
    # 출생아 수 합산 (해당 연도가 존재하지 않는 경우 0으로 처리)
    df_est[f'{year}_해당지역예측학생수'] = df_birth[valid_years].sum(axis=1, numeric_only=True) if valid_years else 0

# 0이 포함된 행이 있을 경우 마지막 0이 위치한 이후 연도의 첫 번째, 두 번째 셀을 0으로 설정
for idx, row in df_est.iloc[:, 1:].iterrows():
    zero_indices = row[row == 0].index
    if len(zero_indices) > 0:
        last_zero_idx = zero_indices[-1]
        next_columns = df_est.columns[df_est.columns.get_loc(last_zero_idx) + 1:]
        if len(next_columns) > 0:
            df_est.loc[idx, next_columns[0]] = 0
        if len(next_columns) > 1:
            df_est.loc[idx, next_columns[1]] = 0

df_est

Unnamed: 0,시군구별,2015_해당지역예측학생수,2016_해당지역예측학생수,2017_해당지역예측학생수,2018_해당지역예측학생수,2019_해당지역예측학생수,2020_해당지역예측학생수,2021_해당지역예측학생수,2022_해당지역예측학생수,2023_해당지역예측학생수,...,2030_해당지역예측학생수,2031_해당지역예측학생수,2032_해당지역예측학생수,2033_해당지역예측학생수,2034_해당지역예측학생수,2035_해당지역예측학생수,2036_해당지역예측학생수,2037_해당지역예측학생수,2038_해당지역예측학생수,2039_해당지역예측학생수
0,서울특별시 종로구,6081,5852,5563,5075,4540,4153,3843,3578,3486,...,2878,2684,2657,2441,2179,2006,1846,1730,1521,1388
1,서울특별시 중구,4012,4170,4441,4611,4189,3872,3559,3478,3494,...,3278,3013,2950,2717,2500,2324,2206,2100,1904,1748
2,서울특별시 용산구,8629,8773,8709,8090,7243,6755,6528,6463,6651,...,6516,6072,5651,5139,4549,4155,3867,3724,3520,3356
3,서울특별시 성동구,13872,13892,13702,13108,12087,11466,10917,10512,10589,...,8327,8245,7938,7746,7339,6999,6482,5969,5468,5006
4,서울특별시 광진구,18552,17851,16313,14713,13220,12348,11544,10871,10677,...,9320,8952,8729,8065,7311,6501,5686,4973,4294,3993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,함양군,1291,1309,1213,1117,970,870,830,829,880,...,698,670,610,547,488,446,391,340,280,253
246,거창군,2229,2172,2009,1822,1567,1478,1355,1305,1294,...,1184,1107,1066,984,912,798,742,679,616,589
247,합천군,1707,1574,1431,1239,1166,1076,1011,889,903,...,685,609,550,466,422,385,369,330,290,241
248,제주시,17551,17740,17462,16466,15286,14358,13767,12968,12942,...,13244,12936,12973,12411,11792,10983,10239,9384,8731,8172


In [140]:
df_table = pd.read_csv('D:/DSL/EDA/Data/high_table.csv', encoding = 'cp949')

df_table = df_table.dropna(subset=['소재지지번주소'])

# '소재지도로명주소'에서 '시군구별' 값이 포함된 경우 매핑
def find_region(address):
    for region in df_est['시군구별']:
        if region in address:
            return region
    return None

# 학생 수 데이터에 '출생아수' 정보를 추가
df_table['시군구별'] = df_table['소재지지번주소'].apply(find_region)

# 매칭된 데이터와 출생아수 데이터 병합
df_final = df_table.merge(df_est, on='시군구별', how='left')

#열 순서 정렬
col_to_move = '시군구별'
target_index = 5

cols = [col for col in df_final.columns if col != col_to_move]
cols.insert(target_index, col_to_move)

df_final = df_final[cols]

# 예측학생 수 정수 변환
df_final.iloc[:, 19:] = df_final.iloc[:, 19:].fillna(0).astype(int)

df_final.head()

Unnamed: 0,학교_식별자,소재지지번주소,개방ID,위도,경도,시군구별,2009,2010,2011,2012,...,2030_해당지역예측학생수,2031_해당지역예측학생수,2032_해당지역예측학생수,2033_해당지역예측학생수,2034_해당지역예측학생수,2035_해당지역예측학생수,2036_해당지역예측학생수,2037_해당지역예측학생수,2038_해당지역예측학생수,2039_해당지역예측학생수
0,가곡고등학교 (강원),강원특별자치도 삼척시 가곡면 오저리 88,7225805163,37.146346,129.204827,삼척시,12,15,11,13,...,1376.0,1275.0,1220.0,1139.0,1059.0,1044.0,1065.0,1011.0,940.0,858.0
1,가락고등학교 (서울),서울특별시 송파구 송파동 172,5066465437,37.501076,127.116426,서울특별시 송파구,1669,1600,1553,1489,...,18823.0,17863.0,17400.0,16375.0,15043.0,13967.0,12782.0,11684.0,10544.0,9721.0
2,가림고등학교 (인천),인천광역시 서구 가좌동 30-92,9869282543,37.492619,126.681202,인천광역시 서구,1464,1381,1332,1230,...,15445.0,15302.0,14897.0,13635.0,12732.0,12099.0,11803.0,11029.0,10441.0,10499.0
3,가야고등학교 (부산),부산광역시 부산진구 가야동 산33-3,3730158329,35.146303,129.031088,부산광역시 부산진구,1576,1430,1304,1197,...,9572.0,9126.0,8748.0,8009.0,6998.0,6001.0,5181.0,4727.0,4530.0,4677.0
4,가온고등학교 (경기),경기도 안성시 발화동 215-7,8473934650,36.988421,127.276022,안성시,1103,1083,1069,1049,...,5318.0,4854.0,4521.0,4145.0,3679.0,3280.0,2875.0,2679.0,2521.0,2417.0


In [141]:
df_final.to_csv('D:/DSL/EDA/Data/고등학교_데이터셋.csv', index = False, sep = ',', encoding = 'cp949')

### 예측된 학생 수에 따라 고등학교의 예상 학생 수 회귀

In [142]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2349 entries, 0 to 2348
Data columns (total 46 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   학교_식별자          2349 non-null   object 
 1   소재지지번주소         2349 non-null   object 
 2   개방ID            2349 non-null   int64  
 3   위도              2349 non-null   float64
 4   경도              2349 non-null   float64
 5   시군구별            2331 non-null   object 
 6   2009            2349 non-null   int64  
 7   2010            2349 non-null   int64  
 8   2011            2349 non-null   int64  
 9   2012            2349 non-null   int64  
 10  2013            2349 non-null   int64  
 11  2014            2349 non-null   int64  
 12  2015            2349 non-null   int64  
 13  2016            2349 non-null   int64  
 14  2017            2349 non-null   int64  
 15  2018            2349 non-null   int64  
 16  2019            2349 non-null   int64  
 17  2020            2349 non-null   i

In [150]:
import numpy as np
from sklearn.linear_model import LinearRegression

df_final.columns

Index(['학교_식별자', '소재지지번주소', '개방ID', '위도', '경도', '시군구별', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', '2015_해당지역예측학생수', '2016_해당지역예측학생수',
       '2017_해당지역예측학생수', '2018_해당지역예측학생수', '2019_해당지역예측학생수', '2020_해당지역예측학생수',
       '2021_해당지역예측학생수', '2022_해당지역예측학생수', '2023_해당지역예측학생수', '2024_해당지역예측학생수',
       '2025_해당지역예측학생수', '2026_해당지역예측학생수', '2027_해당지역예측학생수', '2028_해당지역예측학생수',
       '2029_해당지역예측학생수', '2030_해당지역예측학생수', '2031_해당지역예측학생수', '2032_해당지역예측학생수',
       '2033_해당지역예측학생수', '2034_해당지역예측학생수', '2035_해당지역예측학생수', '2036_해당지역예측학생수',
       '2037_해당지역예측학생수', '2038_해당지역예측학생수', '2039_해당지역예측학생수'],
      dtype='object')

In [None]:
from scipy.stats import pearsonr

train_X = np.array(df_final.iloc[:, 21:30])
train_y = np.array(df_final.iloc[:, 12:21])

def rowwise_correlation(A, B):
    """
    각 행별로 Pearson 상관 계수를 계산하여 반환하는 함수
    :param A: (n, m) 크기의 numpy 배열
    :param B: (n, m) 크기의 numpy 배열
    :return: (n,) 크기의 numpy 배열 (각 행의 Pearson 상관 계수)
    """
    return np.array([pearsonr(A[i], B[i])[0] for i in range(A.shape[0])])

corr = rowwise_correlation(train_X, train_y)
threshold = 0.8

indices = np.where(corr >= threshold)[0]

train_X = df_final.iloc[indices, 21:30]
train_y = df_final.iloc[indices, 12:21]

  return np.array([pearsonr(A[i], B[i])[0] for i in range(A.shape[0])])


Unnamed: 0,2015_해당지역예측학생수,2016_해당지역예측학생수,2017_해당지역예측학생수,2018_해당지역예측학생수,2019_해당지역예측학생수,2020_해당지역예측학생수,2021_해당지역예측학생수,2022_해당지역예측학생수,2023_해당지역예측학생수
0,2636.0,2718.0,2570.0,2311.0,1970.0,1814.0,1647.0,1560.0,1526.0
1,26843.0,26349.0,24846.0,22942.0,20417.0,18763.0,17405.0,16430.0,16624.0
2,15753.0,15428.0,14758.0,13624.0,12117.0,11063.0,10803.0,10944.0,11809.0
3,13620.0,13383.0,12909.0,12127.0,11046.0,10309.0,9791.0,9536.0,9820.0
6,2369.0,2345.0,2257.0,2029.0,1775.0,1541.0,1415.0,1304.0,1335.0
...,...,...,...,...,...,...,...,...,...
2343,13320.0,12985.0,12322.0,11797.0,10770.0,10235.0,9687.0,9557.0,9561.0
2344,18180.0,18430.0,17340.0,15735.0,13885.0,12978.0,12490.0,12460.0,13165.0
2345,13320.0,12985.0,12322.0,11797.0,10770.0,10235.0,9687.0,9557.0,9561.0
2347,15866.0,16223.0,16204.0,15536.0,14338.0,13216.0,12550.0,12112.0,12364.0


In [None]:
model = LinearRegression()
model.fit(train_X, train_y)

