## 라이브러리

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

## 데이터 불러오기

In [3]:
# Folder that holds the input CSV files (relative path).
path = './data/'

# Read the merged dataset into a DataFrame.
df = pd.read_csv(f'{path}distance_merged.csv')

In [4]:
# Show the loaded frame; the rendered output elides the middle rows.
df

Unnamed: 0,Region_Name,Building_Age,Building_Use,JS_Price,JS_BA,Population,IR,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ,YearMonth,Shortest_Distance_to_Park
0,성동구,19,아파트,22000,59.97,292786.00,2.00,3.0,81.4,86.6,84.9,115.756491,77.5,38683.33,1.178712,555.986308,295.569643,1339.504535,201411,1028.419509
1,도봉구,18,아파트,16000,84.98,357523.33,3.00,4.3,72.5,78.6,74.9,106.866464,81.7,29800.00,0.848460,2027.675179,149.315861,765.750961,201103,627.870957
2,송파구,35,아파트,21000,46.54,654241.00,1.75,3.7,83.7,88.0,86.0,120.348837,80.9,68855.00,1.015461,255.774896,472.015224,2129.795311,201505,1405.703845
3,노원구,24,아파트,13000,59.28,588021.80,3.25,3.3,72.7,79.5,76.5,108.376186,80.1,23250.00,0.906348,283.565040,291.020714,517.741729,201107,1015.223086
4,관악구,14,아파트,34000,84.87,508137.00,2.50,4.5,79.2,85.6,82.7,121.038496,75.7,40303.85,1.274720,556.539907,273.268241,989.162559,201402,944.475850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,강서구,18,오피스텔,17000,28.45,567898.00,1.75,3.0,108.8,108.0,109.1,94.156094,180.2,27150.00,0.671993,1733.008129,386.728023,1161.499230,202206,29.148712
2996,노원구,27,아파트,15000,39.82,543267.00,1.50,4.1,93.8,98.1,94.4,89.535769,107.4,29095.45,0.730991,769.700633,196.588060,694.360317,201804,1469.233740
2997,성동구,15,아파트,25000,84.91,294446.00,2.75,3.2,76.1,82.6,80.3,114.344638,73.7,34142.86,1.084889,289.844463,129.800930,1369.175383,201304,527.038587
2998,영등포구,15,아파트,110000,156.66,366258.00,0.50,4.5,98.5,98.3,99.5,104.100145,135.1,157500.00,1.280663,174.634578,274.653099,3134.071537,202005,444.209048


In [5]:
# Print each column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Region_Name                  3000 non-null   object 
 1   Building_Age                 3000 non-null   int64  
 2   Building_Use                 3000 non-null   object 
 3   JS_Price                     3000 non-null   int64  
 4   JS_BA                        3000 non-null   float64
 5   Population                   3000 non-null   float64
 6   IR                           3000 non-null   float64
 7   UR                           3000 non-null   float64
 8   LC_index                     3000 non-null   float64
 9   CA_index                     3000 non-null   float64
 10  TC_index                     3000 non-null   float64
 11  SDT_index                    3000 non-null   float64
 12  HSP_index                    3000 non-null   float64
 13  Sell_Price        

In [6]:
# List the column labels of the frame.
df.columns

Index(['Region_Name', 'Building_Age', 'Building_Use', 'JS_Price', 'JS_BA',
       'Population', 'IR', 'UR', 'LC_index', 'CA_index', 'TC_index',
       'SDT_index', 'HSP_index', 'Sell_Price', 'Crime_Rates',
       'Shortest_Distance_to_Subway', 'Shortest_Distance_to_School',
       'Shortest_Distance_to_Univ', 'YearMonth', 'Shortest_Distance_to_Park'],
      dtype='object')

## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories, just like nominal variables, except that the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - YearMonth : 년월
    - Building_Age : 건물연식
    - JS_Price : 전세가
   
- Continuous
    - Sell_Price : 매매 가격
    - JS_BA = JS_Building Area : 임대 면적
    - IR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index : 전세수급동향 지수
    - HSP_index : (설명 누락 — TODO: 지수의 정의를 추가할 것)
    - Population : 인구수
    - Crime_Rates : 범죄율
    - Shortest_Distance_to_Subway : 가장 가까운 지하철역과의 거리
    - Shortest_Distance_to_School : 가장 가까운 초중고등학교와의 거리
    - Shortest_Distance_to_Univ : 가장 가까운 대학교와의 거리
    - Shortest_Distance_to_Park : 가장 가까운 공원과의 거리
    

## PCC

In [7]:
# Pearson correlation (PCC) between 'JS_Price' and every other numeric column.
# Rows are gathered in a list and converted to a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 (and appending row by row copies
# the whole frame on every iteration).
pcc_records = []
for column in df.columns:
    # Skip the target itself and any column pearsonr cannot handle
    # (strings, datetimes, ...), not just object-dtype columns.
    if column == 'JS_Price' or not pd.api.types.is_numeric_dtype(df[column]):
        continue
    correlation, p_value = pearsonr(df['JS_Price'], df[column])
    pcc_records.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

# Passing `columns` keeps the expected layout even when no column qualified.
result_df = pd.DataFrame(pcc_records, columns=['Column_Name', 'PCC', 'p-value'])

In [8]:
# Rank the columns by PCC, largest first, with a fresh 0..n-1 index.
result_df.sort_values('PCC', ascending=False, ignore_index=True)

Unnamed: 0,Column_Name,PCC,p-value
0,Sell_Price,0.748109,0.0
1,JS_BA,0.607727,1.911767e-302
2,LC_index,0.309659,1.116502e-67
3,CA_index,0.308894,2.451757e-67
4,TC_index,0.306818,2.049165e-66
5,YearMonth,0.305647,6.742085000000001e-66
6,HSP_index,0.268737,8.561737e-51
7,Shortest_Distance_to_Univ,0.071912,8.071581e-05
8,UR,0.053656,0.003284606
9,Crime_Rates,0.051631,0.004674763


## ANOVA (continuous vs discrete)

In [25]:
# NOTE(review): this ANOVA helper is disabled (commented out), as is its call in
# the next cell. If it is re-enabled, check whether 'JS_Price' itself should be
# skipped: df.info() reports it as int64, so the integer-dtype test below would
# group 'JS_Price' by its own values.
# def anova(df):
#     for column in df.columns:
#         if np.issubdtype(df[column].dtype, np.integer):
#             # This branch runs the one-way ANOVA only for integer-typed columns.
#             f_statistic, p_value = stats.f_oneway(*[group for name, group in df.groupby(column)['JS_Price']])
#             print(column)
#             print("F-statistic:", f_statistic, "p-value:", p_value)

In [26]:
# anova(df)

## KCC (continuous vs categorical)

- 범주형 변수 더미화

In [9]:
def oh_encoding(df):
    """One-hot encode every object-dtype column of ``df``.

    Each column whose dtype is ``object`` is replaced by dummy columns named
    ``<column>_<level>``; all other columns are kept as they are. The name of
    every encoded column is printed, and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the untouched columns first, followed by the dummy
        columns of each encoded column in their original order.
    """
    object_columns = [name for name in df.columns if df[name].dtype == object]
    if not object_columns:
        # Nothing to encode: hand back an independent copy of the input.
        return df.copy()
    for name in object_columns:
        print(name)
    return pd.get_dummies(df, columns=object_columns, prefix=object_columns)

# One-hot encode the object-typed columns of df; the call prints each encoded column name.
df_encoded = oh_encoding(df)

Region_Name
Building_Use


In [10]:
# Kendall rank correlation (KCC) between 'JS_Price' and each one-hot dummy column.
# NOTE: this reuses the name result_df, replacing the PCC table built earlier.
# Rows are gathered in a list and converted to a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 (and appending row by row copies
# the whole frame on every iteration).
kcc_records = []
for column in df_encoded.columns:
    # Only the dummy columns derived from Region_Name / Building_Use are tested;
    # 'JS_Price' never matches these prefixes, so no separate check is needed.
    if not column.startswith(('Region', 'Building_Use')):
        continue
    kendall_corr, p_value = kendalltau(df_encoded['JS_Price'], df_encoded[column])
    kcc_records.append({'Column_Name': column, 'KCC': kendall_corr, 'p-value': p_value})

# Passing `columns` keeps the expected layout even when no column qualified.
result_df = pd.DataFrame(kcc_records, columns=['Column_Name', 'KCC', 'p-value'])

In [11]:
# Rank the dummy columns by KCC, largest first, with a fresh 0..n-1 index.
result_df.sort_values('KCC', ascending=False, ignore_index=True)

Unnamed: 0,Column_Name,KCC,p-value
0,Building_Use_아파트,0.253038,7.577532e-64
1,Region_Name_강남구,0.150088,1.436609e-23
2,Region_Name_송파구,0.14515,3.787261e-22
3,Region_Name_서초구,0.124688,9.369829000000001e-17
4,Region_Name_동작구,0.069843,3.220416e-06
5,Region_Name_성동구,0.047309,0.001610775
6,Region_Name_용산구,0.044741,0.002856693
7,Region_Name_종로구,0.043872,0.003446793
8,Region_Name_광진구,0.041761,0.005368454
9,Region_Name_중구,0.041297,0.005902305


- p-value값 고려
    - PCC 결과 유의수준 α=0.05에서 Population, Shortest_Distance_to_Park, SDT_index는 상관관계가 통계적으로 유의하지 않으므로 변수에서 제외한다.
    - KCC 결과 유의수준 α=0.05에서 Region_Name_관악구, Region_Name_서대문구, Region_Name_성북구, Region_Name_영등포구, Region_Name_동대문구는 상관관계가 통계적으로 유의하지 않으므로 변수에서 제외한다.
- correlation값 고려
    - PCC 결과 상관계수 절댓값이 0.1 이하인 변수 제외 -> Population, Shortest_Distance_to_Park, SDT_index, Shortest_Distance_to_Univ, UR, Crime_Rates, Shortest_Distance_to_School를 변수에서 제외
    - KCC 결과 상관계수 절댓값이 0.1 이하인 변수 제외 -> Building_Use_아파트, Region_Name_송파구, Region_Name_서초구, Region_Name_강남구, Region_Name_도봉구, Building_Use_연립다세대, Building_Use_오피스텔, Region_Name_노원구만을 변수에 포함