In [1]:
import pandas as pd

In [2]:
# Collect the names of all .xlsx files in the current working directory.

import os

# NOTE(review): os.listdir() returns entries in arbitrary order, and later
# code reads files[0]; confirm the first entry is the intended input file.
files = [entry for entry in os.listdir() if entry.endswith('.xlsx')]

print(files)

['전처리(22.04.03).xlsx', 'Preprocessed.xlsx', 'result.xlsx']


In [3]:
# Load the first spreadsheet from the listing and keep its column names as a list.
source_file = files[0]
data = pd.read_excel(source_file)
features = data.columns.tolist()

In [4]:
# Columns that must not be part of the feature matrix x
features_to_remove = ['자치구', '동', '안심택배함수']

for column in features_to_remove:
    try:
        features.remove(column)
    except ValueError:
        # list.remove raises ValueError when the name is not in the list;
        # catching only that keeps unrelated errors visible.
        print('There is no column named ' + column)

In [5]:
# Split the loaded data into the feature matrix x and the target y.

# Convert every feature column to float. A column that cannot be converted is
# reported by name and left out of x, so that no placeholder string column
# reaches the scaler later on.
converted = {}

for column in features:
    try:
        converted[column] = data[column].astype(float)
    except (ValueError, TypeError):
        print(column)

# Build x in one step instead of growing it column by column
x = pd.DataFrame(converted, index=data.index)

# Keep `features` aligned with the columns actually present in x
# (identical to the previous list when every conversion succeeds)
features = x.columns.tolist()

# Target: number of lockers per district, as integers
y = data['안심택배함수'].astype(int)

In [6]:
# 성분의 크기에 따른 분산량 왜곡을 막기 위한 표준화 패키지

from sklearn.preprocessing import StandardScaler

In [7]:
# Standardise the features with a StandardScaler and wrap the resulting array
# back into a DataFrame that carries the feature names.

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x.to_numpy())
x_trans = pd.DataFrame(data=x_scaled, columns=features)

In [8]:
# 선형 차원 축소 중 주성분 분석 (Principal Component Analysis) 패키지

from sklearn.decomposition import PCA

In [9]:
# Fit PCA with 10 through 14 components and print the explained variance of
# each fit, in order to pick a component count with adequate explanatory power.

for n_comp in range(10, 15):
    pca = PCA(n_components=n_comp)
    pca.fit(x_trans)  # only the fitted ratios are needed here, not the projection
    ratios = pca.explained_variance_ratio_
    report = 'The number of Principal components : {} \n Explained Variance Ratios : {} \n The Sum of Explained Variance Ratios : {}'.format(
        n_comp, ratios, sum(ratios))
    print(report)
    print()

The number of Principal components : 10 
 Explained Variance Ratios : [0.24551941 0.13881536 0.11325475 0.09884774 0.07043014 0.05871246
 0.04186105 0.03976766 0.03176228 0.02947055] 
 The Sum of Explained Variance Ratios : 0.8684413984029842

The number of Principal components : 11 
 Explained Variance Ratios : [0.24551941 0.13881536 0.11325475 0.09884774 0.07043014 0.05871246
 0.04186105 0.03976766 0.03176228 0.02947055 0.02865634] 
 The Sum of Explained Variance Ratios : 0.8970977344066479

The number of Principal components : 12 
 Explained Variance Ratios : [0.24551941 0.13881536 0.11325475 0.09884774 0.07043014 0.05871246
 0.04186105 0.03976766 0.03176228 0.02947055 0.02865634 0.02517014] 
 The Sum of Explained Variance Ratios : 0.9222678780651069

The number of Principal components : 13 
 Explained Variance Ratios : [0.24551941 0.13881536 0.11325475 0.09884774 0.07043014 0.05871246
 0.04186105 0.03976766 0.03176228 0.02947055 0.02865634 0.02517014
 0.01891235] 
 The Sum of Expla

In [10]:
# With 12 components the cumulative explained variance exceeds 0.9 for the
# first time (11 components stay just below it), so 12 components are used.
# The target level of explained variance is still open for discussion.

n = 12
pca = PCA(n_components=n)
pca_values = pca.fit_transform(x_trans)

ratios = pca.explained_variance_ratio_
summary = 'The number of Principal components : {} \n Explained Variance Ratios : {} \n The Sum of Explained Variance Ratios : {}'.format(
    n, ratios, sum(ratios))
print(summary)

The number of Principal components : 12 
 Explained Variance Ratios : [0.24551941 0.13881536 0.11325475 0.09884774 0.07043014 0.05871246
 0.04186105 0.03976766 0.03176228 0.02947055 0.02865634 0.02517014] 
 The Sum of Explained Variance Ratios : 0.9222678780651069


In [11]:
# Column labels for the principal components: 'PC 1' through 'PC n'
ch = 'PC '
names = [ch + str(k) for k in range(1, n + 1)]

In [12]:
# Wrap the projected values in a DataFrame; the CSV export below is disabled.

principal_df = pd.DataFrame(pca_values, columns=names)
# principal_df.to_csv('Full_PCA.csv', encoding = 'cp949')  # cp949 is a Korean text encoding

In [13]:
import numpy as np

# Correlation of each principal component with the target y.
# Components whose absolute correlation exceeds 0.1 are collected in `high`.

high = []

for pc_name in names:
    corr = np.corrcoef(principal_df[pc_name], y)[0, 1]
    print(' The number of principal components : {} \n Correlation with y : {}'.format(pc_name, corr))
    print()
    if abs(corr) > 0.1:
        high.append(pc_name)

print(high)

 The number of principal components : PC 1 
 Correlation with y : 0.18988064187187953

 The number of principal components : PC 2 
 Correlation with y : 0.21315694984571298

 The number of principal components : PC 3 
 Correlation with y : 0.15010704010347245

 The number of principal components : PC 4 
 Correlation with y : 0.018022097357945306

 The number of principal components : PC 5 
 Correlation with y : -0.0015220667406586012

 The number of principal components : PC 6 
 Correlation with y : -0.018512833144727715

 The number of principal components : PC 7 
 Correlation with y : -0.020500378425988085

 The number of principal components : PC 8 
 Correlation with y : 0.018664252116240256

 The number of principal components : PC 9 
 Correlation with y : -0.006415762807513683

 The number of principal components : PC 10 
 Correlation with y : -0.03781451896601686

 The number of principal components : PC 11 
 Correlation with y : 0.013510699163539445

 The number of principal com

### 예측

In [14]:
# 선형 회귀 패키지

from sklearn.linear_model import LinearRegression

In [15]:
# Ordinary least squares on the principal components (fit returns the estimator)
linear = LinearRegression().fit(principal_df, y)
predicted_linear = linear.predict(principal_df)
# Residual: actual locker count minus predicted count
error_linear = y - predicted_linear

In [16]:
# A residual (actual - predicted) below -0.5 means the model predicts more
# lockers than are installed; those districts are collected in lin_reg.
# NOTE(review): an earlier note used -1 as the cut-off, but the code filters
# at -0.5; confirm which threshold is intended.

needs_more = error_linear < -0.5

# Select the matching rows in one step and read the district name by column
# label rather than by position.
lin_reg = pd.DataFrame({
    '동': data.loc[needs_more, '동'],
    '오차': error_linear[needs_more],
}).reset_index(drop=True)
lin_reg

Unnamed: 0,동,오차
0,종로1.2.3.4가동,-0.508192
1,이화동,-0.512557
2,숭인2동,-0.518333
3,후암동,-0.604842
4,왕십리2동,-0.508885
...,...,...
115,성내3동,-0.677889
116,둔촌1동,-0.554861
117,둔촌2동,-0.697965
118,천호2동,-0.700387


In [17]:
# https://blog.naver.com/PostView.nhn?blogId=ssdyka&logNo=221231456916
# 릿지 회귀에 관한 글
# 릿지 회귀를 위한 패키지

from sklearn.linear_model import Ridge

In [18]:
# alpha is the strength of the L2 penalty on the coefficients. The author noted
# that changing it did not seem to change the results much.
ridge = Ridge(alpha=0.5).fit(principal_df, y)
predicted_ridge = ridge.predict(principal_df)
# Residual: actual locker count minus predicted count
error_ridge = y - predicted_ridge

In [19]:
# Print every row whose ridge residual is below -0.5, then the total count.
number = 0

for counter, residual in enumerate(error_ridge):
    if residual < -0.5:
        # Column 1 of `data` is read as the district name
        district = data.iloc[counter, 1]
        print(' Value : {} \n Index : {} \n Name : {}'.format(residual, counter, district))
        print()
        number += 1
print('The number of big error : {}'.format(number))

 Value : -0.5080483733286928 
 Index : 7 
 Name : 종로1.2.3.4가동

 Value : -0.5124796924110657 
 Index : 9 
 Name : 이화동

 Value : -0.518289390522176 
 Index : 14 
 Name : 숭인2동

 Value : -0.6048051236915724 
 Index : 32 
 Name : 후암동

 Value : -0.5088474635357202 
 Index : 48 
 Name : 왕십리2동

 Value : -0.5655985753744666 
 Index : 49 
 Name : 마장동

 Value : -0.5312737839370865 
 Index : 60 
 Name : 송정동

 Value : -0.9396856864314778 
 Index : 66 
 Name : 군자동

 Value : -0.9928173431359999 
 Index : 68 
 Name : 중곡2동

 Value : -0.8966397172394402 
 Index : 69 
 Name : 중곡3동

 Value : -0.9882634125500833 
 Index : 70 
 Name : 중곡4동

 Value : -0.7615987388013896 
 Index : 71 
 Name : 능동

 Value : -0.5304227886756585 
 Index : 74 
 Name : 구의3동

 Value : -0.9055463494755245 
 Index : 76 
 Name : 자양1동

 Value : -0.8176793922073684 
 Index : 82 
 Name : 휘경2동

 Value : -0.6787287083410702 
 Index : 83 
 Name : 청량리동

 Value : -1.089824833795522 
 Index : 84 
 Name : 용신동

 Value : -0.5031251781727177 
 Inde

### 분류

In [20]:
# Only one district has five or more lockers, so for the sake of classification
# accuracy every count of 4 or more is merged into a single class labelled 4.

three = sum(1 for count in y if count == 3)
four = sum(1 for count in y if count == 4)
over = sum(1 for count in y if count > 4)

print('3 : {}, 4 : {}, 5+ : {}'.format(three, four, over))

# Cap each label at 4; kept as a plain list
y_trans = [4 if count > 4 else count for count in y]

3 : 9, 4 : 5, 5+ : 1


In [21]:
# 로지스틱 회귀 패키지

from sklearn.linear_model import LogisticRegression

In [22]:
# max_iter limits the number of solver iterations allowed for convergence
logis = LogisticRegression(max_iter=100)
logis.fit(x_trans, y_trans)
predicted_logis = logis.predict(x_trans)
# Difference between the capped labels and the predicted classes
error_logis = y_trans - predicted_logis

In [23]:
# Collect the districts where logistic regression predicts a higher class than
# the actual label (negative error) into log_reg.

# Align the errors with the rows of `data` so they can be filtered together
logis_errors = pd.Series(error_logis, index=data.index)
under_served = logis_errors < 0

# Select the matching rows in one step and read the district name by column
# label rather than by position.
log_reg = pd.DataFrame({
    '동': data.loc[under_served, '동'],
    '오차': logis_errors[under_served],
}).reset_index(drop=True)
log_reg

Unnamed: 0,동,오차
0,휘경2동,-1
1,청량리동,-1
2,용신동,-1
3,중화2동,-1
4,송천동,-1
5,서강동,-1
6,공덕동,-1
7,화곡1동,-1
8,독산3동,-1
9,독산4동,-1


In [24]:
# 서포트 벡터 머신 패키지

from sklearn.svm import SVC

In [25]:
# Support vector classifier with a linear kernel
svm_lin = SVC(kernel='linear')
svm_lin.fit(x_trans, y_trans)
predicted_svm_lin = svm_lin.predict(x_trans)
error_svm_lin = y_trans - predicted_svm_lin

In [26]:
# Print every district where the linear SVM predicts a higher class than the
# actual label, then the total count.
number = 0

for counter, err in enumerate(error_svm_lin):
    if err < 0:
        # Column 1 of `data` is read as the district name
        district = data.iloc[counter, 1]
        print(' Name : {} \n Error : {}'.format(district, err))
        print()
        number += 1

print('The number of big error : {}'.format(number))

The number of big error : 0


In [27]:
# Support vector classifier with an RBF (Radial Basis Function) kernel
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(x_trans, y_trans)
predicted_svm_rbf = svm_rbf.predict(x_trans)
error_svm_rbf = y_trans - predicted_svm_rbf

In [28]:
# Print every district where the RBF SVM predicts a higher class than the
# actual label, then the total count.
number = 0

for counter, err in enumerate(error_svm_rbf):
    if err < 0:
        # Column 1 of `data` is read as the district name
        district = data.iloc[counter, 1]
        print(' Name : {} \n Error : {}'.format(district, err))
        print()
        number += 1

print('The number of big error : {}'.format(number))

 Name : 휘경2동 
 Error : -1

 Name : 용신동 
 Error : -1

 Name : 흑석동 
 Error : -1

The number of big error : 3


In [29]:
# 랜덤 포레스트 패키지

from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random forest classifier. random_state is fixed so that repeated runs of the
# notebook build the same forest; other hyperparameters keep their defaults
# and can be tuned here.
clf = RandomForestClassifier(random_state=42)
clf.fit(x_trans, y_trans)
# Difference between the capped labels and the predicted classes
error_rf = y_trans - clf.predict(x_trans)

In [31]:
# Print every district where the random forest predicts a higher class than the
# actual label, then the total count.
number = 0

for counter, err in enumerate(error_rf):
    if err < 0:
        # Column 1 of `data` is read as the district name
        district = data.iloc[counter, 1]
        print(' Name : {} \n Error : {}'.format(district, err))
        print()
        number += 1

print('The number of big error : {}'.format(number))

The number of big error : 0


In [32]:
# Keep only the districts flagged by both the logistic and the linear model
result_full = log_reg.merge(lin_reg, how='inner', on='동')

In [33]:
# Attach the original data columns to the districts flagged by both models
final_full = data.merge(result_full, how='inner', on='동')

In [34]:
# Write the merged result to CSV using the cp949 encoding
final_full.to_csv(path_or_buf='final_full.csv', encoding='cp949')