# 20기 KNN 정규세션 과제

**데이터:** [blackfriday | Kaggle](https://www.kaggle.com/llopesolivei/blackfriday)

---

## 0. 데이터 불러오기

In [57]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

df = pd.read_csv("blackfriday.csv", index_col = 0)
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1001088,P00046042,F,0-17,10,A,3,0,5,17.0,,2010
1,1004493,P00347742,F,0-17,10,A,1,0,7,,,4483
2,1005302,P00048942,F,0-17,10,A,1,0,1,4.0,,7696
3,1001348,P00145242,F,0-17,10,A,3,0,2,4.0,,16429
4,1001348,P00106742,F,0-17,10,A,3,0,3,5.0,,5780


## 1. Preprocssing / EDA

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 0 to 4997
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     4998 non-null   int64  
 1   Product_ID                  4998 non-null   object 
 2   Gender                      4998 non-null   object 
 3   Age                         4998 non-null   object 
 4   Occupation                  4998 non-null   int64  
 5   City_Category               4998 non-null   object 
 6   Stay_In_Current_City_Years  4998 non-null   object 
 7   Marital_Status              4998 non-null   int64  
 8   Product_Category_1          4998 non-null   int64  
 9   Product_Category_2          3465 non-null   float64
 10  Product_Category_3          1544 non-null   float64
 11  Purchase                    4998 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 507.6+ KB


In [39]:
df.head(10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1001088,P00046042,F,0-17,10,A,3,0,5,17.0,,2010
1,1004493,P00347742,F,0-17,10,A,1,0,7,,,4483
2,1005302,P00048942,F,0-17,10,A,1,0,1,4.0,,7696
3,1001348,P00145242,F,0-17,10,A,3,0,2,4.0,,16429
4,1001348,P00106742,F,0-17,10,A,3,0,3,5.0,,5780
5,1005302,P00051642,F,0-17,10,A,1,0,8,,,9821
6,1001348,P00000242,F,0-17,10,A,3,0,2,4.0,9.0,12707
7,1001088,P00117942,F,0-17,10,A,3,0,5,15.0,,7108
8,1001088,P00200142,F,0-17,10,A,3,0,16,,,16521
9,1005302,P00319342,F,0-17,10,A,1,0,5,8.0,,1886


In [40]:
df.Stay_In_Current_City_Years.value_counts()

1     1758
2      915
3      908
4+     742
0      675
Name: Stay_In_Current_City_Years, dtype: int64

Stay_In_Current_City_Years 를 수치형으로 바꾸려 했으나 '4+' 4년 이상이 있어서 그대로 범주형으로 남겨두고자 한다.

marital_status 는 범주형으로 바꿔준다.

In [41]:
df = df.astype({'Marital_Status':'object'})

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4998 entries, 0 to 4997
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     4998 non-null   int64  
 1   Product_ID                  4998 non-null   object 
 2   Gender                      4998 non-null   object 
 3   Age                         4998 non-null   object 
 4   Occupation                  4998 non-null   int64  
 5   City_Category               4998 non-null   object 
 6   Stay_In_Current_City_Years  4998 non-null   object 
 7   Marital_Status              4998 non-null   object 
 8   Product_Category_1          4998 non-null   int64  
 9   Product_Category_2          3465 non-null   float64
 10  Product_Category_3          1544 non-null   float64
 11  Purchase                    4998 non-null   int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 507.6+ KB


In [43]:
df.isnull().sum()

User_ID                          0
Product_ID                       0
Gender                           0
Age                              0
Occupation                       0
City_Category                    0
Stay_In_Current_City_Years       0
Marital_Status                   0
Product_Category_1               0
Product_Category_2            1533
Product_Category_3            3454
Purchase                         0
dtype: int64

결측치를 채우기 이전에 값의 분포를 확인해보고자 한다.

In [44]:
df.Product_Category_2.value_counts(ascending=True)

7.0       7
18.0     23
10.0     30
3.0      35
12.0     40
9.0      65
13.0    102
11.0    110
17.0    132
6.0     148
4.0     246
5.0     258
15.0    345
16.0    392
2.0     448
14.0    496
8.0     588
Name: Product_Category_2, dtype: int64

In [45]:
df.Product_Category_3.value_counts(ascending=True)

3.0       1
4.0      17
11.0     22
10.0     23
18.0     44
6.0      44
13.0     50
12.0     96
9.0     105
8.0     116
5.0     140
17.0    146
14.0    162
15.0    260
16.0    318
Name: Product_Category_3, dtype: int64

In [46]:
# 뚜렷한 분포가 없어 최빈값으로 결측치를 채운다.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
df[['Product_Category_2','Product_Category_3']] = imputer.fit_transform(df[['Product_Category_2','Product_Category_3']])

In [47]:
df.isnull().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

## 2. KNN 구현 & 파라미터 튜닝

In [48]:
df.drop(['User_ID','Product_ID'],axis=1, inplace = True)


In [51]:
df = pd.get_dummies(df,drop_first = True)

In [36]:
df.head(10)

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_ID_P00000242,Product_ID_P00000342,Product_ID_P00000442,...,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,1001088,10,0,5,17.0,,2010,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1004493,10,0,7,,,4483,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1005302,10,0,1,4.0,,7696,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1001348,10,0,2,4.0,,16429,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1001348,10,0,3,5.0,,5780,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,1005302,10,0,8,,,9821,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,1001348,10,0,2,4.0,9.0,12707,1,0,0,...,0,0,0,0,0,0,0,0,1,0
7,1001088,10,0,5,15.0,,7108,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1001088,10,0,16,,,16521,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,1005302,10,0,5,8.0,,1886,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [52]:
X = df.drop(['Purchase'],axis=1)
y = df['Purchase']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

scaler = StandardScaler()
scaler.fit(X_train) 

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [53]:
regressor = KNeighborsRegressor(n_neighbors = 5, weights = "distance")
regressor.fit(X_train , y_train)

y_pred = regressor.predict(X_test)

In [54]:
from math import sqrt
from sklearn.metrics import mean_squared_error 

sqrt(mean_squared_error(y_test,y_pred))  

4875.776601832297

In [60]:
# 그리드 서치 적용

grid = {"n_neighbors": [17, 19, 23, 25, 27, 29, 31, 33, 39],
            "weights":['uniform', 'distance'],
            'metric' : ['euclidean', 'manhattan', 'minkowski']}

gscv_knn = GridSearchCV(estimator = regressor, param_grid=grid, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1,verbose=2)
gscv_knn.fit(X_train, y_train)

#best f1_macro 수치와 best parameter확인
print(f"RMSE : {gscv_knn.best_score_}")
print(f"param : {gscv_knn.best_params_}")

#랭킹 뽑기

result = pd.DataFrame(gscv_knn.cv_results_)
result.sort_values(by=['rank_test_score'],inplace=True)

#plot
result[['params','mean_test_score','rank_test_score']].head(10)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s


RMSE : -4611.14732838667
param : {'metric': 'manhattan', 'n_neighbors': 27, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:    3.5s finished


Unnamed: 0,params,mean_test_score,rank_test_score
27,"{'metric': 'manhattan', 'n_neighbors': 27, 'we...",-4611.147328,1
25,"{'metric': 'manhattan', 'n_neighbors': 25, 'we...",-4611.521297,2
31,"{'metric': 'manhattan', 'n_neighbors': 31, 'we...",-4611.617953,3
29,"{'metric': 'manhattan', 'n_neighbors': 29, 'we...",-4611.748877,4
35,"{'metric': 'manhattan', 'n_neighbors': 39, 'we...",-4612.140029,5
23,"{'metric': 'manhattan', 'n_neighbors': 23, 'we...",-4612.342607,6
33,"{'metric': 'manhattan', 'n_neighbors': 33, 'we...",-4613.031617,7
21,"{'metric': 'manhattan', 'n_neighbors': 19, 'we...",-4619.720684,8
19,"{'metric': 'manhattan', 'n_neighbors': 17, 'we...",-4627.650435,9
22,"{'metric': 'manhattan', 'n_neighbors': 23, 'we...",-4659.403436,10


## 3. Evaluation

In [61]:
# 최종모델
regressor = KNeighborsRegressor(n_neighbors = 27, weights = "distance", metric="manhattan")
regressor.fit(X_train , y_train)

y_pred = regressor.predict(X_test)
sqrt(mean_squared_error(y_test,y_pred))

4476.55585652514