In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [24]:
from sklearn.metrics import mean_squared_error as mse

In [25]:
df = pd.read_csv("mobiles.csv")
df.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


### Q1.

In [26]:
stat_mean = df["sales"].mean()
stat_std  = df["sales"].std()
stat_out  = stat_mean + 2 * stat_std
stat_out

146.55150129273215

In [27]:
df_q1 = df.loc[df["sales"] > stat_out, ].reset_index(drop = True)
df_q1.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Medium,128,6,2,1,4000,4.6,122001,18999,0.09,231.79
1,Large,128,6,4,2,4500,4.5,267028,15999,0.2,427.22


In [28]:
len(df_q1)

16

In [29]:
df_q1["idx"] = (df_q1["ROM"] / 32) + (df_q1["RAM"] / 2) + \
(df_q1["num_front_camera"] + df_q1["num_rear_camera"]) + \
(df_q1["battery_capacity"] / 1000)

In [30]:
df_q1.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,idx
0,Medium,128,6,2,1,4000,4.6,122001,18999,0.09,231.79,14.0
1,Large,128,6,4,2,4500,4.5,267028,15999,0.2,427.22,17.5


In [31]:
round(df_q1["idx"].mean(), 2)

11.01

### Q2.

In [32]:
df_q2 = df.loc[df["num_rear_camera"] != 1, "battery_capacity":]
df_q2.head(2)

Unnamed: 0,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
1,2815,4.5,244,57149,0.04,1.39
4,2815,4.6,745,69149,0.02,5.15


In [33]:
df_q2.corr().abs().round(2)

Unnamed: 0,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
battery_capacity,1.0,0.42,0.03,0.5,0.26,0.03
ratings,0.42,1.0,0.19,0.15,0.12,0.23
num_of_ratings,0.03,0.19,1.0,0.26,0.21,0.95
sales_price,0.5,0.15,0.26,1.0,0.1,0.25
discount_percent,0.26,0.12,0.21,0.1,1.0,0.22
sales,0.03,0.23,0.95,0.25,0.22,1.0


In [34]:
df_q2.corr().abs().round(2)["sales"]

battery_capacity    0.03
ratings             0.23
num_of_ratings      0.95
sales_price         0.25
discount_percent    0.22
sales               1.00
Name: sales, dtype: float64

### Q3.

In [42]:
df_q3 = df.copy()

In [43]:
df_q3.head(1)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52


In [44]:
df_q3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   screen_size       430 non-null    object 
 1   ROM               430 non-null    int64  
 2   RAM               430 non-null    int64  
 3   num_rear_camera   430 non-null    int64  
 4   num_front_camera  430 non-null    int64  
 5   battery_capacity  430 non-null    int64  
 6   ratings           430 non-null    float64
 7   num_of_ratings    430 non-null    int64  
 8   sales_price       430 non-null    int64  
 9   discount_percent  430 non-null    float64
 10  sales             430 non-null    float64
dtypes: float64(3), int64(7), object(1)
memory usage: 37.1+ KB


In [54]:
df_q3_dum = pd.get_dummies(df_q3, columns = ["screen_size"]) # 시험 버전
df_q3_dum.head(2)

# df_q3_dum = pd.get_dummies(df_q3, dtype = "int") # Pandas 2.0.0 이상
# df_q3_dum.head(2)

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,64,2,1,1,1800,4.5,38645,32999,0.17,127.52,0,0,0,0,1
1,64,4,2,1,2815,4.5,244,57149,0.04,1.39,0,0,1,0,0


In [55]:
df_q3_dum.shape

(430, 15)

In [17]:
#df_q3_dum = df_q3_dum.set_index("sales").reset_index()
#df_q3_dum.head(2)

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,127.52,64,2,1,1,1800,4.5,38645,32999,0.17,0,0,0,0,1
1,1.39,64,4,2,1,2815,4.5,244,57149,0.04,0,0,1,0,0


In [39]:
df_train, df_test = train_test_split(df_q3_dum, train_size = 0.8,
                                     random_state = 123)
len(df_train), len(df_test)

(344, 86)

In [56]:
df_train.head(2)

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
410,32,3,2,1,4000,4.3,1870,10449,0.05,1.95,0,0,1,0,0
21,128,3,1,1,2942,4.6,5366,47999,0.09,25.76,0,1,0,0,0


In [60]:
model_nor = MinMaxScaler().fit(df_train)

df_train_nor = df_train.copy()
df_test_nor = df_test.copy()
df_train_nor.loc[:,:] = model_nor.transform(df_train.loc[:,:])
df_test_nor.loc[:,:]  = model_nor.transform(df_test.loc[:,:])

In [62]:
df_train_nor.head(2)

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
410,0.047619,0.181818,0.333333,0.0,0.423077,0.625,0.003963,0.026073,0.093023,0.003948,0.0,0.0,1.0,0.0,0.0
21,0.238095,0.181818,0.0,0.0,0.219615,1.0,0.011387,0.273927,0.186047,0.052148,0.0,1.0,0.0,0.0,0.0


In [63]:
ls_k = [3, 5, 7, 9, 11]
k = ls_k[0]

model_knn = KNeighborsRegressor(n_neighbors = k)
model_knn.fit(X = df_train_nor.drop(columns='sales'), y = df_train_nor['sales'])
pred = model_knn.predict(df_test_nor.drop(columns='sales'))
val_rmse = mean_squared_error(y_true = df_test_nor['sales'],y_pred = pred) ** 0.5
val_rmse

0.08186677375964535

In [None]:
# np.sqrt()
# ** 0.5

In [64]:
ls_k = [3, 5, 7, 9, 11]

ls_rmse = []
for k in ls_k:
    model_knn = KNeighborsRegressor(n_neighbors = k)
    model_knn.fit(X = df_train_nor.drop(columns='sales'), y = df_train_nor['sales'])
    pred = model_knn.predict(df_test_nor.drop(columns='sales'))
    val_rmse = mean_squared_error(y_true = df_test_nor['sales'],
                                  y_pred = pred) ** 0.5
    ls_rmse = ls_rmse + [val_rmse]

ls_rmse

[0.08186677375964535,
 0.09879109824384892,
 0.107669855645971,
 0.11232111394853059,
 0.1136902366621185]

In [38]:
from tqdm.notebook import tqdm
import time

ls_k = [3, 5, 7, 9, 11]

ls_rmse = []
for k in tqdm(ls_k):
    time.sleep(1)
    model_knn = KNeighborsRegressor(n_neighbors = k)
    model_knn.fit(X = arr_train_nor[:, 1:],
                  y = arr_train_nor[:,  0])
    pred = model_knn.predict(arr_test_nor[:, 1:])
    val_rmse = mean_squared_error(y_true = arr_test_nor[:, 0],
                                  y_pred = pred) ** 0.5
    ls_rmse = ls_rmse + [val_rmse]

ls_rmse

  0%|          | 0/5 [00:00<?, ?it/s]

[0.08186677375964535,
 0.09879109824384892,
 0.107669855645971,
 0.11232111394853059,
 0.1136902366621185]

In [52]:
k

11

In [65]:
# pd.Series(ls_rmse, index = ls_k)
best_k =  pd.Series(ls_rmse, index = ls_k).idxmin()
best_k

3

### Q3. 추가 지시사항
다음은 저번달에 신규 출시된 경쟁사의 스마트폰 정보이다. 해당 스마트폰의 판매지수는 얼마로 예상되는가?  
※ 정규화 되지 않은 값으로 반올림하여 소수점 첫째 자리까지 출력하시오  
※ KNN 모델을 사용하며 이웃 개수는 직전에 최적이라고 판단한 k값을 사용하시오.  
* ROM: 256
* RAM: 6
* num_rear_camera: 4
* num_front_camera: 1
* battery_capacity: 4000
* ratings: 4.3
* num_of_ratings: 25000
* sales_price: 85000
* discount_percent: 0.05
* screen_size: "Large"

In [67]:
# df_t1 = pd.DataFrame(dict(ROM = 256, ~~~))
df_t1 = df_test.head(1).reset_index(drop = True)
df_t1

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,256,4,2,1,2815,4.6,745,79149,0.02,5.9,0,1,0,0,0


In [68]:
df_t1["RAM"] = 6
df_t1["num_rear_camera"] = 4
df_t1["battery_capacity"] = 4000
df_t1["ratings"] = 4.3
df_t1["num_of_ratings"] = 25000
df_t1["sales_price"] = 85000
df_t1["discount_percent"] = 0.05
df_t1["screen_size_Large"] = 1
df_t1["screen_size_Medium"] = 0
# df_t1.transpose()
df_t1

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,256,6,4,1,4000,4.3,25000,85000,0.05,5.9,1,0,0,0,0


In [51]:
arr_t1_nor = model_nor.transform(df_t1)
arr_t1_nor

array([[0.0119438 , 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.05308122, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [54]:
model_knn_best = KNeighborsRegressor(n_neighbors = best_k)
model_knn_best.fit(X = arr_train_nor[:, 1:],
                   y = arr_train_nor[:,  0])
pred_t1 = model_knn_best.predict(arr_t1_nor[:, 1:])
pred_t1

array([0.00132259])

In [17]:
df_corr_melt = df_q2.corr().reset_index().melt(id_vars = "index")
df_corr_melt.head(2)

Unnamed: 0,index,variable,value
0,battery_capacity,battery_capacity,1.0
1,ratings,battery_capacity,-0.424129


In [None]:
arr_t1_nor[0, 0] = pred_t1
arr_t1_inv = model_nor.inverse_transform(arr_t1_nor)
arr_t1_inv

In [57]:
df_t1_inv = pd.DataFrame(arr_t1_inv, columns = df_t1.columns)
df_t1_inv

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,0.653333,256.0,6.0,4.0,1.0,4000.0,4.3,25000.0,85000.0,0.05,1.0,0.0,0.0,0.0,0.0


In [58]:
df_t1

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,5.9,256,6,4,1,4000,4.3,25000,85000,0.05,1,0,0,0,0


In [59]:
df_t1_inv["sales"]

0    0.653333
Name: sales, dtype: float64