# M.L (Machine Learning 연습)
- 라이브러리
- 데이터 전처리
- 모델링
- 하이퍼파라미터 튜닝

In [1]:
# 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import *

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform

import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
# 데이터 불러오기
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
data = data.sample(5000, random_state = 2022)
data['CHURN'] = data['CHURN'].map({'LEAVE' : 1, 'STAY' : 0})
data.head()

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
3178,3179,0,119512,51,31,248566,229,5,2,very_sat,very_high,considering,1
14926,14927,1,142144,192,15,774317,581,29,4,unsat,very_little,never_thought,1
15116,15117,1,142308,0,79,306426,497,1,1,sat,little,considering,0
12733,12734,1,113385,0,0,333599,819,1,6,very_unsat,very_high,considering,1
14032,14033,1,90348,209,10,637286,360,26,4,unsat,little,actively_looking_into_it,0


In [3]:
# 데이터 전처리
drop_cols = 'id'
data.drop(drop_cols, axis = 1, inplace = True)

In [4]:
target = 'CHURN'
x = data.drop(target, axis = 1)
y = data.loc[:, target]

In [5]:
# 결측치 확인
display(x.isna().sum())
display(y.isna().sum())

COLLEGE                        0
INCOME                         0
OVERAGE                        0
LEFTOVER                       0
HOUSE                          0
HANDSET_PRICE                  0
OVER_15MINS_CALLS_PER_MONTH    0
AVERAGE_CALL_DURATION          0
REPORTED_SATISFACTION          0
REPORTED_USAGE_LEVEL           0
CONSIDERING_CHANGE_OF_PLAN     0
dtype: int64

0

In [6]:
# 가변수화
dumm_cols = ['REPORTED_SATISFACTION','REPORTED_USAGE_LEVEL','CONSIDERING_CHANGE_OF_PLAN']
x = pd.get_dummies(x, columns = dumm_cols, drop_first = True)
x.head()

Unnamed: 0,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION_sat,REPORTED_SATISFACTION_unsat,REPORTED_SATISFACTION_very_sat,REPORTED_SATISFACTION_very_unsat,REPORTED_USAGE_LEVEL_high,REPORTED_USAGE_LEVEL_little,REPORTED_USAGE_LEVEL_very_high,REPORTED_USAGE_LEVEL_very_little,CONSIDERING_CHANGE_OF_PLAN_considering,CONSIDERING_CHANGE_OF_PLAN_never_thought,CONSIDERING_CHANGE_OF_PLAN_no,CONSIDERING_CHANGE_OF_PLAN_perhaps
3178,0,119512,51,31,248566,229,5,2,0,0,1,0,0,0,1,0,1,0,0,0
14926,1,142144,192,15,774317,581,29,4,0,1,0,0,0,0,0,1,0,1,0,0
15116,1,142308,0,79,306426,497,1,1,1,0,0,0,0,1,0,0,1,0,0,0
12733,1,113385,0,0,333599,819,1,6,0,0,0,1,0,0,1,0,1,0,0,0
14032,1,90348,209,10,637286,360,26,4,0,1,0,0,0,1,0,0,0,0,0,0


In [7]:
# 데이터 분할(train & validation)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 3, random_state = 20)

In [8]:
# Scaling
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

- Logistic Regression

In [9]:
# 값의 범위 설정
params = dict(
    C = uniform(loc=0, scale=4), 
    penalty=['l2', 'l1'])
params

{'C': <scipy.stats._distn_infrastructure.rv_frozen at 0x256509f1370>,
 'penalty': ['l2', 'l1']}

In [10]:
# 모델 선언
model = LogisticRegression()

model_rs = RandomizedSearchCV(model,
                             params,
                             random_state = 0)

In [11]:
# 모델 학습
model_rs.fit(x_train_s, y_train)

RandomizedSearchCV(estimator=LogisticRegression(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000256509F1370>,
                                        'penalty': ['l2', 'l1']},
                   random_state=0)

In [12]:
# 하이퍼파라미터 튜닝 결과
model_rs.cv_results_

{'mean_fit_time': array([0.00098963, 0.00019937, 0.00010128, 0.01650004, 0.01630287,
        0.01336641, 0.01708083, 0.01260443, 0.00039902, 0.01658053]),
 'std_fit_time': array([2.04464749e-05, 3.98731232e-04, 2.02560425e-04, 5.59821958e-03,
        1.10366874e-03, 6.86796753e-03, 9.66090622e-04, 6.35049007e-03,
        4.88694643e-04, 4.66913004e-03]),
 'mean_score_time': array([0.        , 0.        , 0.        , 0.00039897, 0.00019937,
        0.        , 0.        , 0.00323157, 0.        , 0.        ]),
 'std_score_time': array([0.        , 0.        , 0.        , 0.00048864, 0.00039873,
        0.        , 0.        , 0.00597495, 0.        , 0.        ]),
 'param_C': masked_array(data=[2.195254015709299, 3.3770629943240693,
                    2.1795327319875875, 2.4942547871438894,
                    1.75034884505077, 0.22685190926977272,
                    1.5337660753031108, 3.2486749151019727,
                    2.2721782443757292, 3.34431505414951],
              mask=[Fa

In [13]:
model_rs.cv_results_['params']

[{'C': 2.195254015709299, 'penalty': 'l1'},
 {'C': 3.3770629943240693, 'penalty': 'l1'},
 {'C': 2.1795327319875875, 'penalty': 'l1'},
 {'C': 2.4942547871438894, 'penalty': 'l2'},
 {'C': 1.75034884505077, 'penalty': 'l2'},
 {'C': 0.22685190926977272, 'penalty': 'l2'},
 {'C': 1.5337660753031108, 'penalty': 'l2'},
 {'C': 3.2486749151019727, 'penalty': 'l2'},
 {'C': 2.2721782443757292, 'penalty': 'l1'},
 {'C': 3.34431505414951, 'penalty': 'l2'}]

In [14]:
model_rs.cv_results_['mean_test_score']

array([       nan,        nan,        nan, 0.63938178, 0.63898178,
       0.63898178, 0.63898178, 0.63938178,        nan, 0.63938178])

In [15]:
# 최적의 해
model_rs.best_params_

{'C': 2.4942547871438894, 'penalty': 'l2'}

In [16]:
# 그때의 성능
model_rs.best_score_

0.6393817817817818

In [17]:
pred = model_rs.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



- Decision Tree

In [18]:
# 값의 범위 설정
params = {'max_depth' : range(1, 21), 'min_samples_leaf' : range(10, 51, 5)}
params

{'max_depth': range(1, 21), 'min_samples_leaf': range(10, 51, 5)}

In [19]:
# 모델 선언
model = DecisionTreeClassifier()

model_rs = RandomizedSearchCV(model,
                             params,
                             cv = 5, 
                             n_iter = 5)

In [20]:
# 모델 학습
model_rs.fit(x_train_s, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=5,
                   param_distributions={'max_depth': range(1, 21),
                                        'min_samples_leaf': range(10, 51, 5)})

In [21]:
model_rs.cv_results_

{'mean_fit_time': array([0.01630168, 0.01032553, 0.01660604, 0.00340834, 0.0165926 ]),
 'std_fit_time': array([0.00394922, 0.00814431, 0.00129157, 0.00681667, 0.00137236]),
 'mean_score_time': array([0.00059843, 0.00306563, 0.        , 0.        , 0.        ]),
 'std_score_time': array([0.00048862, 0.00613127, 0.        , 0.        , 0.        ]),
 'param_min_samples_leaf': masked_array(data=[30, 50, 15, 45, 20],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[20, 12, 9, 1, 14],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_leaf': 30, 'max_depth': 20},
  {'min_samples_leaf': 50, 'max_depth': 12},
  {'min_samples_leaf': 15, 'max_depth': 9},
  {'min_samples_leaf': 45, 'max_depth': 1},
  {'min_samples_leaf': 20, 'max_depth': 14}],
 'split0_test_score': array([0.68 , 0.691, 0.666, 0.63 , 0.66 ]),
 'split1_

In [22]:
model_rs.best_params_

{'min_samples_leaf': 50, 'max_depth': 12}

In [23]:
model_rs.best_score_

0.6908152152152152

In [24]:
pred = model_rs.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



- KNN

In [25]:
# 범위 설정
params = {'n_neighbors' : range(1, 101), 'metric' : ['euclidean', 'manhattan']}
params

{'n_neighbors': range(1, 101), 'metric': ['euclidean', 'manhattan']}

In [26]:
# 모델 선언
model = KNeighborsClassifier()

model_rs = RandomizedSearchCV(model,
                             params,
                             cv = 5, 
                             n_iter = 5)

In [27]:
# 모델 학습
model_rs.fit(x_train_s, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=5,
                   param_distributions={'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': range(1, 101)})

In [28]:
model_rs.cv_results_

{'mean_fit_time': array([0.00113354, 0.00080156, 0.00346289, 0.00189247, 0.00080943]),
 'std_fit_time': array([0.00177192, 0.00075249, 0.00304495, 0.00160723, 0.00099142]),
 'mean_score_time': array([0.12181015, 0.11520996, 0.11632266, 0.11829319, 0.13298578]),
 'std_score_time': array([0.00616074, 0.00590945, 0.01920343, 0.00947963, 0.01686338]),
 'param_n_neighbors': masked_array(data=[96, 23, 18, 43, 79],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_metric': masked_array(data=['euclidean', 'euclidean', 'euclidean', 'euclidean',
                    'manhattan'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 96, 'metric': 'euclidean'},
  {'n_neighbors': 23, 'metric': 'euclidean'},
  {'n_neighbors': 18, 'metric': 'euclidean'},
  {'n_neighbors': 43, 'metric': 'euclidean'},
  {'n_neighbors': 79, 'metric': 'manhattan'}],
 'split0_

In [29]:
model_rs.best_params_

{'n_neighbors': 79, 'metric': 'manhattan'}

In [30]:
model_rs.best_score_

0.6253743743743744

In [31]:
pred = model_rs.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



- SVM

In [32]:
# 범위 설정
params = {'gamma' : [0.01, 0.03, 0.1, 0.5, 1.0]}

In [33]:
# 모델 선언
model = SVC()

model_rs = RandomizedSearchCV(model,
                             params,
                             cv = 5,
                             n_iter = 5)

In [34]:
# 모델 학습
model_rs.fit(x_train_s, y_train)

RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=5,
                   param_distributions={'gamma': [0.01, 0.03, 0.1, 0.5, 1.0]})

In [35]:
model_rs.cv_results_

{'mean_fit_time': array([0.70849271, 0.67377291, 0.67901125, 0.71727309, 0.76684389]),
 'std_fit_time': array([0.02481491, 0.0038993 , 0.01108853, 0.02394264, 0.00488724]),
 'mean_score_time': array([0.33702335, 0.33166409, 0.32705121, 0.32141252, 0.35078139]),
 'std_score_time': array([0.00740072, 0.03215538, 0.01839534, 0.00507055, 0.02673372]),
 'param_gamma': masked_array(data=[0.01, 0.03, 0.1, 0.5, 1.0],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'gamma': 0.01},
  {'gamma': 0.03},
  {'gamma': 0.1},
  {'gamma': 0.5},
  {'gamma': 1.0}],
 'split0_test_score': array([0.642, 0.649, 0.667, 0.646, 0.626]),
 'split1_test_score': array([0.637, 0.64 , 0.656, 0.653, 0.623]),
 'split2_test_score': array([0.63963964, 0.63963964, 0.66766767, 0.65465465, 0.62462462]),
 'split3_test_score': array([0.63263263, 0.64064064, 0.65565566, 0.63063063, 0.61161161]),
 'split4_test_score': array([0.61061061, 0.61361361, 0.6376376

In [36]:
model_rs.best_params_

{'gamma': 0.1}

In [37]:
model_rs.best_score_

0.6567921921921922

In [38]:
pred = model_rs.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

