# 1. Data Loading

In [9]:
import pandas as pd
from pathlib import Path

# Directory holding the input CSV, given relative to the notebook's working directory.
data_dir = Path('../data/')
# Read the churn dataset into a DataFrame and preview its first rows.
data = pd.read_csv(data_dir / "churn.csv")
data.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Account Length,Area Code,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


# 2. Data Preprocessing & Feature Engineering

In [11]:
# Count missing values per column to decide whether any null handling is needed.
data.isna().sum()

State             0
Account Length    0
Area Code         0
Phone             0
Int'l Plan        0
VMail Plan        0
VMail Message     0
Day Mins          0
Day Calls         0
Day Charge        0
Eve Mins          0
Eve Calls         0
Eve Charge        0
Night Mins        0
Night Calls       0
Night Charge      0
Intl Mins         0
Intl Calls        0
Intl Charge       0
CustServ Calls    0
Churn?            0
dtype: int64

In [12]:
# Inspect column dtypes to tell numeric features apart from string (object) columns.
data.dtypes

State              object
Account Length      int64
Area Code           int64
Phone              object
Int'l Plan         object
VMail Plan         object
VMail Message       int64
Day Mins          float64
Day Calls           int64
Day Charge        float64
Eve Mins          float64
Eve Calls           int64
Eve Charge        float64
Night Mins        float64
Night Calls         int64
Night Charge      float64
Intl Mins         float64
Intl Calls          int64
Intl Charge       float64
CustServ Calls      int64
Churn?             object
dtype: object

In [13]:
# Usage features fed to the model: every (period, measure) combination,
# ordered period-major so the list reads Day *, Eve *, Night *, Intl *.
numerical_columns = [
    f"{period} {measure}"
    for period in ("Day", "Eve", "Night", "Intl")
    for measure in ("Mins", "Calls", "Charge")
]
numerical_columns

['Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge']

In [57]:
def data_preprocessing(data:pd.DataFrame, target_col:str, cols:list=None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Clean ``data``, standardize the selected features and binarize the target.

    Duplicate rows and rows containing nulls are removed on a copy, so the
    caller's frame is never modified. The fitted scaler is persisted to
    ``'../model/stdscaler.pkl'`` as a side effect.

    Parameters
    ----------
    data : pd.DataFrame
        Raw input frame containing the features and ``target_col``.
    target_col : str
        Name of the label column. Values equal to ``'True.'`` are encoded as 1,
        everything else as 0.
    cols : list, optional
        Feature columns to use. When empty or ``None``, every numeric column
        except ``target_col`` is used.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(feats, target)``: the standardized features and a one-column frame
        holding the 0/1 target. Both keep the row labels of the cleaned data,
        so they stay aligned with each other and with ``data``.

    Notes
    -----
    The scaler is fit on every row passed in. If this runs before a train/test
    split, test-set statistics influence the scaling (data leakage).
    """
    from sklearn.preprocessing import StandardScaler
    # Imported locally (like the other helpers in this notebook) so the function
    # does not rely on a later cell having imported joblib already.
    import joblib
    import numpy as np

    # Non-inplace cleaning: leaves the caller's frame untouched and makes the
    # function safe to re-run.
    clean = data.drop_duplicates().dropna(axis=0)

    if not cols:
        # Default: numeric features only, never the target itself.
        cols = [
            col for col in clean.columns
            if col != target_col and pd.api.types.is_numeric_dtype(clean[col])
        ]
    feats = clean.loc[:, cols]

    # standardization of feature values
    stdscaler = StandardScaler()
    res = stdscaler.fit_transform(feats)
    joblib.dump(stdscaler, '../model/stdscaler.pkl')

    # Rebuild the frame from the scaled array, keeping the original row labels
    # so features and target remain aligned even when rows were dropped.
    feats = pd.DataFrame(res, columns=feats.columns, index=feats.index)

    # Encode the string label as 0/1 on an explicit copy (no chained assignment).
    target = clean.loc[:, [target_col]].copy()
    target[target_col] = np.where(target[target_col] == 'True.', 1, 0)
    return feats, target

In [58]:
# Standardize the 12 usage columns and encode the 'Churn?' label as 0/1.
# NOTE(review): data_preprocessing fits its scaler on the whole dataset, and the
# train/test split only happens afterwards, so test rows influence the scaling.
# Consider fitting the scaler on the training split only.
X, y = data_preprocessing(data, 'Churn?', numerical_columns)

# 3. Data Splitting

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

In [18]:
# (rows, features) of the training split.
X_train.shape

(2666, 12)

In [19]:
# (rows, features) of the held-out test split.
X_test.shape

(667, 12)

In [20]:
# Class proportions in the training split (sanity check for the stratified split).
y_train.value_counts()/y_train.shape[0]

Churn?
0         0.855214
1         0.144786
Name: count, dtype: float64

In [21]:
# Class proportions in the test split; should closely match the training split.
y_test.value_counts()/y_test.shape[0]

Churn?
0         0.854573
1         0.145427
Name: count, dtype: float64

# 4. Model Training

In [22]:
def build_model(X_train, y_train, neighbors:list[int]) -> list:
    """Fit one k-nearest-neighbours classifier per entry of ``neighbors``.

    Parameters
    ----------
    X_train
        Training features passed straight to ``KNeighborsClassifier.fit``.
    y_train
        Container whose ``'Churn?'`` entry holds the training labels.
    neighbors : list[int]
        Values of ``n_neighbors`` to train, one model each.

    Returns
    -------
    list
        Fitted classifiers in the same order as ``neighbors``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    def _fit_for(k):
        # Train a single classifier for this neighbourhood size.
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X_train, y_train['Churn?'])
        return clf

    return [_fit_for(k) for k in neighbors]

In [23]:
# Train three KNN candidates with k = 3, 5 and 7.
models = build_model(X_train, y_train, [3, 5, 7])

In [24]:
def get_best_score(models, best_model_name, X_eval=None, y_eval=None) -> None:
    """Score every model, report the results and persist the most accurate one.

    Parameters
    ----------
    models
        Non-empty sequence of fitted estimators exposing ``score(X, y)``.
    best_model_name
        Destination path handed to ``joblib.dump`` for the winning model.
    X_eval, y_eval : optional
        Evaluation features and labels. When omitted, the notebook-level
        ``X_test`` and ``y_test['Churn?']`` are used, which matches the
        original behaviour.

    Raises
    ------
    ValueError
        If ``models`` is empty.

    Notes
    -----
    When several models share the top score, the last one in ``models`` wins.
    The model is chosen using the evaluation data itself, so the reported
    accuracy is an optimistic estimate of its performance on unseen data.
    """
    import joblib

    if not models:
        raise ValueError("models must contain at least one fitted estimator")

    # Fall back to the notebook's hold-out split only when the caller did not
    # pass evaluation data explicitly.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test['Churn?']

    scores = []
    for i, model in enumerate(models):
        score = model.score(X_eval, y_eval)
        scores.append(score)
        print(f'Model {i+1} performance: {score}')

    # Highest score wins; the index in the key makes the tie-break explicit
    # (the later model is preferred).
    best_idx = max(range(len(scores)), key=lambda idx: (scores[idx], idx))
    highest_acc_model = models[best_idx]
    print(f"Highest accuracy: {scores[best_idx]}")

    # Report success only after the dump has actually completed.
    joblib.dump(highest_acc_model, best_model_name)
    print("Best model serialization success")

In [25]:
# Score the candidates on the test split and save the best one under the
# sibling "model" directory (data_dir / ".." / "model").
get_best_score(models, data_dir / ".." / "model" / 'knn_clf2.pkl')

Model 1 performance: 0.8545727136431784
Model 2 performance: 0.8665667166416792


Model 3 performance: 0.8650674662668666
Highest accuracy: 0.8665667166416792
Best model serialization success


# 5. Evaluation

In [29]:
import joblib
# Reload the persisted best model from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
knn_clf = joblib.load(data_dir / ".." / "model" / 'knn_clf2.pkl')

In [30]:
# Display the standardized test features (pandas truncates the middle rows).
X_test

Unnamed: 0,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge
2794,1.436394,0.576314,1.436339,0.897713,0.847691,0.897086,-0.547243,-0.005505,-0.549026,0.237408,-0.601195,0.232756
2619,-0.409023,0.426808,-0.408543,-0.224437,-1.662395,-0.223558,0.516707,-1.232143,0.518857,0.309056,-1.007560,0.312367
720,-0.049121,-0.619735,-0.048856,-0.735222,0.395875,-0.736317,0.827191,-1.436583,0.826477,-1.016434,-1.007560,-1.014492
371,0.316290,0.476643,0.316232,0.430315,0.094665,0.430731,0.483088,-0.465494,0.483700,0.129936,0.617898,0.126607
57,0.341997,1.423515,0.342156,-2.478597,-1.160378,-2.478768,-0.389035,-1.181033,-0.390821,-1.589618,-0.601195,-1.585041
...,...,...,...,...,...,...,...,...,...,...,...,...
503,0.687210,1.872033,0.686721,-1.094152,0.546480,-1.093624,1.578680,-1.232143,1.577950,-0.335777,-0.601195,-0.337794
1411,-0.241926,-0.819076,-0.242201,-0.169217,-0.457554,-0.170194,-0.652056,0.761144,-0.650101,-0.908961,0.617898,-0.908343
2356,1.726520,-0.171217,1.726897,1.749679,0.295472,1.748590,1.349278,1.323353,1.349432,-1.589618,-0.601195,-1.585041
2325,1.676941,0.376972,1.677211,0.710359,1.098699,0.711472,0.771818,1.118913,0.773742,-0.156656,0.211534,-0.152033


In [31]:
# Display the 0/1 test labels; row labels line up with X_test above.
y_test

Unnamed: 0,Churn?
2794,1
2619,1
720,0
371,0
57,1
...,...
503,0
1411,0
2356,1
2325,1


In [39]:
# Pick the test row labelled 503; the list indexer keeps it as a one-row DataFrame.
X_test.loc[[503], :]

Unnamed: 0,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge
503,0.68721,1.872033,0.686721,-1.094152,0.54648,-1.093624,1.57868,-1.232143,1.57795,-0.335777,-0.601195,-0.337794


In [37]:
# Predict churn for that single row; the result is an array with one prediction.
res = knn_clf.predict(X_test.loc[[503], :])

In [40]:
# Wrap the prediction in a one-column frame named like the target.
# NOTE(review): `pred` is not referenced again in the cells shown here.
pred = pd.DataFrame(res, columns=['Churn?'])

In [45]:
# Attach the predicted label to the selected row for a side-by-side view.
check = X_test.loc[[503], :]
# Assigning to a new column via .loc enlarges the frame; the displayed value
# comes out as a float (0.0) even though the prediction is an integer class.
# NOTE(review): `check` is derived from a selection of X_test; take an explicit
# .copy() first if pandas emits SettingWithCopyWarning here.
check.loc[503, ['Churn?']] = res
check

Unnamed: 0,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,Churn?
503,0.68721,1.872033,0.686721,-1.094152,0.54648,-1.093624,1.57868,-1.232143,1.57795,-0.335777,-0.601195,-0.337794,0.0


Unnamed: 0,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge
2794,1.436394,0.576314,1.436339,0.897713,0.847691,0.897086,-0.547243,-0.005505,-0.549026,0.237408,-0.601195,0.232756
2619,-0.409023,0.426808,-0.408543,-0.224437,-1.662395,-0.223558,0.516707,-1.232143,0.518857,0.309056,-1.007560,0.312367
720,-0.049121,-0.619735,-0.048856,-0.735222,0.395875,-0.736317,0.827191,-1.436583,0.826477,-1.016434,-1.007560,-1.014492
371,0.316290,0.476643,0.316232,0.430315,0.094665,0.430731,0.483088,-0.465494,0.483700,0.129936,0.617898,0.126607
57,0.341997,1.423515,0.342156,-2.478597,-1.160378,-2.478768,-0.389035,-1.181033,-0.390821,-1.589618,-0.601195,-1.585041
...,...,...,...,...,...,...,...,...,...,...,...,...
503,0.687210,1.872033,0.686721,-1.094152,0.546480,-1.093624,1.578680,-1.232143,1.577950,-0.335777,-0.601195,-0.337794
1411,-0.241926,-0.819076,-0.242201,-0.169217,-0.457554,-0.170194,-0.652056,0.761144,-0.650101,-0.908961,0.617898,-0.908343
2356,1.726520,-0.171217,1.726897,1.749679,0.295472,1.748590,1.349278,1.323353,1.349432,-1.589618,-0.601195,-1.585041
2325,1.676941,0.376972,1.677211,0.710359,1.098699,0.711472,0.771818,1.118913,0.773742,-0.156656,0.211534,-0.152033


In [52]:
# Show which feature columns the model was trained on.
print(X_test.columns)
# Look up the same row label in the raw data to see the unscaled values.
# This relies on X_test's row labels matching the row labels of `data`.
data.loc[[503], X_test.columns]

Index(['Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls',
       'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins',
       'Intl Calls', 'Intl Charge'],
      dtype='object')


Unnamed: 0,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge
503,217.2,138,36.92,145.5,111,12.37,280.7,76,12.63,9.3,3,2.51
