# Modeling Notebook

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import tree

In [None]:
# import dataset
# Forward slashes resolve on Windows, macOS and Linux alike; the previous
# backslash-separated literal only worked as a path on Windows.
df = pd.read_csv('../data/telecom_data.csv')

# Handle object types for international plan and voice mail plan:
# encode 'no' -> 0 and 'yes' -> 1. Any other value becomes NaN, which makes
# the integer cast further down raise instead of passing through silently.
plan_codes = {'no': 0, 'yes': 1}
for plan_col in ['international plan', 'voice mail plan']:
    df[plan_col] = df[plan_col].map(plan_codes)

# Change churn to values: 1 (churned/True) 0 (no churn/False)
# A direct cast avoids writing integers into a boolean column through masks.
df['churn'] = df['churn'].astype(int)

# going to create backup df and drop phone number from original df
# phone number could be used as unique id, but it doesn't seem necessary
df_backup = df.copy()

# dropping phone number and area code
df = df.drop(columns=['phone number', 'area code'])

# casting int values to churn, voice mail plan, and international plan cols
# (one astype call with a column -> dtype mapping instead of one call per column)
objs = ['international plan', 'voice mail plan', 'churn']
df = df.astype({col: int for col in objs})

# check df
df.head()

Inspect the distribution of total international calls so the counts can be binned into a few categories.

In [None]:
# Frequency of each international-call count; used to choose the bin edges
# for the low / moderate / high categories created further down.
df['total intl calls'].value_counts()

In [None]:
# Box plot of international call counts. The figure is titled and labelled so
# it can be read on its own, and plt.show() keeps the artist dict returned by
# boxplot() out of the cell output.
fig, ax = plt.subplots()
ax.boxplot(df['total intl calls'])
ax.set_title('Distribution of total international calls')
ax.set_ylabel('total intl calls')
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the raw
# international-call counts.
df['total intl calls'].describe()

In [None]:
# range is 0-20 for international calls with most concentrated from 0-10
# Bin into categories: low (<3), moderate (3-6) and high (>6).
#
# np.select evaluates both conditions over the whole column at once, which
# replaces the row-by-row iterrows() loop. Rows matching neither condition
# (values 3-6, and any NaN) take the default 'moderate', which is what the
# loop's else branch produced.
intl_calls = df['total intl calls']

df['total_intl_calls'] = np.select(
    [intl_calls < 3, intl_calls > 6],
    ['low', 'high'],
    default='moderate',
)

df['total_intl_calls'].describe()

What share of predictions would be correct if we simply assumed that no customer churns? This majority-class rate is the accuracy any model has to beat.

In [None]:
# Majority-class baseline: churn is coded 0/1, so 1 - mean(churn) is the
# fraction of customers who did not churn.
no_churn_rate = 1 - df['churn'].mean()
no_churn_rate

In [None]:
# Column dtypes and non-null counts; shows which columns are still object-typed.
df.info()

Convert the two remaining object-typed columns, `state` and `total_intl_calls`, into integer codes for later modeling.

In [None]:
# Two-letter codes for the 50 states plus DC. The position of a code in this
# list is used as that state's integer code, so the order must not change.
states = (
    "AL AK AZ AR CA CO CT DC DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD MA "
    "MI MN MS MO MT NE NV NH NJ NM NY "
    "NC ND OH OK OR PA RI SC SD TN TX "
    "UT VT VA WA WV WI WY"
).split()

In [None]:
# Integer code for each row's state: its position in `states`.
# list.index raises ValueError if a row holds a code missing from the list.
state_int = [states.index(state_code) for state_code in df['state']]

In [None]:
# Attach the per-row state codes as a new column (list order matches row order).
df['state_int'] = state_int

In [None]:
# Integer codes for the call-volume bins: 'low' -> 0, 'moderate' -> 1, and
# every other label (i.e. 'high') -> 2.
bin_codes = {'low': 0, 'moderate': 1}
intl_calls_int = [bin_codes.get(label, 2) for label in df['total_intl_calls']]

df['intl_calls_bins'] = intl_calls_int

In [None]:
# Columns treated as categorical: these are one-hot encoded, while every other
# feature column is min-max scaled.
cats = ['state_int', 'international plan', 'intl_calls_bins',
        'customer service calls', 'voice mail plan']

# Baseline Model

In [None]:
# Target vector
y = df['churn']

# Feature matrix: everything except the target, the columns that were
# re-encoded into integer versions ('total intl calls', 'total_intl_calls',
# 'state'), and 'number vmail messages' (dropped because its distribution
# isn't normal).
excluded_cols = ['churn', 'total intl calls', 'number vmail messages',
                 'total_intl_calls', 'state']
X = df.drop(columns=excluded_cols)

# Fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Fixing class imbalance with SMOTE (training split only; the test split is
# left untouched).
# - random_state pins the synthetic samples so reruns give the same data.
# - fit_resample replaces fit_sample, which was deprecated and then removed
#   from imbalanced-learn.
# NOTE(review): SMOTE interpolates every column, including the integer-coded
# categorical ones listed in `cats`; SMOTENC may be the better fit - confirm.
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Categorical columns of the resampled training features (all rows).
X_train_cats = X_train_resampled.loc[:, cats]
X_train_cats

In [None]:
# handle categorical values
# NOTE: newer scikit-learn releases renamed `sparse` to `sparse_output`;
# switch the keyword if this line raises a TypeError on your version.
ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

ohe.fit(X_train_cats)

# Label each dummy column "<source column>_<category>". Using the raw category
# values as labels (np.hstack(ohe.categories_)) repeated the same integers
# (0, 1, 2, ...) across several features, so columns could not be told apart
# and integer labels ended up mixed with the string labels of the numeric
# features. ohe.categories_ is ordered like the columns the encoder was fit on.
ohe_columns = [
    f"{source_col}_{category}"
    for source_col, categories in zip(X_train_cats.columns, ohe.categories_)
    for category in categories
]

X_train_ohe = pd.DataFrame(
    ohe.transform(X_train_cats),
    index=X_train_cats.index,
    columns=ohe_columns,
)
X_train_ohe

In [None]:
# Everything that is not in `cats` is handled as a numeric feature.
X_train_numerics = X_train_resampled.drop(columns=cats)
X_train_numerics

In [None]:
# Scaling variables to work well with OHE data

# fit() returns the fitted scaler, so construction and fitting are chained.
scaler = MinMaxScaler().fit(X_train_numerics)

# transform() returns a plain array; wrap it so row index and column names
# carry over from the unscaled frame.
scaled_values = scaler.transform(X_train_numerics)
X_train_scaled = pd.DataFrame(
    scaled_values,
    index=X_train_numerics.index,
    columns=X_train_numerics.columns,
)
X_train_scaled

In [None]:
# Final training design matrix: scaled numeric columns followed by the
# one-hot columns, placed side by side on the shared row index.
X_train_full = pd.concat([X_train_scaled, X_train_ohe], axis=1)
X_train_full

In [None]:
# model
# Settings are collected in one place: no intercept term, a very large C
# (C is the inverse regularization strength), the liblinear solver, and a
# fixed seed.
logreg_settings = dict(
    fit_intercept=False,
    C=1e12,
    solver='liblinear',
    random_state=1,
)
logreg = LogisticRegression(**logreg_settings)

# fit() returns the estimator itself, so model_log and logreg are the same object.
model_log = logreg.fit(X_train_full, y_train_resampled)
model_log

In [None]:
# Model Evaluation
y_hat_train = logreg.predict(X_train_full)

# Labels are 0/1, so |actual - predicted| is 0 for a correct prediction and
# 1 for a miss.
train_residuals = np.abs(y_train_resampled - y_hat_train)

train_miss_counts = pd.Series(
    train_residuals, name="Residuals (counts)"
).value_counts()
train_miss_shares = pd.Series(
    train_residuals, name="Residuals (proportions)"
).value_counts(normalize=True)

# Counts first, a blank line, then proportions.
print(train_miss_counts, train_miss_shares, sep="\n\n")

Train set is about 80% accurate.

In [None]:
# performance evaluation on test set.
#
# The test split is transformed with the encoder (`ohe`) and scaler (`scaler`)
# that were fitted on the training data. Fitting fresh ones on the test split
# would leak test-set statistics into the features and could yield a different
# set of dummy columns and a different scaling from what `logreg` was trained
# on.
X_test_cats = X_test[cats]

# handle categorical values
# Categories never seen in training encode as all zeros, because the encoder
# was created with handle_unknown="ignore". Column labels are copied from the
# training frame so both frames line up.
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test_cats),
    index=X_test_cats.index,
    columns=X_train_ohe.columns
)

X_test_numerics = X_test.drop(cats, axis=1)

# Scaling variables to work well with OHE data
# The scaler keeps the training min/max, so test values outside the training
# range can fall slightly outside [0, 1].
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_numerics),
    index=X_test_numerics.index,
    columns=X_test_numerics.columns
)

X_test_full = pd.concat([X_test_scaled, X_test_ohe], axis=1)
X_test_full

In [None]:
# Predict churn for the held-out test features.
y_hat_test = logreg.predict(X_test_full)

# Labels are 0/1, so |actual - predicted| is 0 for a correct prediction and
# 1 for a miss.
test_residuals = np.abs(y_test - y_hat_test)

test_miss_counts = pd.Series(
    test_residuals, name="Residuals (counts)"
).value_counts()
test_miss_shares = pd.Series(
    test_residuals, name="Residuals (proportions)"
).value_counts(normalize=True)

# Counts first, a blank line, then proportions.
print(test_miss_counts, test_miss_shares, sep="\n\n")

Test set is about 73% accurate.

# Model 2

In [None]:
# random forest model
# RandomForestClassifier comes from the notebook's import cell; the duplicate
# mid-notebook import was removed so all dependencies stay in one place.
# 100 trees, depth capped at 13, fixed seed for repeatable results.
forest = RandomForestClassifier(n_estimators=100, max_depth=13, random_state=1)
forest.fit(X_train_full, y_train_resampled)

In [None]:
# Training accuracy: mean accuracy of the forest on the (SMOTE-resampled)
# data it was fitted on.
forest.score(X_train_full, y_train_resampled)

In [None]:
# Testing accuracy: mean accuracy of the forest on the held-out test split.
forest.score(X_test_full, y_test)

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object with a ``feature_importances_`` attribute
        For example a fitted random forest.
    feature_names : sequence of labels, optional
        One label per feature, in the column order the model was fitted on.
        When omitted, the columns of the notebook-level ``X_train_full``
        frame are used, which matches the previous behaviour.
    """
    if feature_names is None:
        feature_names = X_train_full.columns.values
    n_features = len(feature_names)
    positions = np.arange(n_features)

    # Tall figure so that one bar per feature stays readable.
    plt.figure(figsize=(8, 16))
    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [None]:
# Feature importances of the random forest fitted above.
plot_feature_importances(forest)

# Model 3 - Optimize RF Model

In [None]:
# Using a pipeline to select for optimal parameters in the RF Classifier
pipe = Pipeline([('clf', RandomForestClassifier(random_state=1))])

# Integers 1..16; min_samples_split skips 1 via param_range[1:].
param_range = np.arange(1, 17, 1)

# Search size: 1 x 2 x 16 x 15 x 16 = 7,680 parameter combinations. With
# cv=10 that is 76,800 forest fits, so expect a long run time.
grid_params = [{'clf__n_estimators': [100],
                'clf__criterion': ['gini', 'entropy'],
                'clf__max_depth': param_range,
                'clf__min_samples_split': param_range[1:],
                'clf__min_samples_leaf': param_range,}]

# grid search
# n_jobs=-1 spreads the fits across all available CPU cores; the forest's
# random_state is fixed, so this only affects wall-clock time.
# NOTE(review): the folds are drawn from the SMOTE-resampled training data, so
# synthetic rows and the real rows they were generated from can land on
# opposite sides of a fold split; the CV accuracy is probably optimistic -
# confirm against the untouched test split.
gs = GridSearchCV(estimator=pipe,
                  param_grid=grid_params,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

# Fit using grid search
gs.fit(X_train_full, y_train_resampled)

# Best accuracy
print('Best accuracy: %.3f' % gs.best_score_)

# Best params
print('\nBest params:\n', gs.best_params_)