<a href="https://colab.research.google.com/github/sabinamunoz/lab-cross-validation/blob/master/lab_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import tree

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
churn_data = pd.read_csv('files_for_lab/Customer-Churn.csv')
churn_data.columns = churn_data.columns.str.lower().str.replace(' ', '_')
churn_data

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [None]:
churn_data.loc[churn_data['totalcharges'] == ' ', 'totalcharges'] = np.nan
churn_data['totalcharges'] = churn_data['totalcharges'].astype('float')

In [None]:
churn_data['totalcharges'].fillna(churn_data['totalcharges'].mean(), inplace=True)

In [None]:
df = churn_data.copy()

In [None]:
cols_scale = df[['tenure', 'seniorcitizen', 'monthlycharges', 'totalcharges']].copy()

In [None]:
scaler = StandardScaler()
cols_scale = scaler.fit_transform(cols_scale)

In [None]:
df.update(cols_scale)

In [None]:
y = df['churn']
X = df.drop('churn', axis=1)

In [None]:
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns.tolist())
X

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,...,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,0,1,29.85,29.85,1,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.50,0,1,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
2,0,2,53.85,108.15,0,1,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
3,0,45,42.30,1840.75,0,1,1,0,1,0,...,1,1,0,0,1,0,0,0,1,0
4,0,2,70.70,151.65,1,0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,1,0,1,0,1,...,1,0,0,1,0,0,1,0,1,0
7039,0,72,103.20,7362.90,1,0,0,1,0,1,...,0,0,0,1,0,0,1,0,1,0
7040,0,11,29.60,346.45,1,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
7041,1,4,74.40,306.60,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 1. Apply SMOTE for upsampling

In [None]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()

No     3635
Yes    3635
Name: churn, dtype: int64

### a. Use logistic regression to fit the model

In [None]:
classification = LogisticRegression(random_state=42, max_iter=1000)
classification.fit(X_train_smote, y_train_smote)
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7917652626597255

### b. Use decision tree classifier to fit the model

In [None]:
classification_tree = tree.DecisionTreeClassifier()
classification_tree = classification_tree.fit(X_train, y_train)
classification_tree.score(X_test, y_test)

0.7188831045906294

### c. Compare the accuracies of the two models

* Logistic regressor presents a slightly higher score vs. decision tree classifier - .79 vs .71

## 2. Apply TomekLinks for downsampling

In [None]:
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)
y_train_tomek.value_counts()

No     3210
Yes    1295
Name: churn, dtype: int64

### a. Use logistic regression to fit the model

In [None]:
classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_train_tomek, y_train_tomek)
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.795551348793185

### b. Use decision tree classifier to fit the model

In [None]:
classification_tree = tree.DecisionTreeClassifier()
classification_tree = classification_tree.fit(X_train_tomek, y_train_tomek)
classification_tree.score(X_test, y_test)

0.722669190724089

### c. Compare the accuracies of the two models

* Logistic regressor presents a slightly higher score vs. decision tree classifier - .79 vs .72

### d. apply this algorithm one more time and check for changes

In [None]:
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train_tomek, y_train_tomek)
y_train_tomek.value_counts()

No     3082
Yes    1295
Name: churn, dtype: int64

In [None]:
classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_train_tomek, y_train_tomek)
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.795551348793185

In [None]:
classification_tree = tree.DecisionTreeClassifier()
classification_tree = classification_tree.fit(X_train_tomek, y_train_tomek)
classification_tree.score(X_test, y_test)

0.735447231424515

#### After second pass of tomeklinks
* Logistic regressor shows no changes - .79
* decision tree classifier - slight improvement  from .72 to .73