
![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)
# Lab | Imbalanced data

We will be using the `files_for_lab/customer_churn.csv` dataset to build a churn predictor.

# Instructions

1. Load the dataset and explore the variables.
2. We will try to predict the variable `Churn` using a logistic regression on the variables `tenure`, `SeniorCitizen`, and `MonthlyCharges`.
3. Extract the target variable.
4. Extract the independent variables and scale them.
5. Build the logistic regression model.
6. Evaluate the model.
7. Even a simple model will give us more than 70% accuracy. Why?
8. **Synthetic Minority Oversampling TEchnique (SMOTE)** is an oversampling technique based on nearest neighbors that adds new points between existing points. Apply `imblearn.over_sampling.SMOTE` to the dataset. Build and evaluate the logistic regression model. Is there any improvement?


# Imports

In [141]:
# Requires imbalanced-learn (imported as `imblearn`): %pip install imbalanced-learn

In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.utils import resample
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE


# Load dataset, clean & explore

In [143]:
# Load the customer-churn CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('files_for_lab/customer_churn.csv')

In [144]:
# Normalise column names: lower-case, with spaces and dots turned into underscores.
# regex=False is essential for the '.' replacement: as a regular expression '.'
# matches ANY character, so with regex=True (pandas >= 2.0) every character of
# every column name would be replaced by '_'.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

In [145]:
# Show the normalised column names.
df.columns 

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [146]:
# Count missing values per column (part of the "explore the variables" step).
df.isna().sum()

# X-y split

In [147]:
# X/y split
# Select the three predictors the lab asks for explicitly, rather than listing
# every other column to drop: the selection stays correct if the CSV gains or
# loses unrelated columns. Column order matches the order in the source frame.
X = df[['seniorcitizen', 'tenure', 'monthlycharges']].copy()
# Target: the raw churn labels (encoded to 0/1 in a later cell).
y = df['churn']

In [148]:
# Encode the target as integers: 'No' -> 0, 'Yes' -> 1.
# Built from df['churn'] without mutating anything in place, so the cell can be
# re-run safely and df keeps its original labels. The explicit int64 cast avoids
# relying on replace() silently downcasting an object column (deprecated in
# recent pandas), and fails loudly if a label other than 'No'/'Yes' appears
# (map() would yield NaN for it).
# NOTE(review): assumes churn contains only 'No' and 'Yes' -- confirm against the data.
y = df['churn'].map({'No': 0, 'Yes': 1}).astype('int64')

In [149]:
# Hold out a test split; test_size=0.25 is scikit-learn's default, spelled out here.
# The seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [150]:
# Learn the scaling parameters from the training split only, so no information
# from the test split leaks into the preprocessing.
transformer = StandardScaler()
transformer.fit(X_train)

# Scaling is the only transformation applied to these numeric features, so the
# test split can be transformed right away with the same fitted scaler.
# Wrapping the arrays in new DataFrames gives both a fresh 0..n-1 index.
X_train_scaled = pd.DataFrame(
    transformer.transform(X_train),
    columns=X.columns,
)
X_test_scaled = pd.DataFrame(
    transformer.transform(X_test),
    columns=X.columns,
)
X_train_scaled.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,-0.435476,-0.340191,-0.514314
1,2.296336,0.88321,0.01784
2,-0.435476,-1.196572,-0.819594
3,-0.435476,1.16867,-1.483535
4,-0.435476,-0.829552,0.658427


In [151]:
# Give the targets a fresh 0..n-1 index so they line up row-for-row with the
# scaled feature frames (which were rebuilt with a default index).
y_train, y_test = (
    target.reset_index(drop=True) for target in (y_train, y_test)
)

# Logistic Regression without adjustments regarding the imbalance of the data 

In [152]:
# Baseline: fit on the scaled training data without any class rebalancing.
LR = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_scaled, y_train)

# Accuracy on the held-out split.
LR.score(X_test_scaled, y_test)

0.7808063600227144

In [153]:
# Predicted class labels for the test split from the baseline model.
pred = LR.predict(X_test_scaled)

In [154]:
# Peek at the predicted labels (NumPy abbreviates long arrays when displaying them).
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [155]:
# Report precision, recall and F1 for the positive class (churn == 1);
# these are more informative than accuracy on imbalanced data.
for label, metric in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric(y_test, pred))

precision:  0.6115942028985507
recall:  0.4557235421166307
f1:  0.5222772277227723


# Balance data using SMOTE

In [156]:
# Balance the TRAINING split only with SMOTE; the test split stays untouched.
# Synthetic minority samples are interpolated using 3 nearest neighbours,
# with a fixed seed for reproducibility.
sm = SMOTE(k_neighbors=3, random_state=100)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(
    X_train_scaled,
    y_train,
)

In [157]:
# Refit the logistic regression on the SMOTE-balanced training data and
# predict on the original (unbalanced) test split.
LR = LogisticRegression(max_iter=1000).fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_scaled)

In [158]:
# Precision, recall and F1 for the positive class after training on SMOTE data.
for label, metric in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric(y_test, pred))

precision:  0.4776978417266187
recall:  0.7170626349892009
f1:  0.5734024179620035


# Balance data using random oversampling

In [159]:
# Put the scaled features and the target side by side so rows can be filtered by class.
# Column-wise concat aligns on the index; both inputs were given a fresh 0..n-1 index earlier.
train = pd.concat([X_train_scaled, y_train],axis=1)

In [160]:
# Inspect the combined training frame (pandas truncates the display of long frames).
train

Unnamed: 0,seniorcitizen,tenure,monthlycharges,churn
0,-0.435476,-0.340191,-0.514314,0
1,2.296336,0.883210,0.017840,0
2,-0.435476,-1.196572,-0.819594,1
3,-0.435476,1.168670,-1.483535,0
4,-0.435476,-0.829552,0.658427,1
...,...,...,...,...
5277,-0.435476,-0.707212,1.292341,0
5278,-0.435476,-0.911112,0.880297,0
5279,-0.435476,1.046330,-1.466853,0
5280,2.296336,-1.278132,0.167978,1


In [161]:
# separate majority/minority classes
not_churned = train[train['churn']==0]
churned = train[train['churn']==1]

In [162]:
# oversample minority
churned_oversampled = resample(churned, 
                                    replace=True,
                                    n_samples = len(not_churned),
                                    random_state=0)

In [163]:
# Stack the majority rows and the oversampled minority rows into one balanced training frame.
train_oversampled = pd.concat([not_churned, churned_oversampled])
train_oversampled.head(5)

Unnamed: 0,seniorcitizen,tenure,monthlycharges,churn
0,-0.435476,-0.340191,-0.514314,0
1,2.296336,0.88321,0.01784,0
3,-0.435476,1.16867,-1.483535,0
5,-0.435476,1.04633,0.503285,0
6,-0.435476,-0.258631,0.7018,0


In [164]:
# Split the balanced frame back into features and target (independent copies).
X_train_over = train_oversampled.drop(columns='churn').copy()
y_train_over = train_oversampled['churn'].copy()

In [165]:
# Refit the logistic regression on the randomly oversampled training data and
# evaluate on the untouched test split. In the recorded run, recall for churners
# is clearly higher than for the unbalanced baseline, at the cost of lower precision.
LR = LogisticRegression(max_iter=1000).fit(X_train_over, y_train_over)
pred = LR.predict(X_test_scaled)

for label, metric in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric(y_test, pred))

precision:  0.4790764790764791
recall:  0.7170626349892009
f1:  0.57439446366782
