# Challenge 2 - Logistic Regression

# Before you start:

    Read the README.md file
    Comment as much as you can and use the resources (README.md file)
    Happy learning!

In [178]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the raw customer table; the path is relative to the notebook's working directory.
churn = pd.read_csv('Customer-Churn.csv')

# TotalCharges is handled as text here because some cells are blank rather than numeric.
# Only cells that are entirely empty/whitespace are set to '0'. The pattern is anchored
# (^...$) so a space *inside* a value is never rewritten into a digit, which would
# silently corrupt the number (e.g. '1 234.5' -> '10234.5'); such a value now fails
# loudly in the float conversion instead.
churn['TotalCharges'] = (
    churn['TotalCharges']
    .str.replace(r'^\s*$', '0', regex=True)
    .astype('float64')
)
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Data Pre-processing (Handling Numerical variables)

In [179]:
# Numerical predictors, selected by label in a fixed column order.
nums = churn.loc[:, ['MonthlyCharges', 'TotalCharges', 'tenure']]
nums.head()

Unnamed: 0,MonthlyCharges,TotalCharges,tenure
0,29.85,29.85,1
1,56.95,1889.5,34
2,53.85,108.15,2
3,42.3,1840.75,45
4,70.7,151.65,2


### MinMax Scaler

Hint: Since the numerical variables are stored together in one DataFrame (called "numerics" in the exercise, `nums` in this notebook), that DataFrame can be passed directly to the scaler,
e.g. MinMaxScaler().fit(nums)

In [180]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()

# fit_transform returns a bare NumPy array, so the column labels and the row index
# are re-attached explicitly. Keeping nums.index (instead of a fresh 0..n-1 index)
# keeps the scaled rows aligned with the source rows when frames are later
# concatenated column-wise, which aligns on the index.
nums_minmaxscaled = pd.DataFrame(
    minmax.fit_transform(nums),
    columns=nums.columns,
    index=nums.index,
)
nums_minmaxscaled.head()

Unnamed: 0,MonthlyCharges,TotalCharges,tenure
0,0.115423,0.003437,0.013889
1,0.385075,0.217564,0.472222
2,0.354229,0.012453,0.027778
3,0.239303,0.211951,0.625
4,0.521891,0.017462,0.027778


### Data Pre-processing (Handling Categorical variables)

In [181]:
# Categorical predictors. Everything is cast to text so that the 0/1-coded
# SeniorCitizen column is handled as labels like the other columns.
cats = churn.loc[
    :, ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract']
].astype('str')
cats.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,Contract
0,Female,0,Yes,No,Month-to-month
1,Male,0,No,No,One year
2,Male,0,No,No,Month-to-month
3,Male,0,No,No,One year
4,Female,0,No,No,Month-to-month


### Using One Hot Encoding 

In [182]:
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder()

# Fit the encoder and densify its sparse output into a plain 0/1 array.
cats_encoded = onehot.fit_transform(cats).toarray()

# Build the "<column>-<category>" labels from the encoder's own fitted categories
# rather than re-deriving them with a separate sort of the unique values: the
# labels are then guaranteed to be in the same order as the encoded columns.
cats_onehotcols = [
    f'{column}-{category}'
    for column, categories in zip(cats.columns, onehot.categories_)
    for category in categories
]

# Keep cats.index so the encoded rows stay aligned with the source rows.
cats_onehotenc = pd.DataFrame(cats_encoded, columns=cats_onehotcols, index=cats.index)
cats_onehotenc.head()

Unnamed: 0,gender-Female,gender-Male,SeniorCitizen-0,SeniorCitizen-1,Partner-No,Partner-Yes,Dependents-No,Dependents-Yes,Contract-Month-to-month,Contract-One year,Contract-Two year
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [185]:
# Feature matrix: scaled numerical columns first, one-hot categorical columns after,
# joined side by side (rows are matched on the index).
X = pd.concat((nums_minmaxscaled, cats_onehotenc), axis='columns')
X.head()

Unnamed: 0,MonthlyCharges,TotalCharges,tenure,gender-Female,gender-Male,SeniorCitizen-0,SeniorCitizen-1,Partner-No,Partner-Yes,Dependents-No,Dependents-Yes,Contract-Month-to-month,Contract-One year,Contract-Two year
0,0.115423,0.003437,0.013889,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.385075,0.217564,0.472222,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.354229,0.012453,0.027778,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.239303,0.211951,0.625,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.521891,0.017462,0.027778,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [69]:
# Target: the Churn label, kept as a single-column DataFrame.
Y = churn.loc[:, ['Churn']]
Y.head()

Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes


In [186]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_curve

# The target has two classes, so no multi-class strategy needs to be specified;
# the explicit `multi_class` argument is deprecated in recent scikit-learn releases
# and is therefore omitted.
lm = LogisticRegression(solver = 'lbfgs')

# Estimators expect a 1-D target; np.ravel flattens the single-column Y so the fit
# does not rely on an implicit column-vector conversion. fit() returns the estimator
# itself, so `model` and `lm` refer to the same fitted object.
model = lm.fit(X, np.ravel(Y))

print(f'The regression model intercept is {model.intercept_}')
print(f'The regression model coefficients are {model.coef_}, respectively')

The regression model intercept is [-1.81482147]
The regression model coefficients are [[ 2.47038715  0.88499942 -3.1381904   0.00780939 -0.00746579 -0.20702189
   0.20736549 -0.00961751  0.00996111  0.12646418 -0.12612058  0.97473729
  -0.03080935 -0.94358434]], respectively


In [187]:
# Predicted class labels for X, the same feature matrix the model was fitted on,
# so anything computed from these predictions is an in-sample measure.
predictions = model.predict(X=X)
predictions

array(['No', 'No', 'No', ..., 'No', 'Yes', 'No'], dtype=object)

In [211]:
# Score the predictions against the true labels, treating 'Yes' as the positive class.
accuracy = accuracy_score(Y, predictions)
precision = precision_score(Y, predictions, pos_label = 'Yes')
# Rows are true labels and columns are predicted labels, both ordered ['Yes', 'No'].
confusion = confusion_matrix(Y, predictions, labels = ['Yes', 'No'])

print('Measures of accuracy:')
for measure_name, measure_value in (
    ('accuracy', round(accuracy, 4)),
    ('precision', round(precision, 4)),
    ('confusion', confusion),
):
    print(f'{measure_name} = {measure_value}')

Measures of accuracy:
accuracy = 0.7906
precision = 0.6359
confusion = [[ 922  947]
 [ 528 4646]]
