# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# One theme call is enough: the style chosen by the earlier
# sns.set(style="white") call was immediately replaced by this one.
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Load the three source tables (churn, customer and internet details), then
# print a preview and the (rows, columns) shape of each one in turn.
churn_data = pd.read_csv("../input/logisticregression-telecomcustomer-churmprediction/churn_data.csv")
customer_data = pd.read_csv("../input/logisticregression-telecomcustomer-churmprediction/customer_data.csv")
internet_data = pd.read_csv("../input/logisticregression-telecomcustomer-churmprediction/internet_data.csv")
for table in (churn_data, customer_data, internet_data):
    print(table.head())
    print(table.shape)

In [None]:
# Combine the three tables into one frame keyed on customerID. Inner joins
# keep only the customers that are present in every table.
temp_df = churn_data.merge(customer_data, on='customerID', how='inner')
init_df = temp_df.merge(internet_data, on='customerID', how='inner')
print(init_df.head())
print(init_df.shape)

In [None]:
# Check how balanced the target is: draw one bar per Churn value whose height
# is the number of rows with that value.
sns.countplot(x="Churn", data = init_df, palette="hls")
plt.show()

**The graph shows that our data is imbalanced.**

**Let us explore the data.**

In [None]:
# Mean of every numeric column per Churn group. numeric_only=True skips the
# string columns, which pandas >= 2.0 would otherwise reject with a TypeError.
init_df.groupby("Churn").mean(numeric_only=True)

In [None]:
%matplotlib inline
pd.crosstab(init_df.Contract,init_df.Churn).plot(kind="bar")
plt.title("frequency of churn vs Contract ")
plt.xlabel("Contract")
plt.ylabel("no of churn")

In [None]:
%matplotlib inline
pd.crosstab(init_df.PaymentMethod,init_df.Churn).plot(kind="bar")
plt.title("frequency of churn vs PaymentMethod ")
plt.xlabel("PaymentMethod")
plt.ylabel("no of churn")

In [None]:
%matplotlib inline
pd.crosstab(init_df.PhoneService,init_df.Churn).plot(kind="bar")
plt.title("frequency of PhoneService vs PaymentMethod ")
plt.xlabel("PhoneService")
plt.ylabel("no of churn")

In [None]:
%matplotlib inline
pd.crosstab(init_df.Contract,init_df.Churn).plot(kind="bar")
plt.title("frequency of Contract vs PaymentMethod ")
plt.xlabel("Contracte")
plt.ylabel("no of churn")

In [None]:
%matplotlib inline
pd.crosstab(init_df.PaperlessBilling,init_df.Churn).plot(kind="bar")
plt.title("frequency of PaperlessBilling vs PaymentMethod ")
plt.xlabel("PaperlessBilling")
plt.ylabel("no of churn")

In [None]:
%matplotlib inline
pd.crosstab(init_df.gender,init_df.Churn).plot(kind="bar")
plt.title("frequency of gender vs PaymentMethod ")
plt.xlabel("gender")
plt.ylabel("no of churn")

In [None]:
# Keep only the rows whose TotalCharges holds a real number.
# The conversion step later in this file notes that TotalCharges is imported
# as a string; np.isnan() raises TypeError on such a column, so build the mask
# from a numeric view instead (unparseable values become NaN via 'coerce').
# .copy() detaches the filtered frame so later column assignments on init_df
# do not trigger SettingWithCopyWarning.
total_charges_numeric = pd.to_numeric(init_df['TotalCharges'], errors='coerce')
init_df = init_df[total_charges_numeric.notna()].copy()


In [None]:
# The variable was imported as a string, so convert it to a numeric column.
# Values that cannot be parsed become NaN (errors='coerce'). pd.to_numeric
# handles the whole column in one call, so no per-element apply is needed.
init_df['TotalCharges'] = pd.to_numeric(init_df['TotalCharges'], errors='coerce')
init_df.columns

In [None]:
# Create dummy (one-hot) columns for every categorical variable listed below.
# One indicator column named <variable>_<value> is produced per distinct value.
cat_vars = [
    'PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
    'gender', 'Partner', 'Dependents', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
]
for var in cat_vars:
    # Append this variable's indicator columns to the working frame.
    init_df = init_df.join(pd.get_dummies(init_df[var], prefix=var))

# Keep every column except the original, un-encoded categorical ones.
data_vars = init_df.columns.values.tolist()
to_keep = [col for col in data_vars if col not in cat_vars]

# Build the modelling frame once (the previous version built it twice and threw
# the first result away). rename() without inplace returns a new frame, so the
# column selection taken from init_df is never modified in place.
# Churn_Yes becomes the target column "Churn"; Churn_No and customerID are
# still present here and are deleted by the cell that follows.
data_final = init_df[to_keep].rename(columns={'Churn_Yes': 'Churn'})

data_final.columns.values
    


In [None]:
# Remove the columns that are not model inputs: Churn_No is the other dummy
# produced for Churn (the kept one was renamed to "Churn"), and customerID is
# the key the source tables were merged on.
del data_final['Churn_No']
del data_final['customerID']

In [None]:
# Print the column names, dtypes and non-null counts of the modelling frame.
data_final.info()

In [None]:
# Drop every row that still contains a missing value. The name is rebound
# rather than using inplace=True so a frame that was sliced out of init_df is
# not mutated in place (pandas flags that with SettingWithCopyWarning).
data_final = data_final.dropna()

In [None]:
# To balance the data we use SMOTE (Synthetic Minority Oversampling Technique).
# SMOTE is fitted on the training split only, so the test split is left
# untouched for evaluation.

X = data_final.loc[:, data_final.columns != "Churn"]
Y = data_final.loc[:, data_final.columns == "Churn"]
from imblearn.over_sampling import SMOTE
# Named `smote` rather than `os` so that the standard-library `os` module
# imported at the top of the file is not shadowed.
smote = SMOTE(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Oversample the training split and restore the original column labels.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=X_train.columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=y_train.columns)

# Report the class counts after oversampling. The messages previously talked
# about "subscription", but the target in this file is Churn.
n_total = len(os_data_X)
n_no_churn = len(os_data_y[os_data_y['Churn'] == 0])
n_churn = len(os_data_y[os_data_y['Churn'] == 1])
print("length of oversampled data is ", n_total)
print("Number of no churn in oversampled data", n_no_churn)
print("Number of churn", n_churn)
print("Proportion of no churn data in oversampled data is ", n_no_churn / n_total)
print("Proportion of churn data in oversampled data is ", n_churn / n_total)


In [None]:
# Preview the first rows of the oversampled training features.
os_data_X.head()

In [None]:
# Apply min-max scaling to the continuous features so that they are on a scale
# comparable to the 0/1 dummy columns when the logistic regression weighs them.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
conti_vars = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Learn each column's min/max from the training data and scale it.
os_data_X[conti_vars] = scaler.fit_transform(os_data_X[conti_vars])
# Scale the test set with the min/max learned from the training data.
# Calling fit_transform here (as before) re-fitted the scaler on the test set,
# so train and test values were mapped with different parameters.
# .copy() detaches X_test from the frame it was split from before assigning.
X_test = X_test.copy()
X_test[conti_vars] = scaler.transform(X_test[conti_vars])


In [None]:
# Preview the training features again after scaling the continuous columns.
os_data_X.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training features.
os_data_X.info()

In [None]:
# Use RFE (recursive feature elimination) with a logistic regression estimator
# to select 10 features from the oversampled training data.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')

# n_features_to_select is passed by keyword: current scikit-learn releases no
# longer accept it as a positional argument.
rfe = RFE(logreg, n_features_to_select=10)
# ravel() turns the single-column target frame into the 1-D array fit() expects.
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)
# Keep only the columns RFE marked as selected.
col_final = os_data_X.columns[rfe.support_]
os_data_X = os_data_X[col_final]


In [None]:
# Display the names of the columns kept by RFE.
col_final

In [None]:
from statsmodels.stats.outliers_influence  import  variance_inflation_factor 
def get_VIF(X_train):
    """Print the variance inflation factor (VIF) of every feature column.

    Args:
        X_train: DataFrame of feature columns; its values are passed to
            statsmodels' ``variance_inflation_factor`` as the design matrix.

    Returns:
        None. A table with the columns ``Features`` and ``VIF``, sorted by
        VIF in descending order, is printed.
    """
    # Build the design matrix once instead of calling X_train.values again
    # for every column.
    design = X_train.values
    vif = pd.DataFrame()
    vif['Features'] = X_train.columns
    vif['VIF'] = [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ]
    # Highest VIF first, so the most collinear features head the table.
    vif = vif.sort_values(by="VIF", ascending=False)
    print(vif)

In [None]:
# Implementing the model: fit a statsmodels Logit of the oversampled target on
# the selected features, print the fit summary, then print the VIF table.
# NOTE(review): os_data_X is passed as-is, with no constant column added here —
# confirm that fitting without an intercept is intended.

import statsmodels.api as sm
logit_model=sm.Logit(os_data_y,os_data_X)
result=logit_model.fit()
print(result.summary())
get_VIF(os_data_X)

In [None]:
# Partner_No has a p-value above 0.05, so it is removed from the feature list.
col_todel = ["Partner_No"]
col_final = list(filter(lambda name: name not in col_todel, col_final))
col_final

In [None]:
# Restrict the training features to the columns still listed in col_final.
os_data_X = os_data_X[col_final]

In [None]:
# Refit the Logit model on the reduced feature set, then print the new fit
# summary and VIF table to check the remaining features.
logit_model=sm.Logit(os_data_y,os_data_X)
result=logit_model.fit()
print(result.summary())
get_VIF(os_data_X)

In [None]:
# gender_Male has a p-value above 0.05, so it is removed from the feature list.
col_todel = ["gender_Male"]
col_final = list(filter(lambda name: name not in col_todel, col_final))
col_final

In [None]:
# Restrict the training features to the columns still listed in col_final.
os_data_X = os_data_X[col_final]

In [None]:
# Refit the Logit model once more on the reduced feature set, then print the
# fit summary and VIF table to check the remaining features.
logit_model=sm.Logit(os_data_y,os_data_X)
result=logit_model.fit()
print(result.summary())
get_VIF(os_data_X)


In [None]:
# Logistic Regression Model Fitting
# os_data_y is a single-column DataFrame; ravel() hands scikit-learn the 1-D
# label array it expects instead of a column vector (which triggers a
# DataConversionWarning).
logreg.fit(os_data_X, os_data_y.values.ravel())

In [None]:
# Keep only the columns listed in col_final in the test set, matching the
# columns the training features were reduced to.
X_test= X_test[col_final]

In [None]:
# Predict labels for the test set, then report the classifier's score on the
# test split and on the oversampled training split.
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print ('Accuracy of Logistic Regression classifier on test set : {:.2f}'.format(test_accuracy))
train_accuracy = logreg.score(os_data_X, os_data_y)
print ('Accuracy of Logistic Regression classifier on train set : {:.2f}'.format(train_accuracy))

In [None]:
 #confusion matrix
from sklearn.metrics import confusion_matrix
# Store the result under its own name so that the imported confusion_matrix()
# function is not overwritten by its own return value and stays callable.
cm = confusion_matrix(y_test, y_pred)
print(cm)




In [None]:
# Print scikit-learn's classification report for the test-set predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [None]:
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Use the predicted probability of the second class (column 1) for both the
# AUC and the curve. The AUC was previously computed from the hard 0/1
# predictions, so the area shown in the legend did not describe the plotted
# curve.
churn_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, churn_proba)
fpr, tpr, thresholds = roc_curve(y_test, churn_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area=%0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.title('Receiver operating characteristics')
plt.legend(loc="lower right")
plt.show()