In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries
import pandas as pd #data processing
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Telco customer-churn dataset from the Kaggle input directory.
mydataset = pd.read_csv("../input/telecocustomerchurn/Telco-Customer-Churn.csv")

# A bare expression is only rendered when it is the last statement of a
# notebook cell, so the preview and the shape are printed explicitly;
# otherwise these two inspections produce no visible output.
print(mydataset.head())

# Number of rows and columns.
print(mydataset.shape)

# Data type of each column.
print(mydataset.dtypes)

# Count of missing (NaN) cells per column.
print(mydataset.isna().sum())

# True if at least one cell in the whole frame is missing.
print(mydataset.isnull().values.any())

# Summary statistics of the dataset.
print(mydataset.describe())

# TotalCharges -> float. Entries that cannot be parsed as numbers become NaN
# (errors='coerce') and are then replaced by the median of the parsed values.
total_charges = pd.to_numeric(mydataset['TotalCharges'], errors='coerce')
mydataset['TotalCharges'] = total_charges.fillna(total_charges.median())

# SeniorCitizen -> label: 'Yes' where the value equals 1, 'No' for anything else.
is_senior = mydataset['SeniorCitizen'].eq(1)
mydataset['SeniorCitizen'] = is_senior.map({True: 'Yes', False: 'No'})

# Number of rows per Churn value.
print(mydataset['Churn'].value_counts())

# Visualise the churn distribution. The column is selected by keyword:
# seaborn 0.12+ reserves the first positional argument for `data`, so a
# Series passed positionally is no longer interpreted as the x variable.
sns.countplot(x='Churn', data=mydataset)


# For every object-typed column, show its distinct values followed by the
# frequency of each value.
object_columns = [name for name in mydataset.columns
                  if mydataset[name].dtypes == object]
for name in object_columns:
    series = mydataset[name]
    print(' : '.join([str(name), str(series.unique())]))
    print(series.value_counts())
    print('------------------------------')
        

# Exploratory analysis on non-continuous features: one count plot per
# categorical column, with bars split by Churn.
def _plot_churn_counts(columns, rotate_labels=()):
    """Draw a 4x2 grid of Churn-split count plots, one panel per column.

    Parameters
    ----------
    columns : sequence of str
        Up to eight column names of ``mydataset``; panels are filled in
        row-major order (left to right, then top to bottom).
    rotate_labels : collection of str, optional
        Column names whose x tick labels are rotated by 45 degrees.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the grid.
    """
    fig, axes = plt.subplots(4, 2, figsize=(15, 18))
    for ax, column in zip(axes.flat, columns):
        # x is passed by keyword: seaborn 0.12+ accepts only `data`
        # positionally, so countplot('col', data=...) raises a TypeError.
        sns.countplot(x=column, hue='Churn', data=mydataset, ax=ax)
        if column in rotate_labels:
            ax.tick_params(axis='x', labelrotation=45)
    return fig


_plot_churn_counts([
    'gender', 'SeniorCitizen',
    'Partner', 'Dependents',
    'PhoneService', 'PaperlessBilling',
    'StreamingMovies', 'StreamingTV',
])

_plot_churn_counts(
    [
        'InternetService', 'DeviceProtection',
        'TechSupport', 'OnlineSecurity',
        'OnlineBackup', 'MultipleLines',
        'PaymentMethod', 'Contract',
    ],
    rotate_labels=('PaymentMethod', 'Contract'),
)


"""It seems that the gender column doesn't have a big effect on the Chur rate.

Churn: 50.73% Males, 49.26% Females
Not Churn: 50.24% Males, 49.75% Females

We can drop the varibale 'gender' as it doesn't effect on churning'"""

# Remove the columns the analysis treats as uninformative: the customer
# identifier and gender.
mydataset = mydataset.drop(columns=['customerID', 'gender'])

# Pairwise correlation between the numeric columns. numeric_only=True is
# needed on pandas >= 2.0, where corr() no longer silently skips string
# columns and raises on them instead. The matrix is computed once, printed
# (a bare expression mid-cell would show nothing) and reused for the heatmap.
correlations = mydataset.corr(numeric_only=True)
print(correlations)

# Heatmap of the correlation matrix.
plt.figure(figsize=(10, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm')

# TotalCharges is highly correlated with MonthlyCharges and tenure, so it is
# removed.
mydataset = mydataset.drop(columns='TotalCharges')


# One-hot encode every categorical column; drop_first=True omits the first
# level of each column. dtype='uint8' pins the indicator dtype: pandas >= 2.0
# defaults to bool, which would make the mixed-dtype feature array below
# dtype=object and break the statsmodels fit that consumes it.
dummy_data = pd.get_dummies(mydataset, drop_first=True, dtype='uint8')
print(dummy_data.head())
print(dummy_data.dtypes)

# Select target and features by name rather than by hard-coded position
# (previously columns 0:28 and -1), so the split stays correct when the number
# of encoded columns changes and the target can never leak into X.
target_columns = [name for name in dummy_data.columns
                  if name.startswith('Churn_')]
if len(target_columns) != 1:
    raise ValueError(
        'Expected exactly one encoded Churn column, found: {}'.format(target_columns)
    )

X = dummy_data.drop(columns=target_columns).to_numpy(dtype=float)
Y = dummy_data[target_columns[0]].to_numpy()


"""#transform data
from sklearn.preprocessing import LabelEncoder
import numpy as np
for column in mydataset.columns:
    if mydataset[column].dtypes==np.number:
        continue
    mydataset[column]= LabelEncoder().fit_transform(mydataset[column])

mydataset.head()
Y_stat = mydataset.iloc[:,-1]
X_stat = mydataset.iloc[:,0:17]"""

# Fit a logit model with statsmodels and print its summary table.
# NOTE(review): statsmodels does not add an intercept on its own and no
# constant column is appended to X in this notebook; confirm that an
# intercept-free model is intended.
import statsmodels.api as sm

logit_model = sm.Logit(endog=Y, exog=X)
result = logit_model.fit()
print(result.summary())



# Hold out 25% of the rows for testing; random_state=0 makes the split
# repeatable.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.25,
    random_state=0,
)

# Standardise the features: the scaler is fitted on the training rows only,
# then the same transformation is applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Logistic regression: fit on the training split, score on the test split.
from sklearn.linear_model import LogisticRegression

logistic_reg = LogisticRegression(random_state=0).fit(X_train, Y_train)
y_pred = logistic_reg.predict(X_test)

acc_lg = accuracy_score(Y_test, y_pred)
cnf_matrix = metrics.confusion_matrix(Y_test, y_pred)

# Report accuracy, per-class metrics and the confusion matrix.
print("Accuracy: {}".format(acc_lg))
print()
print(classification_report(Y_test, y_pred))
print(cnf_matrix)

# Decision tree with default (unrestricted) depth.
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed like for the other models in this notebook:
# scikit-learn permutes the candidate features at every split, so without a
# seed repeated runs can yield different trees and different scores.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
y_pred = decision_tree.predict(X_test)
acc_dt = accuracy_score(Y_test, y_pred)

# Report accuracy, per-class metrics and the confusion matrix.
print("Accuracy: {}".format(acc_dt))
print()
print(classification_report(Y_test, y_pred))
cnf_matrix = metrics.confusion_matrix(Y_test, y_pred)
print(cnf_matrix)

"""
from sklearn import tree
from IPython.display import Image
import pydotplus

features =pd.concat( [dummy_data], axis=1)
features=features.drop('Churn', axis=1)
dot_data= tree.export_graphviz(decision_tree,out_file=None,
                filled=True, rounded=True,
                special_characters=True, feature_names=features.columns)

graph = pydotplus.graph_from_dot_data(dot_data)  
#Image(graph.create_png())"""

# Depth-limited decision tree using the entropy criterion. random_state is
# fixed so that the reported accuracy (used in the model comparison table) is
# reproducible across runs.
decision_tree = DecisionTreeClassifier(criterion="entropy", max_depth=3,
                                       random_state=0)
decision_tree = decision_tree.fit(X_train, Y_train)
y_pred = decision_tree.predict(X_test)
acc_dt_new = accuracy_score(Y_test, y_pred)
print("Accuracy: {}".format(acc_dt_new))
print(classification_report(Y_test, y_pred))

# Support vector classifier with an RBF kernel.
from sklearn.svm import SVC

svc_cl = SVC(kernel='rbf', random_state=0).fit(X_train, Y_train)
y_pred = svc_cl.predict(X_test)

acc_svm = accuracy_score(Y_test, y_pred)
cnf_matrix = metrics.confusion_matrix(Y_test, y_pred)

# Report accuracy, per-class metrics and the confusion matrix.
print("Accuracy: {}".format(acc_svm))
print()
print(classification_report(Y_test, y_pred))
print(cnf_matrix)


# Gaussian Naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(X_train, Y_train)
y_pred = gaussian.predict(X_test)

acc_nb = accuracy_score(Y_test, y_pred)
cnf_matrix = metrics.confusion_matrix(Y_test, y_pred)

# Report accuracy, per-class metrics and the confusion matrix.
print("Accuracy: {}".format(acc_nb))
print()
print(classification_report(Y_test, y_pred))
print(cnf_matrix)

# Side-by-side accuracy of the fitted models, best first. The decision-tree
# entry uses the depth-limited tree's accuracy (acc_dt_new).
accuracy_by_model = [
    ('Decision Tree', acc_dt_new),
    ('Logistic Regression', acc_lg),
    ('Support Vector Machine', acc_svm),
    ('Naive Bayes', acc_nb),
]
models_acc = pd.DataFrame(accuracy_by_model, columns=['Models', 'Accuracy'])

ranked_models = models_acc.sort_values(by='Accuracy', ascending=False)
print(ranked_models)
