In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import seaborn as sns #for visualisation
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Run `ls` on ../input, decode its output as UTF-8 and print it, to show which
# dataset folders are available.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

### Importing the data

In [None]:
# Load the telecom churn CSV into `df`, the DataFrame the following cells work on.
df=pd.read_csv("../input/telecom-churn/Churn.csv")

###                                    Understanding the data

In [None]:
df.head()

In [None]:
print ("\nFeatures : \n" ,df.columns.tolist())  #listing all the features

In [None]:
df.describe() 

The summary above is of limited use because most of the columns are non-numeric.
However, for tenure and monthly charges the median value (50%) may be useful for filling in missing values.

##### Calculating the columnwise percentage of null values

In [None]:
# Percentage of missing entries per column, as a two-column table
# (column_name, percentage).
Null_val = (
    df.isna()
    .mean()
    .mul(100)
    .rename_axis("column_name")
    .reset_index(name="percentage")
)

In [None]:
Null_val

We observe that there are no null values in any column, so we will simply proceed.

In [None]:
df.info()   

### Data Manipulation (Part 1)

We can see that most of the columns have dtype "object", so they need to be mapped to numeric values.

Note that there are 21 columns and 7043 rows. TotalCharges contains numeric values but is read in as object, so we need to convert that too.


In [None]:
# errors='coerce' turns every TotalCharges entry that cannot be parsed as a number
# into NaN, so this column may contain missing values after the conversion.
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')   #changing to numeric

In [None]:
print ("\nUnique values for each column:\n",df.nunique())

The output above shows the number of unique values in each column.
For example, every customerID is unique, so that column has 7043 distinct values (the same as the total number of rows), while gender (treated as binary here) has 2 unique values, Male and Female, and so on.

In [None]:
# Drop customerID because it has nothing to do with the analysis of Churn.
# Reassigning (rather than inplace=True) with errors="ignore" keeps this cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
df = df.drop(columns=["customerID"], errors="ignore")

In [None]:
df.head()

CHANGING NON NUMERIC COLUMNS TO NUMERIC


In [None]:
# Encode gender as an integer flag: "Male" -> 1, every other value -> 0.
df["gender"] = (df["gender"] == "Male").astype("int64")

In [None]:
df.head()

CHANGING NON NUMERIC COLUMNS TO NUMERIC

In [None]:
# Map male to 1 and female to 0, case-insensitively.
# Uses a single .loc assignment instead of chained indexing (df.gender[mask] = ...),
# which can write to a temporary copy. Comparing on a lower-cased string view means
# "Male"/"male" both match, and an already-numeric column is simply left untouched.
gender_as_text = df["gender"].astype(str).str.lower()
df.loc[gender_as_text == "male", "gender"] = 1
df.loc[gender_as_text == "female", "gender"] = 0


We realise that the columns Partner, Dependents, PhoneService, MultipleLines, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, PaperlessBilling and Churn share the same kind of values (i.e., "Yes" or "No", plus one other value in some columns), so we can map them together to 1/0/-1.

In [None]:
# Columns whose values are "Yes"/"No" (some with one additional category).
change_to_num = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

# "Yes" -> 1, "No" -> 0, anything else -> -1.
yes_no_codes = {"Yes": 1, "No": 0}
for column in change_to_num:
    df[column] = df[column].map(yes_no_codes).fillna(-1).astype("int64")

df.head()

Columns such as InternetService, Contract and PaymentMethod are more useful for visualisation in their original form, because numeric codes would be confusing to read on graphs.
So they will be mapped after some visualisation.

### Data Visualization

In [None]:

# Bar chart of how many customers fall into each Churn value.
sns.countplot(data=df, x="Churn")


In [None]:
# Pairwise plots of the three numeric features; `hue` colours the points by
# their Churn value so the two groups can be compared.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
sns.pairplot(data=df, vars=numeric_features, hue="Churn")


Observations from the above Plots:
1. Tenure is inversely related to Churn
2. Monthly charges are directly proportional to churn

In [None]:
# Mean Churn value (i.e. churn rate) for each contract type.
v = sns.catplot(data=df, kind="bar", x="Contract", y="Churn")
v.set_ylabels("Probability of Churn to be 1")

OBSERVATIONS:
1. Customers are more likely to stop using service when the contract is month to month
2. Least when a two year contract is made

In [None]:
# Mean Churn value (i.e. churn rate) for each internet-service type.
u = sns.catplot(data=df, kind="bar", x="InternetService", y="Churn")
u.set_ylabels("Probability of churn to be 1")

OBSERVATIONS:
1. Maximum probability of losing a customer is when they are using Fiber optic IS
2. Lowest is when they're using none

In [None]:
# Mean Churn value for each encoded TechSupport category.
u = sns.catplot(data=df, kind="bar", x="TechSupport", y="Churn")
u.set_ylabels("Probability of churn")

Observation: In case of no tech support, customers have high probability to churn

In [None]:
# Mean Churn value for each encoded gender.
u = sns.catplot(data=df, kind="bar", x="gender", y="Churn")
u.set_ylabels("Probabilty for churn to be 1")

Observation: Gender doesn't play a significant role in analysing whether a customer would churn

In [None]:
# Mean Churn value for each SeniorCitizen value.
u = sns.catplot(data=df, kind="bar", x="SeniorCitizen", y="Churn")
u.set_ylabels("Churn Probability")

Observation: Younger people are likely to turn into Churn

In [None]:
# Mean Churn value for each encoded OnlineSecurity category.
u = sns.catplot(data=df, kind="bar", x="OnlineSecurity", y="Churn")
u.set_ylabels("Churn probability")

Observation: Online security is directly proportional to churn probability

In [None]:
# Mean Churn value for each encoded DeviceProtection category.
u = sns.catplot(data=df, kind="bar", x="DeviceProtection", y="Churn")
u.set_ylabels("Churning Probability")

Observation: Less device protection may lead to more churning

In [None]:
# Mean Churn value for each encoded PaperlessBilling value.
u = sns.catplot(data=df, kind="bar", x="PaperlessBilling", y="Churn")
u.set_ylabels("Churn Probability")

#### Conclusion: The above charts helped in analysing how these features affect the probability of churning.
The gender feature had little impact: as observed above, it does not play a significant role in whether a customer churns.


### Data Manipulation (part 2)

In [None]:
#Now we will map the remaining columns (InternetService, Contract, PaymentMethod)

In [None]:
# One-hot encode the remaining text columns (InternetService, Contract, PaymentMethod).
# dtype=int forces 0/1 integer indicator columns: newer pandas versions return
# boolean dummies by default, and booleans cannot be subtracted in the min-max
# scaling step that follows.
df = pd.get_dummies(data=df, dtype=int)
df.head()


In [None]:
p=df.corr() #Finding the correlation between the columns so that I can remove one of two highly correlated column 
#Usually the values ranging from +/-0.5 to +/-1 are said to be highly correlated, so we will look for it

In [None]:
p

since there are so many values, we will sort them to check 

In [None]:
p['Churn'].sort_values() 

In [None]:
#No value is highly correlated, so we are good to go

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index instead of inserting
# it as a new "index" column, which would otherwise be scaled and fed to the
# models as a (meaningless) row-number feature.
df = df.reset_index(drop=True)

In [None]:
# Target vector: the Churn column as a NumPy array.
y = df["Churn"].to_numpy()

In [None]:
# Feature table: every column except the Churn target.
df1 = df.drop(columns=["Churn"])

In [None]:
# Min-max scale every feature column to the [0, 1] range.
# The minimum and maximum are taken per column with DataFrame.min/max(axis=0):
# np.min(df1)/np.max(df1) collapse the whole frame to one scalar on recent pandas
# versions, which would scale all columns by the same global range.
# astype(float) makes the arithmetic valid for integer and boolean columns alike.
features = df1.astype(float)
col_min = features.min(axis=0)
col_max = features.max(axis=0)
x = (features - col_min) / (col_max - col_min)

The steps below were performed to avoid an error during implementation of models
"Input contains NaN, infinity or a value too large for dtype('float32')"


In [None]:
pd.isnull(x).sum() > 0  #finding the column where lies any null value

In [None]:
# x.mean() is computed per column, so each missing entry is replaced by the mean
# of its own column.
x=x.fillna(x.mean()) #replacing the null value with mean

In [None]:
np.any(np.isnan(x)) #checking whethetr a null value still exists in the dataframe

In [None]:
# Keep only the rows in which every feature is finite.
# The same row mask is applied to y so that features and labels stay aligned;
# filtering x alone would leave y longer than x whenever a row is dropped.
finite_rows = np.isfinite(x).all(axis=1)
x = x[finite_rows]
y = y[np.asarray(finite_rows)]

In [None]:
np.all(np.isfinite(x)) #Checking whether all values are finite

In [None]:
print(x.astype(np.float32)) #to avoid any dtype error, finding the value that exceeds the bounds of a float 32 dtype

In [None]:
# Cast the features to float32 and let np.nan_to_num replace any NaN/inf with
# finite numbers. NOTE(review): the train/test split below is called with `x`,
# not this `X` - confirm which one is intended.
X = np.nan_to_num(x.astype(np.float32)) #bringing value in the bound of float 32 dtype
print(X)

Splitting into train and test dataframes

In [None]:
# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state =1) 
#I've split the data in the ratio 80:20

### Implementing Machine Learning Models

In [None]:
from sklearn.tree import DecisionTreeClassifier         #Decision Tree
# random_state is pinned (as for the SVC and random-forest models) so the fitted
# tree and its reported accuracy are the same on every run.
dt_model = DecisionTreeClassifier(random_state = 1)
dt_model.fit(x_train,y_train)
accuracy_dt = dt_model.score(x_test,y_test)
print("Decision Tree's accuracy:",accuracy_dt)


In [None]:
from sklearn.svm import SVC                             #SVM
# Fit a seeded support-vector classifier and record its accuracy on the test split.
svc_model = SVC(random_state=1).fit(x_train, y_train)
accuracy_svc = svc_model.score(x_test, y_test)
print("Accuracy using SVM :", accuracy_svc)

In [None]:
from sklearn.naive_bayes import GaussianNB              #Naive Bayes
# Fit a Gaussian naive Bayes classifier and record its accuracy on the test split.
nb_model = GaussianNB().fit(x_train, y_train)
accuracy_nb = nb_model.score(x_test, y_test)
print("Accuracy using Naive Bayes :", accuracy_nb)

In [None]:
from sklearn.linear_model import LogisticRegression    #Logistic Regression
# Fit a logistic-regression classifier and record its accuracy on the test split.
lr_model = LogisticRegression().fit(x_train, y_train)
accuracy_lr = lr_model.score(x_test, y_test)
print(" Accuracy using Logistic Regression is:", accuracy_lr)

In [None]:
from sklearn.neighbors import KNeighborsClassifier    #K-Nearest Neighbor
# First KNN attempt with 3 neighbours: fit, predict the test split, report accuracy.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
predicted_y = knn.predict(x_test)
print("KNN accuracy when k=3:", knn.score(x_test, y_test))

In [None]:
# Fit one KNN classifier per k in 1..24, collect each test accuracy in arr1,
# then plot accuracy against k.
k_values = range(1, 25)
arr1 = [
    KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).score(x_test, y_test)
    for k in k_values
]

plt.plot(k_values, arr1)
plt.xlabel("Range")
plt.ylabel("Score")
plt.show()


In [None]:
#KNN gives highest accuracy at k=16

In [None]:
# Final KNN model with 16 neighbours: fit, predict the test split, report accuracy.
knn_model = KNeighborsClassifier(n_neighbors=16).fit(x_train, y_train)
predicted_y = knn_model.predict(x_test)
accuracy_knn = knn_model.score(x_test, y_test)
print("KNN accuracy when K=16:", accuracy_knn)

In [None]:
from sklearn.ensemble import RandomForestClassifier     #Random Forest
# The tree count is kept in one variable and reused in the message, so the printed
# number always matches the model (the message previously said 7 while 5 trees
# were trained).
n_trees_initial = 5
rf_model_initial = RandomForestClassifier(n_estimators = n_trees_initial, random_state = 1)
rf_model_initial.fit(x_train,y_train)
print("Random Forest accuracy for", n_trees_initial, "trees is:",rf_model_initial.score(x_test,y_test))


In [None]:
# Fit one seeded random forest per tree count in 1..49, collect each test accuracy
# in arr, then plot accuracy against the number of trees.
tree_counts = range(1, 50)
arr = [
    RandomForestClassifier(n_estimators=n_trees, random_state=1)
    .fit(x_train, y_train)
    .score(x_test, y_test)
    for n_trees in tree_counts
]

plt.plot(tree_counts, arr)
plt.xlabel("Range")
plt.ylabel("Score")
plt.show()

In [None]:
#Accuracy of RF is highest at 35 and 42

In [None]:
# Final random forest with 35 trees (seeded): fit and report test accuracy.
rf_model = RandomForestClassifier(n_estimators=35, random_state=1).fit(x_train, y_train)
accuracy_rf = rf_model.score(x_test, y_test)
print("Random Forest accuracy for 35 trees is:", accuracy_rf)


### Model Evaluation

In [None]:
# Metric helpers used by this cell and by the scoring cells that follow.
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report
# Confusion matrix of the logistic-regression predictions on the test split,
# drawn as an annotated heatmap with integer cell labels (fmt=".0f").
cm_lr = confusion_matrix(y_test,lr_model.predict(x_test)) #for Logistic Regression
f, ax = plt.subplots(figsize = (5,5))
sns.heatmap(cm_lr, annot = True, linewidths = 0.5, color = "blue", fmt = ".0f", ax=ax)
plt.xlabel("y_predicted")
plt.ylabel("y_true")
plt.title("Confusion Matrix of LR")
plt.show()


In [None]:
def print_scores(headline, y_true, y_pred, average="binary"):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Precision, recall and F1 are all computed with the same ``average`` mode, so
    the printed F1 is consistent with the printed precision and recall.
    (Previously F1 alone used ``average='weighted'``.)

    Parameters
    ----------
    headline : str
        Label printed before the scores (e.g. the model name).
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    average : str, optional
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. Defaults to ``"binary"``.

    Returns
    -------
    None
        The scores are only printed.
    """
    print(headline)
    acc_score = accuracy_score(y_true, y_pred)
    print("accuracy: ",acc_score)
    pre_score = precision_score(y_true, y_pred, average=average)
    print("precision: ",pre_score)
    rec_score = recall_score(y_true, y_pred, average=average)
    print("recall: ",rec_score)
    f_score = f1_score(y_true, y_pred, average=average)
    print("f1_score: ",f_score)


In [None]:
# Print the score summary for every fitted model on the test split.
fitted_models = [
    ("Logistic Regression;", lr_model),
    ("SVC;", svc_model),
    ("KNN;", knn_model),
    ("Naive Bayes;", nb_model),
    ("Decision Tree;", dt_model),
    ("Random Forest;", rf_model),
]
for headline, model in fitted_models:
    print_scores(headline, y_test, model.predict(x_test))

In [None]:
# Text report of the per-class metrics for the logistic-regression predictions
# on the test split.
report = classification_report(y_test, lr_model.predict(x_test))  #Report of best performing LR model
print(report)

### Final Conclusions

Tenure is inversely proportional to Churn

Monthly charges is directly proportional to Churn

The RF model performed best when number of trees=35/42

The KNN model performed the best when k=16

Logistic Regression model gave highest accuracy

Decision Tree Performed the worst





##### Some steps to avoid Churn:
1. Engage with customers
2. Providing offers
3. Analysis of risk
4. Providing genuine and better service
5. Customer retention is more important than customer acquisition.