# **Telecom Customer Churn Prediction**

In [1]:
# import library
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# read dataset
# The CSV location can be overridden with the CHURN_DATA_PATH environment
# variable so the notebook can run on machines other than the author's.
# The original absolute path is kept as the fallback, so existing behaviour
# is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    r'C:\Users\asus\OneDrive\Desktop\Completed Final Project\Completed Final Project\cleaned_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the loaded frame to sanity-check columns and values.
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,female,0,Yes,No,1.0,No,No,dsl,No,...,No,No,No,No,Monthly,Yes,Manual,29.85,29.85,No
1,5575-GNVDE,male,0,No,No,34.0,Yes,No,dsl,Yes,...,Yes,No,No,No,One year,No,Manual,56.95,1889.5,No
2,3668-QPYBK,male,0,No,No,2.0,Yes,No,dsl,Yes,...,No,No,No,No,Monthly,Yes,Manual,53.85,108.15,Yes
3,7795-CFOCW,male,0,No,No,45.0,No,No,dsl,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,female,0,No,No,2.0,Yes,No,fiber optic,No,...,No,No,No,No,Monthly,Yes,Manual,70.7,151.65,Yes


In [4]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   seniorcitizen     7043 non-null   int64  
 3   partner           7043 non-null   object 
 4   dependents        7043 non-null   object 
 5   tenure            7043 non-null   float64
 6   phoneservice      7043 non-null   object 
 7   multiplelines     7043 non-null   object 
 8   internetservice   7043 non-null   object 
 9   onlinesecurity    7043 non-null   object 
 10  onlinebackup      7043 non-null   object 
 11  deviceprotection  7043 non-null   object 
 12  techsupport       7043 non-null   object 
 13  streamingtv       7043 non-null   object 
 14  streamingmovies   7043 non-null   object 
 15  contract          7043 non-null   object 
 16  paperlessbilling  7043 non-null   object 


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
count,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.365483,64.761692,2281.916928
std,0.368612,24.556766,30.090047,2265.270398
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,402.225
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3786.6
max,1.0,72.0,118.75,8684.8


In [6]:
# Number of missing values per column.
df.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [7]:
# Number of fully duplicated rows.
df.duplicated().sum()

np.int64(0)

In [8]:
# List the column labels available before feature selection.
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [9]:
# Remove the customer identifier column so it is not used as a model feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df = df.drop(columns=['customerid'], errors='ignore')

In [10]:
# Class balance of the target column.
df['churn'].value_counts()

churn
No     5174
Yes    1869
Name: count, dtype: int64

### ***Encoding***


#### ***Label encoding***


#### ***Split the data into features and target***


In [11]:
# Separate the predictors (X) from the target column (y).
X = df.drop(columns=['churn'])
y = df['churn']

#### ***Split the data into train and test sets***

In [12]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# NOTE(review): the split is not stratified although the classes are imbalanced
# (5174 "No" vs 1869 "Yes"); consider stratify=y — confirm before changing,
# since it alters every downstream number.

#### ***Encoding***

In [13]:
# Integer-encode every object-dtype feature column.
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, kept so the same mapping can be reused later.
label_encoders = {}

categorical_cols = X_train.select_dtypes(include='object').columns
for col in categorical_cols:
    # Learn the category -> integer mapping from the training split only.
    le = LabelEncoder().fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    # LabelEncoder.transform raises ValueError for categories not seen during fit.
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

### ***Standard scaling***


In [14]:
# Standardize the features using statistics learned from the training split only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
# X_train / X_test are rebound to the scaler's output from here on; both
# right-hand sides are evaluated before either name is reassigned.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [15]:
# Class counts of the full target before any resampling.
y.value_counts()

churn
No     5174
Yes    1869
Name: count, dtype: int64

#### ***Handle imbalanced data***


In [16]:
# Configure the SMOTE oversampler with a fixed seed so resampling is repeatable.
from imblearn.over_sampling import SMOTE
smote = SMOTE(
    k_neighbors=5,
    random_state=42,
)

In [17]:
# Oversample the training split only; X_test / y_test are not passed to SMOTE.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [18]:
# Class counts in the training split before oversampling.
y_train.value_counts()

churn
No     4138
Yes    1496
Name: count, dtype: int64

In [19]:
# Class counts in the training split after SMOTE resampling.
y_train_smote.value_counts()

churn
No     4138
Yes    4138
Name: count, dtype: int64

In [20]:
# Re-display the class counts of the full target `y`; it is not modified by the resampling call above.
y.value_counts()

churn
No     5174
Yes    1869
Name: count, dtype: int64

### ***Modeling***


In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Logistic regression classifier with a fixed seed; every other hyperparameter is left at its library default.
model = LogisticRegression(random_state=42)

In [23]:
# train the model on the SMOTE-resampled training data
model.fit(X_train_smote, y_train_smote)

In [24]:
# predict churn labels for the held-out test set (X_test was transformed with the scaler fitted on the training split)
y_pred = model.predict(X_test)

In [25]:
# Classification accuracy (fraction of correct predictions) between y_test and y_pred.
# This is not an R2 score: accuracy_score compares predicted class labels with the true ones.
# NOTE(review): the test set is not resampled, so it is presumably imbalanced like
# the full data; consider also reporting precision/recall/F1 — confirm with the author.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7565649396735273


In [26]:
# use joblib to save the fitted classifier to disk (written relative to the current working directory)
# NOTE(review): only `model` is persisted here. Predictions on new data also need
# the fitted `scaler` and `label_encoders` from the preprocessing cells — confirm
# whether they are saved elsewhere.
import joblib
joblib.dump(model, "logistic_regression_model.pkl")
print("Model saved as logistic_regression_model.pkl")

Model saved as logistic_regression_model.pkl
