#Setup

In [60]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

#Dataset

In [42]:
churn_dataset = pd.read_csv('https://raw.githubusercontent.com/sujin-lifology/customer-churn/main/telecom_customer_churn.csv')

churn_DatFrame = pd.DataFrame(churn_dataset)

# Information about the dataset

### No of Rows and Columns in the dataset

In [43]:
churn_dataset.shape

(7043, 38)

### Having a look at the first view rows of the dataset

In [44]:
churn_dataset.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


### Statistical Properties of our dataset

In [45]:
churn_dataset.describe()

Unnamed: 0,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,6361.0,5517.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,46.509726,0.468692,93486.070567,36.197455,-119.756684,1.951867,32.386767,25.420517,26.189958,63.596131,2280.381264,1.962182,6.860713,749.099262,3034.379056
std,16.750352,0.962802,1856.767505,2.468929,2.154425,3.001199,24.542061,14.200374,19.586585,31.204743,2266.220462,7.902614,25.104978,846.660055,2865.204542
min,19.0,0.0,90001.0,32.555828,-124.301372,0.0,1.0,1.01,2.0,-10.0,18.8,0.0,0.0,0.0,21.36
25%,32.0,0.0,92101.0,33.990646,-121.78809,0.0,9.0,13.05,13.0,30.4,400.15,0.0,0.0,70.545,605.61
50%,46.0,0.0,93518.0,36.205465,-119.595293,0.0,29.0,25.69,21.0,70.05,1394.55,0.0,0.0,401.44,2108.64
75%,60.0,0.0,95329.0,38.161321,-117.969795,3.0,55.0,37.68,30.0,89.75,3786.6,0.0,0.0,1191.1,4801.145
max,80.0,9.0,96150.0,41.962127,-114.192901,11.0,72.0,49.99,85.0,118.75,8684.8,49.79,150.0,3564.72,11979.34


# Data Pre-Processing

### Dropping Unwanted Row

In [46]:
churn_dataset = churn_dataset[churn_dataset['Customer Status'] != 'Joined']

### Dropping Unwanted Columns

In [47]:
churn_dataset = churn_dataset.drop('Latitude', axis=1)
churn_dataset = churn_dataset.drop('Longitude', axis=1)
churn_dataset = churn_dataset.drop('City', axis=1)
churn_dataset = churn_dataset.drop('Offer', axis=1)
churn_dataset = churn_dataset.drop('Avg Monthly Long Distance Charges', axis=1)
churn_dataset = churn_dataset.drop('Online Security', axis=1)
churn_dataset = churn_dataset.drop('Online Backup', axis=1)
churn_dataset = churn_dataset.drop('Contract', axis=1)
churn_dataset = churn_dataset.drop('Internet Type', axis=1)
churn_dataset = churn_dataset.drop('Payment Method', axis=1)
churn_dataset = churn_dataset.drop('Total Refunds', axis=1)
churn_dataset = churn_dataset.drop('Total Extra Data Charges', axis=1)
churn_dataset = churn_dataset.drop('Churn Category', axis=1)
churn_dataset = churn_dataset.drop('Churn Reason', axis=1)
churn_dataset = churn_dataset.drop('Customer ID', axis=1)
churn_dataset = churn_dataset.drop('Gender', axis=1)

### Changing some values to Numerical Values

In [48]:
churn_dataset['Customer Status'] = churn_dataset['Customer Status'].replace({'Churned': 1, 'Stayed': 0})
churn_dataset['Married'] = churn_dataset['Married'].map({'Yes': 1, 'No': 0})
churn_dataset['Phone Service'] = churn_dataset['Phone Service'].map({'Yes': 1, 'No': 0})
churn_dataset['Multiple Lines'] = churn_dataset['Multiple Lines'].map({'Yes': 1, 'No': 0})
churn_dataset['Internet Service'] = churn_dataset['Internet Service'].map({'Yes': 1, 'No': 0})
churn_dataset['Device Protection Plan'] = churn_dataset['Device Protection Plan'].map({'Yes': 1, 'No': 0})
churn_dataset['Premium Tech Support'] = churn_dataset['Premium Tech Support'].map({'Yes': 1, 'No': 0})
churn_dataset['Streaming TV'] = churn_dataset['Streaming TV'].map({'Yes': 1, 'No': 0})
churn_dataset['Streaming Movies'] = churn_dataset['Streaming Movies'].map({'Yes': 1, 'No': 0})
churn_dataset['Streaming Music'] = churn_dataset['Streaming Music'].map({'Yes': 1, 'No': 0})
churn_dataset['Unlimited Data'] = churn_dataset['Unlimited Data'].map({'Yes': 1, 'No': 0})
churn_dataset['Paperless Billing'] = churn_dataset['Paperless Billing'].map({'Yes': 1, 'No': 0})

### Filling Missing values

In [49]:
columns_to_fill = ['Multiple Lines', 'Avg Monthly GB Download', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data']

for col in columns_to_fill:
    if col == 'Avg Monthly GB Download':
        churn_dataset[col] = churn_dataset[col].fillna(churn_dataset[col].std())
    else:
        churn_dataset[col] = churn_dataset[col].fillna(churn_dataset[col].median())


In [50]:
churn_dataset

Unnamed: 0,Age,Married,Number of Dependents,Zip Code,Number of Referrals,Tenure in Months,Phone Service,Multiple Lines,Internet Service,Avg Monthly GB Download,...,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Paperless Billing,Monthly Charge,Total Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,37,1,0,93225,2,9,1,0.0,1,16.000000,...,1.0,0.0,0.0,1.0,1,65.60,593.30,381.51,974.81,0
1,46,0,0,91206,0,9,1,1.0,1,10.000000,...,0.0,1.0,1.0,0.0,0,-4.00,542.40,96.21,610.28,0
2,50,0,0,92627,0,4,1,0.0,1,30.000000,...,0.0,0.0,0.0,1.0,1,73.90,280.85,134.60,415.45,1
3,78,1,0,94553,1,13,1,0.0,1,4.000000,...,1.0,1.0,0.0,1.0,1,98.00,1237.85,361.66,1599.51,1
4,75,1,0,93010,3,3,1,0.0,1,11.000000,...,1.0,0.0,0.0,1.0,1,83.90,267.40,22.14,289.54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,36,0,0,92028,0,4,1,0.0,0,19.565724,...,1.0,1.0,0.0,1.0,0,20.95,85.50,8.04,93.54,1
7038,20,0,0,91941,0,13,1,0.0,1,59.000000,...,0.0,0.0,1.0,1.0,0,55.15,742.90,606.84,1349.74,0
7039,40,1,0,95367,1,22,1,1.0,1,17.000000,...,0.0,1.0,1.0,1.0,1,85.10,1873.70,356.40,2230.10,1
7041,21,1,0,92075,5,67,1,0.0,1,58.000000,...,0.0,1.0,1.0,1.0,0,67.85,4627.65,142.04,4769.69,0


## Now Over Dataset looks good now We will train our dataset

# Training Dataset

### Splitting Input and output

In [51]:
X_churn_data = churn_dataset.drop(columns = 'Customer Status', axis = 1)
Y_churn_data = churn_dataset['Customer Status']

### Splitting data for testing and training

In [53]:
X_training,X_testing,Y_training,Y_testing =  train_test_split(X_churn_data,Y_churn_data,test_size = 0.2, stratify=Y_churn_data, random_state=2)

### Applying Standardization

In [55]:
scaler = StandardScaler()

scaler.fit(X_training)

X_train_scaled = scaler.transform(X_training)
X_test_scaled = scaler.transform(X_testing)

### Training Model

In [57]:
classifier = svm.SVC(kernel='linear')

#training SVM Classifier
classifier.fit(X_training, Y_training)

# checking accuracy score for churn training data
X_churn_training_prediction = classifier.predict(X_training)
training_churn_data_accuracy = accuracy_score(X_churn_training_prediction, Y_training)

print('Accuracy score for traininig churn data : ', training_churn_data_accuracy)

# checking accuracy score for churn testing data
X_churn_testing_prediction = classifier.predict(X_testing)
testing_churn_data_accuracy = accuracy_score(X_churn_testing_prediction, Y_testing)

print('Accuracy score for testing churn data : ', testing_churn_data_accuracy)

Accuracy score for traininig churn data :  0.8283058243217606
Accuracy score for testing churn data :  0.8186646433990895


### Evaluation Matrix

In [62]:
train_precision = precision_score(Y_training, X_churn_training_prediction)
train_recall = recall_score(Y_training, X_churn_training_prediction)
train_f1_score = f1_score(Y_training, X_churn_training_prediction)
train_conf_matrix = confusion_matrix(Y_training, X_churn_training_prediction)

# Calculate evaluation metrics for testing data
test_precision = precision_score(Y_testing, X_churn_testing_prediction)
test_recall = recall_score(Y_testing, X_churn_testing_prediction)
test_f1_score = f1_score(Y_testing, X_churn_testing_prediction)
test_conf_matrix = confusion_matrix(Y_testing, X_churn_testing_prediction)

### Evaluation Matrix Results

In [63]:
print("Evaluation metrics for training data:")
print("Precision:", train_precision)
print("Recall:", train_recall)
print("F1 Score:", train_f1_score)
print("Confusion Matrix:\n", train_conf_matrix)

print("\nEvaluation metrics for testing data:")
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1 Score:", test_f1_score)
print("Confusion Matrix:\n", test_conf_matrix)

Evaluation metrics for training data:
Precision: 0.7017783857729138
Recall: 0.6862876254180602
F1 Score: 0.6939465674670274
Confusion Matrix:
 [[3340  436]
 [ 469 1026]]

Evaluation metrics for testing data:
Precision: 0.6839237057220708
Recall: 0.6711229946524064
F1 Score: 0.6774628879892037
Confusion Matrix:
 [[828 116]
 [123 251]]
