In [1]:
!pip install catboost
!pip install xgboost
!pip install lightgbm

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, precision_score,recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier



from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv('/content/Telco_Customer_Churn.csv')
data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
df = data[['gender',	'tenure',	'InternetService','Contract','MonthlyCharges','TotalCharges','Churn']]

In [6]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [7]:
y = LabelEncoder().fit_transform(y)
y

array([0, 0, 1, ..., 0, 1, 0])

In [8]:
X.head()

Unnamed: 0,gender,tenure,InternetService,Contract,MonthlyCharges,TotalCharges
0,Female,1,DSL,Month-to-month,29.85,29.85
1,Male,34,DSL,One year,56.95,1889.5
2,Male,2,DSL,Month-to-month,53.85,108.15
3,Male,45,DSL,One year,42.3,1840.75
4,Female,2,Fiber optic,Month-to-month,70.7,151.65


In [9]:
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include="object").columns
print(numeric_features)
print(categorical_features)

Index(['tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')
Index(['gender', 'InternetService', 'Contract'], dtype='object')


In [10]:
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include="object").columns.tolist()

num_pipline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipline = Pipeline(steps=[
    ("encoder", OrdinalEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipline, numeric_features),
    ("cat", cat_pipline, categorical_features)
])

In [11]:
preprocessor

In [12]:
X_trs = preprocessor.fit_transform(X)


In [13]:
X_trs.shape

(7043, 6)

In [None]:
import pickle
pickle.dump(preprocessor, open('preprocessor7.pkl', 'wb'))

In [14]:
smt = SMOTEENN()
X_resampled, y_resampled = smt.fit_resample(X_trs, y)
X_resampled.shape, y_resampled.shape

((6154, 6), (6154,))

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X_resampled, y_resampled ,test_size=0.2,random_state = 42)

In [16]:
model1 = KNeighborsClassifier()

param_grids = {
   'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9]
    },}

In [17]:
gs = GridSearchCV(model1, param_grids['KNeighborsClassifier'], cv=3)
gs.fit(X_train,y_train)
model1.set_params(**gs.best_params_)
model1.fit(X_train,y_train)

In [18]:
y_pred = model1.predict(X_test)

In [22]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9861900893582454


In [None]:
import pickle
pickle.dump(model1, open('model8.pkl', 'wb'))