In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import missingno
from scipy import stats
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import pickle
import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.shape

(7043, 21)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
df.drop("customerID",axis=1,inplace=True)

In [8]:
for i in df.columns:
    print(i,":",df[i].unique())
    print()

gender : ['Female' 'Male']

SeniorCitizen : [0 1]

Partner : ['Yes' 'No']

Dependents : ['No' 'Yes']

tenure : [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]

PhoneService : ['No' 'Yes']

MultipleLines : ['No phone service' 'No' 'Yes']

InternetService : ['DSL' 'Fiber optic' 'No']

OnlineSecurity : ['No' 'Yes' 'No internet service']

OnlineBackup : ['Yes' 'No' 'No internet service']

DeviceProtection : ['No' 'Yes' 'No internet service']

TechSupport : ['No' 'Yes' 'No internet service']

StreamingTV : ['No' 'Yes' 'No internet service']

StreamingMovies : ['No' 'Yes' 'No internet service']

Contract : ['Month-to-month' 'One year' 'Two year']

PaperlessBilling : ['Yes' 'No']

PaymentMethod : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']

MonthlyCharges : [29.85 56.95 53.

In [9]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print(f"Missing values in TotalCharges: {df['TotalCharges'].isnull().sum()}")

Missing values in TotalCharges: 11


In [10]:
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

In [11]:
X = df.drop("Churn",axis = 1)
y = df["Churn"]

In [12]:
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print()
print(f"Target distribution:{y.value_counts()}")

Features shape: (7043, 19)
Target shape: (7043,)

Target distribution:Churn
No     5174
Yes    1869
Name: count, dtype: int64


In [13]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [14]:
X[["OnlineSecurity",'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV','StreamingMovies']] = X[["OnlineSecurity",'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV','StreamingMovies']].replace('No internet service',"No")

In [15]:
X["MultipleLines"].replace('No phone service',"No",inplace=True)

In [16]:
ohe_cols = ["gender", "PaymentMethod"] + [i for i in X.columns if "Yes" in X[i].unique()]

In [17]:
ohe = OneHotEncoder(drop="first",sparse_output=False)
ohe_df = pd.DataFrame(ohe.fit_transform(X[ohe_cols]),columns=ohe.get_feature_names_out())

In [18]:
ord_inet = OrdinalEncoder(categories=[['No','DSL', 'Fiber optic']]) 
X["InternetService"] = ord_inet.fit_transform(X[["InternetService"]])

In [19]:
ord_contract = OrdinalEncoder(categories=[['Month-to-month', 'One year', 'Two year']]) 
X["Contract"]=ord_contract.fit_transform(X[["Contract"]])

In [20]:
X = pd.concat([X.drop(ohe_cols, axis=1).reset_index(drop=True),ohe_df.reset_index(drop=True)], axis=1)

In [21]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=42)

In [23]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
from sklearn.model_selection import KFold

In [24]:
knn = KNeighborsClassifier(n_neighbors=19)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 80.13


In [29]:
kfold = KFold(n_splits=5,shuffle=False)

scores =[]

for train_id, validation_id in kfold.split(X_train):
    knn.fit(X_train[train_id], y_train[train_id])
    preds = knn.predict(X_train[validation_id])
    scores.append(accuracy_score(y_train[validation_id], preds))

print("KFold Accuracies:", scores)
print(f"Average Accuracy: {np.mean(scores)*100:.2f}%")

KFold Accuracies: [0.7799467613132209, 0.7923691215616682, 0.771960958296362, 0.7630878438331854, 0.7761989342806395]
Average Accuracy: 77.67%


In [25]:
X.columns

Index(['SeniorCitizen', 'tenure', 'InternetService', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'gender_Male',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_Yes', 'OnlineSecurity_Yes', 'OnlineBackup_Yes',
       'DeviceProtection_Yes', 'TechSupport_Yes', 'StreamingTV_Yes',
       'StreamingMovies_Yes', 'PaperlessBilling_Yes'],
      dtype='object')

Accuracy: 80.13


In [26]:
X_train.shape

(5634, 21)

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

In [29]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 74.38


In [30]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [31]:
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(dt,

#                    filled=True)

In [32]:
from sklearn.svm import SVC

In [33]:
svc = SVC(kernel = "linear")
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)

In [34]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 82.04


In [35]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [36]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 82.19


In [37]:
print(confusion_matrix(y_test,y_pred))

[[936 100]
 [151 222]]


In [38]:
((936+222)/(936+222+100+151))*100

82.18594748048261

In [39]:
# Create Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=1000,          # Number of trees
    max_depth=10,              # Maximum depth
    max_features='sqrt',       # Features per split
    min_samples_split=7,       # Min samples to split
    min_samples_leaf=1,        # Min samples in leaf
    bootstrap=True,            # Use bootstrap sampling
    oob_score=True,            # Calculate OOB error
    random_state=42,
    n_jobs=-1                  # Use all processors
)

rf_model.fit(X_train,y_train)
y_pred = rf_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 81.26


In [40]:
ada_model = AdaBoostClassifier()
ada_model.fit(X_train,y_train)
y_pred = ada_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 81.19


In [41]:
gb_model = AdaBoostClassifier()
gb_model.fit(X_train,y_train)
y_pred = gb_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}")

Accuracy: 81.19


In [44]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = []

for train_idx, test_idx in kf.split(X_train):
    lr.fit(X_train[train_idx], y_train[train_idx])
    preds = lr.predict(X_train[test_idx])
    scores.append(accuracy_score(y_train[test_idx], preds))

In [45]:
print("KFold Accuracies:", scores)
print("Mean Accuracy:", np.mean(scores))

KFold Accuracies: [0.8047914818101154, 0.7994676131322094, 0.7897071872227152, 0.7923691215616682, 0.8117229129662522]
Mean Accuracy: 0.799611663338592


In [46]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = DecisionTreeClassifier()
scores = []

for train_idx, test_idx in skf.split(X_train, y_train):
    model.fit(X_train[train_idx], y_train[train_idx])
    preds = model.predict(X_train[test_idx])
    scores.append(accuracy_score(y_train[test_idx], preds))

print("StratifiedKFold Accuracies:", scores)


StratifiedKFold Accuracies: [0.712511091393079, 0.7329192546583851, 0.7391304347826086, 0.7417923691215617, 0.7175843694493783]


In [47]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
model = DecisionTreeClassifier()
scores = []

for train_idx, test_idx in loo.split(X_train):
    model.fit(X_train[train_idx], y_train[train_idx])
    preds = model.predict(X_train[test_idx])
    scores.append(accuracy_score(y_train[test_idx], preds))

print("LOOCV Mean Accuracy:", np.mean(scores))


LOOCV Mean Accuracy: 0.7243521476748314


In [48]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
model = DecisionTreeClassifier()
scores = []

for train_idx, test_idx in ss.split(X_train):
    model.fit(X_train[train_idx], y_train[train_idx])
    preds = model.predict(X_train[test_idx])
    scores.append(accuracy_score(y_train[test_idx], preds))

print("ShuffleSplit Accuracies:", scores)

ShuffleSplit Accuracies: [0.7293700088731144, 0.7346938775510204, 0.7417923691215617, 0.7338065661047027, 0.707187222715173]


In [51]:
from sklearn.model_selection import GridSearchCV

params = {
    "max_depth": [2, 3, 4, 5, 10, None],
    "criterion": ["gini", "entropy"]
}

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

Best Params: {'criterion': 'gini', 'max_depth': 5}
Best Score: 0.7875399723562295


In [52]:
grid = GridSearchCV(
    estimator=[DecisionTreeClassifier(),LogisticRegression()],
    param_grid=params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ("model", SVC())   
])

params = [
    {
        "model": [LogisticRegression()],
        "model__C": [0.1, 1, 10],
        "model__max_iter":[1000,500,2000,5000]
    },
    {
        "model": [SVC()],
        "model__C": [0.1, 1, 10],
        "model__kernel": ["linear", "rbf"]
    },
    {
        "model": [RandomForestClassifier()],
        "model__n_estimators": [50, 100],
        "model__max_depth": [3, 5, None]
    }
]

grid = GridSearchCV(pipe, params, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Best Model:", grid.best_estimator_)
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model: Pipeline(steps=[('model', LogisticRegression(C=10, max_iter=1000))])
Best Params: {'model': LogisticRegression(), 'model__C': 10, 'model__max_iter': 1000}
Best Score: 0.8010293127985614


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

params = {
    "max_depth": [2, 3, 4, 5, None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10)
}

rand = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(),
    param_distributions=params,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

rand.fit(X, y)

print("Best Params:", rand.best_params_)
print("Best Score:", rand.best_score_)


In [2]:
X.columns

NameError: name 'X' is not defined

In [40]:
pickle.dump(knn, open("knn.pkl", "wb"))
pickle.dump(dt, open("dt.pkl", "wb"))
pickle.dump(svc, open("svc.pkl", "wb"))

pickle.dump(ohe, open("ohe.pkl", "wb"))
pickle.dump(ord_inet, open("ord_internet.pkl", "wb"))
pickle.dump(ord_contract, open("ord_contract.pkl", "wb"))

pickle.dump(scaler, open("scaler.pkl", "wb"))
pickle.dump(label_encoder, open("label_encoder.pkl", "wb"))


In [41]:
pickle.dump(list(X.columns), open("feature_order.pkl", "wb"))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle
import warnings
warnings.filterwarnings("ignore")

# Load data
df = pd.read_csv("Telco-Customer-Churn.csv")

# Fix TotalCharges
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Separate features and target
X = df.drop(["Churn", "customerID"], axis=1)  # FIX: Drop customerID
y = df["Churn"]

# Replace MultipleLines
X["MultipleLines"].replace('No phone service', "No", inplace=True)

# One-Hot Encoding
ohe_cols = ["gender", "PaymentMethod"] + [i for i in X.columns if "Yes" in X[i].unique()]
ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_df = pd.DataFrame(ohe.fit_transform(X[ohe_cols]), columns=ohe.get_feature_names_out())

# Ordinal Encoding
ord_inet = OrdinalEncoder(categories=[['No', 'DSL', 'Fiber optic']]) 
X["InternetService"] = ord_inet.fit_transform(X[["InternetService"]])

ord_contract = OrdinalEncoder(categories=[['Month-to-month', 'One year', 'Two year']]) 
X["Contract"] = ord_contract.fit_transform(X[["Contract"]])

# Concatenate
X = pd.concat([X.drop(ohe_cols, axis=1).reset_index(drop=True), ohe_df.reset_index(drop=True)], axis=1)

# Encode target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Scale features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# GridSearchCV
pipe = Pipeline([("model", SVC())])

params = [
    {
        "model": [LogisticRegression()],
        "model__C": [0.1, 1, 10],
        "model__max_iter": [1000, 500, 2000, 5000]
    },
    {
        "model": [SVC(probability=True)],  # FIX: Added probability=True for predict_proba
        "model__C": [0.1, 1, 10],
        "model__kernel": ["linear", "rbf"]
    },
    {
        "model": [RandomForestClassifier()],
        "model__n_estimators": [50, 100],
        "model__max_depth": [3, 5, None]
    }
]

grid = GridSearchCV(pipe, params, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Best Model:", grid.best_estimator_)
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

# Test accuracy
y_pred = grid.predict(X_test)
from sklearn.metrics import accuracy_score
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# FIX: Pickle the correct objects
pickle.dump(grid.best_estimator_, open("best_model.pkl", "wb"))
pickle.dump(ohe, open("ohe.pkl", "wb"))
pickle.dump(ord_inet, open("ord_internet.pkl", "wb"))
pickle.dump(ord_contract, open("ord_contract.pkl", "wb"))
pickle.dump(scaler, open("scaler.pkl", "wb"))
pickle.dump(label_encoder, open("label_encoder.pkl", "wb"))
pickle.dump(list(X.columns), open("feature_order.pkl", "wb"))

print("\nAll models and preprocessing objects saved successfully!")

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
import streamlit
!streamlit run app.py