# Data Reading

In [2]:
import pandas as pd

In [3]:
import polars as pl

In [4]:
df = pl.read_csv(r'D:\CHURN PREDICTION FINAL YEAR PROJECT\dataset\02 Churn-Dataset - Copy.csv')

In [5]:
df.head()

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn
str,str,i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,i64,i64,str
"""7590-VHVEG""","""Female""",0,"""Yes""","""No""",1,"""No""","""No phone service""","""DSL""","""No""","""Yes""","""No""","""No""","""No""","""No""","""Month-to-month""","""Yes""","""Electronic check""",29.85,29.85,0,0,"""No"""
"""5575-GNVDE""","""Male""",0,"""No""","""No""",34,"""Yes""","""No""","""DSL""","""Yes""","""No""","""Yes""","""No""","""No""","""No""","""One year""","""No""","""Mailed check""",56.95,1889.5,0,0,"""No"""
"""3668-QPYBK""","""Male""",0,"""No""","""No""",2,"""Yes""","""No""","""DSL""","""Yes""","""Yes""","""No""","""No""","""No""","""No""","""Month-to-month""","""Yes""","""Mailed check""",53.85,108.15,0,0,"""Yes"""
"""7795-CFOCW""","""Male""",0,"""No""","""No""",45,"""No""","""No phone service""","""DSL""","""Yes""","""No""","""Yes""","""Yes""","""No""","""No""","""One year""","""No""","""Bank transfer (automatic)""",42.3,1840.75,0,3,"""No"""
"""9237-HQITU""","""Female""",0,"""No""","""No""",2,"""Yes""","""No""","""Fiber optic""","""No""","""No""","""No""","""No""","""No""","""No""","""Month-to-month""","""Yes""","""Electronic check""",70.7,151.65,0,0,"""Yes"""


# Transformations

# dropping unwanted column

In [6]:
df = df.drop('customerID')

In [7]:
df.schema

Schema([('gender', String),
        ('SeniorCitizen', Int64),
        ('Partner', String),
        ('Dependents', String),
        ('tenure', Int64),
        ('PhoneService', String),
        ('MultipleLines', String),
        ('InternetService', String),
        ('OnlineSecurity', String),
        ('OnlineBackup', String),
        ('DeviceProtection', String),
        ('TechSupport', String),
        ('StreamingTV', String),
        ('StreamingMovies', String),
        ('Contract', String),
        ('PaperlessBilling', String),
        ('PaymentMethod', String),
        ('MonthlyCharges', Float64),
        ('TotalCharges', Float64),
        ('numAdminTickets', Int64),
        ('numTechTickets', Int64),
        ('Churn', String)])

# converting my polars df to pandas df 
#### (better compaitability with sk-learn)

In [8]:
df = df.to_pandas()
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,No,One year,No,Mailed check,56.95,1889.5,0,0,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,0,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,3,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0,0,Yes


# seperating feature & target variables

In [9]:
x = df.drop(['Churn'],axis=1)
y = df['Churn']

In [10]:
x.head(n=1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,numAdminTickets,numTechTickets
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0


In [11]:
y.head(n=3)

0     No
1     No
2    Yes
Name: Churn, dtype: object

# Train test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)

In [14]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4930, 21), (2113, 21), (4930,), (2113,))

# seperating numerical and categorical columns in x 

In [15]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4930 entries, 1695 to 860
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            4930 non-null   object 
 1   SeniorCitizen     4930 non-null   int64  
 2   Partner           4930 non-null   object 
 3   Dependents        4930 non-null   object 
 4   tenure            4930 non-null   int64  
 5   PhoneService      4930 non-null   object 
 6   MultipleLines     4930 non-null   object 
 7   InternetService   4930 non-null   object 
 8   OnlineSecurity    4930 non-null   object 
 9   OnlineBackup      4930 non-null   object 
 10  DeviceProtection  4930 non-null   object 
 11  TechSupport       4930 non-null   object 
 12  StreamingTV       4930 non-null   object 
 13  StreamingMovies   4930 non-null   object 
 14  Contract          4930 non-null   object 
 15  PaperlessBilling  4930 non-null   object 
 16  PaymentMethod     4930 non-null   object 
 17

In [16]:
# Assuming x_train is a pandas DataFrame
num_features = x_train.select_dtypes(include=['number']).columns.tolist()
cat_features = x_train.select_dtypes(exclude=['number']).columns.tolist()

print("Numerical Features:", num_features)
print("Categorical Features:", cat_features)


Numerical Features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'numAdminTickets', 'numTechTickets']
Categorical Features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


# Building Preprocessing Pipelines

# numerical and categorical transformatin

In [17]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [18]:
numerical_transformer = Pipeline(
                                    steps=[
                                            ('imputer', SimpleImputer(strategy='median')),
                                            ('scaler',StandardScaler())

                                          ]
                                )

categorical_transformer = Pipeline(
                                    steps=[
                                            ('scaler', OneHotEncoder(handle_unknown='ignore'))
                                          ]
                                  )

In [19]:
preprocessor = ColumnTransformer(
                                    transformers=[
                                                    ('num', numerical_transformer, num_features),
                                                    ('cat', categorical_transformer, cat_features)
                                                 ]
                               )


In [20]:
preprocessor

# Building Prediction Pipeline along with preprocessing

# Decision Tree Algorithm

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, recall_score, accuracy_score, confusion_matrix

In [22]:
model_dt = Pipeline(
                    steps=[
                            ('preprocessing', preprocessor),
                            
                            ('ml_model', DecisionTreeClassifier(
                                                                criterion = "gini",random_state = 100,
                                                                max_depth=6, min_samples_leaf=8
                                                                )
                            )
                          ]
                   )

In [23]:
model_dt.fit(x_train, y_train)

In [24]:
y_pred = model_dt.predict(x_test)
y_pred

array(['Yes', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [25]:
print(classification_report(y_test, y_pred, labels=["Yes","No"]))

              precision    recall  f1-score   support

         Yes       0.70      0.75      0.73       574
          No       0.90      0.88      0.89      1539

    accuracy                           0.85      2113
   macro avg       0.80      0.82      0.81      2113
weighted avg       0.85      0.85      0.85      2113



# Randomforest algorithms

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, accuracy_score, confusion_matrix

In [33]:
preprocessor

In [34]:
# Create the pipeline with preprocessing and RandomForestClassifier
model_forest = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('ml_model', RandomForestClassifier(
            max_depth=10,
            max_features='sqrt',
            min_samples_leaf=1,
            min_samples_split=5,
            n_estimators=150,
            random_state=42
        ))
    ]
)

In [35]:
model_forest.fit(x_train, y_train)

In [36]:
y_pred = model_forest.predict(x_test)
y_pred

array(['Yes', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [37]:
print(classification_report(y_test, y_pred, labels=["Yes","No"]))

              precision    recall  f1-score   support

         Yes       0.75      0.70      0.73       574
          No       0.89      0.91      0.90      1539

    accuracy                           0.86      2113
   macro avg       0.82      0.81      0.82      2113
weighted avg       0.85      0.86      0.86      2113



## gradient boosting

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, recall_score, accuracy_score, confusion_matrix

In [39]:
model_gboost = Pipeline(
                        steps=[
                                ('preprocessing', preprocessor),
                                ('ml_model', GradientBoostingClassifier())
                              ]
                       )

In [40]:
model_gboost

In [41]:
model_gboost.fit(x_train,y_train)

In [42]:
y_pred = model_gboost.predict(x_test)

y_pred

array(['Yes', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [43]:
print(classification_report(y_test, y_pred, labels=["Yes","No"]))

              precision    recall  f1-score   support

         Yes       0.76      0.74      0.75       574
          No       0.90      0.91      0.91      1539

    accuracy                           0.87      2113
   macro avg       0.83      0.83      0.83      2113
weighted avg       0.86      0.87      0.86      2113



# ada boosting

In [44]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, recall_score, accuracy_score, confusion_matrix

In [45]:
preprocessor

In [46]:
model_adaboost = Pipeline(
                            steps=[
                                    ('preprocessing', preprocessor),
                                    ('ml_model', AdaBoostClassifier())
                                  ]
                       )

model_adaboost

In [47]:
model_adaboost.fit(x_train, y_train)



In [48]:
y_pred = model_adaboost.predict(x_test)

y_pred

array(['Yes', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [49]:
print(classification_report(y_test, y_pred, labels=["Yes","No"]))

              precision    recall  f1-score   support

         Yes       0.74      0.71      0.73       574
          No       0.89      0.91      0.90      1539

    accuracy                           0.86      2113
   macro avg       0.82      0.81      0.81      2113
weighted avg       0.85      0.86      0.85      2113



### Among all the models gradient boost performs well with 87 % accracy

In [50]:
import joblib

In [51]:
from joblib import dump, load

# To save the model:
dump(model_gboost, 'GB.joblib')

# To load the model:
model = load('GB.joblib')


## --------------------------------------------------------------------------------------------------------------

# hyperparameter tunning

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [49]:
preprocessor

In [25]:
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.transform(x_test)

## Random forest

In [None]:


# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Create the model
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search =  GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
    

grid_search.fit(x_train_transformed, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_}")

  return fit_method(estimator, *args, **kwargs)


Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Best Accuracy: 0.8505062643799866


# Gradient Boosting

In [29]:
# Define a smaller and more optimized parameter grid
param_distributions = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

# Initialize the model
gb_model = GradientBoostingClassifier(random_state=42)

# Perform RandomizedSearch with 20 iterations and 5-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_distributions,
    n_iter=20,  # Number of combinations to try
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(x_train_transformed, y_train)

# Display best parameters and accuracy
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Accuracy: {random_search.best_score_:.4f}")

  y = column_or_1d(y, warn=True)


Best Parameters: {'subsample': 1.0, 'n_estimators': 150, 'max_features': 'sqrt', 'max_depth': 7, 'learning_rate': 0.05}
Best Accuracy: 0.8586
