In [1]:
import os
os.chdir("../")
%pwd

'c:\\Users\\rayjohndp\\Desktop\\Projects\\DS_Python_Portfolio\\Telecom_Churn_Project'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    Xtrain_data_path: Path
    Xtest_data_path: Path
    ytrain_data_path: Path
    ytest_data_path: Path
    model_name: Path
    alpha: float
    l1_ratio: float
    target_column: str 

In [3]:
from churnPrediction.constants import *
from churnPrediction.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir = config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            target_column = schema.name
        )

        return model_trainer_config

    

Explore different types of model

In [64]:
import pandas as pd
import os
from churnPrediction import logger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report





In [50]:
X_train = pd.read_csv("artifacts/data_transformation/X_train.csv")
X_test  = pd.read_csv("artifacts/data_transformation/X_test.csv") 
y_train  = pd.read_csv("artifacts/data_transformation/y_train.csv")
y_test  = pd.read_csv("artifacts/data_transformation/y_test.csv")

In [51]:
#Do Logistic Regression as our base model

logistic_model = LogisticRegression()
logistic_model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [52]:
#Accuracy test

predictions = logistic_model.predict(X_test)


In [53]:
accuracy =accuracy_score(y_test,predictions)
accuracy

0.8508856682769727

In [31]:
#Check confusion matrix

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.87      0.82      0.85      1555
         Yes       0.83      0.88      0.85      1550

    accuracy                           0.85      3105
   macro avg       0.85      0.85      0.85      3105
weighted avg       0.85      0.85      0.85      3105



In [27]:
cv = KFold(n_splits = 10, random_state = 720, shuffle=True)
cv_model = LogisticRegression()

scores = cross_val_score(cv_model,X_train, y_train, scoring = 'accuracy',cv = cv,n_jobs =-1 )
mean_score = np.mean(scores)
confidence_interval = np.std(scores) * 2

mean_score, confidence_interval


(0.8508919794246523, 0.01843384139490854)

In [28]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

dtree_prediction = clf.predict(X_test)

dtree_accuracy = accuracy_score(y_test, dtree_prediction)
dtree_accuracy

0.8547504025764895

In [29]:
dtree_cv_model = DecisionTreeClassifier()
scores = cross_val_score(dtree_cv_model, X_train, y_train, scoring = 'accuracy', cv = cv, n_jobs = -1)
mean_score = np.mean(scores)
confidence_interval = np.std(scores) * 2
mean_score, confidence_interval

(0.8459274147456659, 0.03281258286164771)

In [32]:
print(classification_report(y_test, dtree_prediction))

              precision    recall  f1-score   support

          No       0.85      0.86      0.86      1555
         Yes       0.86      0.85      0.85      1550

    accuracy                           0.85      3105
   macro avg       0.85      0.85      0.85      3105
weighted avg       0.85      0.85      0.85      3105



In [39]:
basic_rf = RandomForestClassifier()

basic_rf.fit(X_train,y_train)


rf_prediction = basic_rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_prediction)





print(f'Train Accuracy - :{basic_rf.score(X_train,y_train):.3f}')
print(f'Test Accuracy - :{basic_rf.score(X_test,y_test):.3f}')
print(str(rf_accuracy))

  return fit_method(estimator, *args, **kwargs)


Train Accuracy - :0.999
Test Accuracy - :0.892
0.8921095008051529


In [40]:
scores = cross_val_score(basic_rf, X_train, y_train, scoring = 'accuracy', cv = cv, n_jobs = -1)
mean_score = np.mean(scores)
confidence_interval = np.std(scores) * 2
mean_score, confidence_interval

(0.8927275671556487, 0.014867998775635022)

In [41]:
print(classification_report(y_test, rf_prediction))

              precision    recall  f1-score   support

          No       0.89      0.89      0.89      1555
         Yes       0.89      0.89      0.89      1550

    accuracy                           0.89      3105
   macro avg       0.89      0.89      0.89      3105
weighted avg       0.89      0.89      0.89      3105



In [66]:
#Hyperparameter tuning Random Forest

n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]

#Criterion
criterion = ['entropy','gini']

#Number of features to consider at every split
max_features = ['log2','sqrt']

#Maximum number of levels in tree
max_depth = [2,4]

#Minimum number of samples require to split a node
min_samples_split = [2,5]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]

#Method of selecting samples for training each tree
bootstrap = [True, False]

param_grid = {
    'n_estimators' : n_estimators,
    'criterion':criterion,
    'max_features' : max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'bootstrap' : bootstrap,
}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'criterion': ['entropy', 'gini'], 'max_features': ['log2', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [44]:
rf_Model_hype = RandomForestClassifier()

rf_Grid = GridSearchCV(estimator = rf_Model_hype, param_grid = param_grid, cv = 3, verbose = 2,n_jobs = 4)

In [45]:
rf_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 640 candidates, totalling 1920 fits


  return fit_method(estimator, *args, **kwargs)


In [46]:
print(rf_Grid.best_params_)
print(rf_Grid.best_score_)

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
0.8420530000120072


In [47]:
print(f'Train Accuracy - :{rf_Grid.score(X_train,y_train):.3f}')
print(f'Test Accuracy - :{rf_Grid.score(X_test,y_test):.3f}')

Train Accuracy - :0.830
Test Accuracy - :0.831


In [67]:
Random_rf_Model_hype = RandomForestClassifier()
random_search = RandomizedSearchCV(estimator=Random_rf_Model_hype, param_distributions=param_grid, n_iter=100, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)
# Perform the random search
random_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

# Evaluate the model with the best parameters
y_pred = best_estimator.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best parameters: {best_params}")
print(f"Accuracy with best parameters: {accuracy:.2f}")


Best parameters: {'n_estimators': 17, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'gini', 'bootstrap': True}
Accuracy with best parameters: 0.84


  return fit_method(estimator, *args, **kwargs)


In [54]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)

xgb_prediction = xgb_model.predict(X_test)
xgb_accuracy_score = accuracy_score(y_test,xgb_prediction)
xgb_accuracy_score

0.8962962962962963

In [55]:
scores_xgb = cross_val_score(xgb_model,X_train, y_train, scoring = 'accuracy',cv = cv,n_jobs =-1 )
mean_score_xgb = np.mean(scores_xgb)
confidence_interval_xgb = np.std(scores_xgb) * 2

mean_score_xgb, confidence_interval_xgb

(0.895488283482568, 0.01841876328000079)

In [56]:
xg_params = {
    'learing_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
    'max_depth': [3,4,5,6,8,10,12,15],
    'min_child_weight': [1,3,5,7],
    'gamma' : [0.0,0.1,0.2,0.3,0.4],
    'colsample_bytree' : [0.3,0.4,0.5,0.7]
}

xg_grid = GridSearchCV(xgb_model, xg_params, cv =2,verbose = 2,n_jobs = 4)


In [57]:
xg_grid.fit(X_train, y_train)

Fitting 2 folds for each of 3840 candidates, totalling 7680 fits


Parameters: { "learing_rate" } are not used.



In [58]:
print(xg_grid.best_params_)
print(xg_grid.best_score_)

print(f'Train Accuracy - :{xg_grid.score(X_train,y_train):.3f}')
print(f'Test Accuracy - :{xg_grid.score(X_test,y_test):.3f}')

{'colsample_bytree': 0.3, 'gamma': 0.2, 'learing_rate': 0.05, 'max_depth': 5, 'min_child_weight': 1}
0.8928617666959303
Train Accuracy - :0.943
Test Accuracy - :0.895


In [60]:
#Adaboost

adb_clf = AdaBoostClassifier()
adb_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = adb_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with default settings: {accuracy:.2f}")

  y = column_or_1d(y, warn=True)


Accuracy with default settings: 0.87


In [63]:
#With Hyperparameter turning
adb_clf_hype = AdaBoostClassifier(estimator=DecisionTreeClassifier())
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1],
    'estimator__max_depth': [1, 2, 3]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=adb_clf_hype, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Make predictions with the best estimator
y_pred = best_estimator.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Best parameters: {best_params}")
print(f"Accuracy with best parameters: {accuracy:.2f}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best parameters: {'estimator__max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100}
Accuracy with best parameters: 0.89
