In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score
import joblib

In [6]:
# Load dataset
def load_data(file_path):
    return pd.read_excel(file_path, engine='pyxlsb')


file_path = '45K_Data_For_Churn_Prediction_Modelling.xlsb'
df = load_data(file_path)
df = clean_data(df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [7]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45419 entries, 0 to 45418
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   smsid                                45419 non-null  int64  
 1   Pincode                              45419 non-null  int64  
 2   Email_Check                          45419 non-null  int64  
 3   RMN_Flag                             45419 non-null  int64  
 4   AON                                  45419 non-null  int64  
 5   LangaugeZone                         45419 non-null  object 
 6   LanguagePrintName                    45419 non-null  object 
 7   DASType                              45419 non-null  int64  
 8   Model_type                           45419 non-null  object 
 9   Boxtype                              45419 non-null  object 
 10  ModelName                            45419 non-null  object 
 11  BrandName                   

Unnamed: 0,smsid,Pincode,Email_Check,RMN_Flag,AON,LangaugeZone,LanguagePrintName,DASType,Model_type,Boxtype,...,RADA_COLOR,Active_Days_mar24,Active_Days_Jan24,Active_Days_Feb24,Online_Recharge_Count,Total_VAS_Count,Upgrade,DownGrade,OB_calls_in_last_1_month,BSP_Plan_Activated
0,2780601,781335,1,1,6,NORTH,HINDI/PUNJ,5,SD,MPEG-4,...,GREEN,26.0,26.0,26.0,11.0,0.0,0,0,1,0
1,2782701,641109,1,1,6,TAMIL,TAMIL,2,SD,MPEG-4,...,GREEN,30.0,30.0,29.0,5.0,0.0,0,0,1,0
2,2782801,401101,1,1,6,NORTH,MARATHI,2,HD,MPEG-4,...,RED,31.0,31.0,29.0,3.0,1.0,0,0,1,0
3,2785001,752108,1,1,6,NORTH,HINDI/PUNJ,5,HD,MPEG-4,...,GREEN,31.0,31.0,29.0,5.0,0.0,1,1,1,0
4,2785101,421201,1,1,6,NORTH,HINDI/PUNJ,2,HD,MPEG-4,...,YELLOW,31.0,31.0,29.0,6.0,0.0,0,0,1,0


In [8]:
# Handle missing values
def clean_data(df):
    threshold = 0.8 * len(df)
    df.dropna(axis=1, thresh=threshold, inplace=True)

    for col in df.select_dtypes(include=['number']).columns:
        df[col].fillna(df[col].median(), inplace=True)

    for col in df.select_dtypes(include=['object']).columns:
        df[col].fillna(df[col].mode()[0], inplace=True)

    return df

In [9]:
# Churn prediction label generation
def churn_prediction(df):
    activity_cols = ['Active_Days_Jan24', 'Active_Days_Feb24', 'Active_Days_mar24']
    df['Total_Active_Days'] = df[activity_cols].sum(axis=1)
    df['Churn_Prediction'] = df['Total_Active_Days'].apply(lambda x: 'Active' if x > 20 else 'Non-Active')
    return df

df = churn_prediction(df)

#Prerit Bhageria Project

In [10]:
# Feature processing
def preprocess_data(df):
    X = df[['Active_Days_Jan24', 'Active_Days_Feb24', 'Active_Days_mar24']]
    y = LabelEncoder().fit_transform(df['Churn_Prediction'])
    X = StandardScaler().fit_transform(X)
    return train_test_split(X, y, test_size=0.2, random_state=42)


X_train, X_test, y_train, y_test = preprocess_data(df)

# Model training and evaluation
models = {
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(probability=True),
    'Logistic Regression': LogisticRegression()
}

best_model, best_score = None, 0
for name, model in models.items():
    model.fit(X_train, y_train)
    score = accuracy_score(y_test, model.predict(X_test))
    print(f'{name} Accuracy: {score:.4f}')
    if score > best_score:
        best_model, best_score = model, score

KNN Accuracy: 0.9999
SVM Accuracy: 0.9999
Logistic Regression Accuracy: 0.9999


In [11]:
# Hyperparameter tuning
if isinstance(best_model, KNeighborsClassifier):
    param_grid = {'n_neighbors': [3, 5, 7]}
elif isinstance(best_model, SVC):
    param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
elif isinstance(best_model, LogisticRegression):
    param_grid = {'C': [0.1, 1, 10]}
else:
    param_grid = {}

if param_grid:
    grid_search = GridSearchCV(best_model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f'Best Hyperparameters: {grid_search.best_params_}')

Best Hyperparameters: {'n_neighbors': 7}


In [19]:
from sklearn.metrics import classification_report, confusion_matrix

# Get predictions
y_pred = best_model.predict(X_test)

# Check if the model supports probability predictions
if hasattr(best_model, "predict_proba"):
    y_prob = best_model.predict_proba(X_test)[:, 1]  # Get probability of being "Active"
    print("\nPredicted Probabilities (first 10 rows):\n", y_prob[:10])

# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f'Final Model: {best_model.__class__.__name__}')
print(f'Mean Absolute Error: {mae}')
print(f'Accuracy: {accuracy}')
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)




Predicted Probabilities (first 10 rows):
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Final Model: KNeighborsClassifier
Mean Absolute Error: 0.00011008366358432408
Accuracy: 0.9998899163364157

Confusion Matrix:
 [[9081    0]
 [   1    2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9081
           1       1.00      0.67      0.80         3

    accuracy                           1.00      9084
   macro avg       1.00      0.83      0.90      9084
weighted avg       1.00      1.00      1.00      9084



In [13]:
# Save the best model
joblib.dump(best_model, 'best_churn_model.pkl')

['best_churn_model.pkl']