In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from datetime import datetime

In [9]:
# 🔹 Read the preprocessed training table (path is relative to the working directory)
train_csv = "preprocessed_train_data.csv"
df = pd.read_csv(train_csv)

In [10]:
# Show the frame as loaded from the CSV (the rendered table is truncated by pandas).
df

Unnamed: 0,year_of_purchase,age_of_vehicle,odometer_reading,last_service_date,last_service_type,number_of_services,last_service_kms,avg_kms_per_month,next_service_due_kms,next_service_due_date,...,service_center_Velachery,warranty_status_Active,warranty_status_Expired,insurance_status_Active,insurance_status_Expired,AMC_status_No,AMC_status_Yes,preferred_language_English,preferred_language_Hindi,preferred_language_Tamil
0,2019,-0.247899,-0.206041,2024-10-04,Minor,0.560456,56063,811,66063,2025-02-04,...,0,0,1,0,1,1,0,0,0,1
1,2019,-0.247899,-1.082204,2025-05-01,Major,-0.907667,30554,443,40554,2025-10-28,...,0,0,1,1,0,1,0,0,0,1
2,2020,-0.692162,-0.519720,2025-04-03,Minor,0.927486,46881,813,56881,2025-09-30,...,0,0,1,0,1,1,0,1,0,0
3,2019,-0.247899,0.601719,2024-11-18,Minor,0.193425,82509,1149,92509,2025-05-17,...,0,0,1,0,1,1,0,1,0,0
4,2015,1.529153,0.398341,2024-12-27,Major,-0.540636,73001,642,83001,2025-06-25,...,0,0,1,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1.529153,1.174563,2025-02-16,Major,-0.907667,99377,838,109377,2025-08-15,...,0,0,1,1,0,0,1,0,1,0
996,2016,1.084890,0.923895,2024-10-14,Minor,-0.907667,90539,860,100539,2025-12-04,...,1,0,1,1,0,0,1,0,0,1
997,2021,-1.136425,-0.416348,2025-04-14,Minor,-0.907667,47837,1076,57837,2025-11-10,...,0,0,1,1,0,1,0,0,0,1
998,2015,1.529153,0.031489,2024-08-26,Minor,1.294517,64525,549,74525,2025-02-22,...,0,0,1,1,0,1,0,0,0,1


In [11]:
# 🔹 Handle datetime columns
# Turn each service date into a signed day offset from a fixed snapshot date,
# then drop the raw date column so only the numeric offset remains.
#
# The snapshot is pinned instead of read from the wall clock: with
# datetime.today() these features shift every day the notebook is re-run, so
# neither the trained model nor its metrics could be reproduced. 2025-07-22 is
# the date implied by the run recorded in this notebook (a last service on
# 2024-10-04 is shown as 291 days ago).
REFERENCE_DATE = pd.Timestamp("2025-07-22")

date_cols = ['last_service_date', 'next_service_due_date']
for col in date_cols:
    if col not in df.columns:
        # Column was already converted by an earlier run of this cell;
        # skipping keeps the cell safe to re-execute.
        continue
    parsed = pd.to_datetime(df[col], errors='coerce')  # unparseable values become NaT
    # Positive offset: the date lies before the snapshot; negative: after it.
    df[f"{col}_days_ago"] = (REFERENCE_DATE - parsed).dt.days
    df = df.drop(columns=[col])

In [12]:
# Show the frame after the two date columns were replaced by their *_days_ago offsets.
df

Unnamed: 0,year_of_purchase,age_of_vehicle,odometer_reading,last_service_type,number_of_services,last_service_kms,avg_kms_per_month,next_service_due_kms,next_service_due_days,pending_service,...,warranty_status_Expired,insurance_status_Active,insurance_status_Expired,AMC_status_No,AMC_status_Yes,preferred_language_English,preferred_language_Hindi,preferred_language_Tamil,last_service_date_days_ago,next_service_due_date_days_ago
0,2019,-0.247899,-0.206041,Minor,0.560456,56063,811,66063,357,No,...,1,0,1,1,0,0,0,1,291,168
1,2019,-0.247899,-1.082204,Major,-0.907667,30554,443,40554,296,Yes,...,1,1,0,1,0,0,0,1,82,-98
2,2020,-0.692162,-0.519720,Minor,0.927486,46881,813,56881,210,No,...,1,0,1,1,0,1,0,0,110,-70
3,2019,-0.247899,0.601719,Minor,0.193425,82509,1149,92509,180,Yes,...,1,0,1,1,0,1,0,0,246,66
4,2015,1.529153,0.398341,Major,-0.540636,73001,642,83001,180,No,...,1,1,0,0,1,0,1,0,207,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1.529153,1.174563,Major,-0.907667,99377,838,109377,180,Yes,...,1,1,0,0,1,0,1,0,156,-24
996,2016,1.084890,0.923895,Minor,-0.907667,90539,860,100539,180,No,...,1,1,0,0,1,0,0,1,281,-135
997,2021,-1.136425,-0.416348,Minor,-0.907667,47837,1076,57837,180,No,...,1,1,0,1,0,0,0,1,99,-111
998,2015,1.529153,0.031489,Minor,1.294517,64525,549,74525,180,No,...,1,1,0,1,0,0,0,1,330,150


In [13]:
# 🔹 Drop the listed columns when they exist; names missing from the frame are skipped
drop_cols = ['customer_id', 'name', 'email', 'mobile_number', 'follow_up_date']
columns_present = df.columns.intersection(drop_cols)
df = df.drop(columns=columns_present, errors='ignore')

In [14]:
# Show the frame after the column-drop step.
df

Unnamed: 0,year_of_purchase,age_of_vehicle,odometer_reading,last_service_type,number_of_services,last_service_kms,avg_kms_per_month,next_service_due_kms,next_service_due_days,pending_service,...,warranty_status_Expired,insurance_status_Active,insurance_status_Expired,AMC_status_No,AMC_status_Yes,preferred_language_English,preferred_language_Hindi,preferred_language_Tamil,last_service_date_days_ago,next_service_due_date_days_ago
0,2019,-0.247899,-0.206041,Minor,0.560456,56063,811,66063,357,No,...,1,0,1,1,0,0,0,1,291,168
1,2019,-0.247899,-1.082204,Major,-0.907667,30554,443,40554,296,Yes,...,1,1,0,1,0,0,0,1,82,-98
2,2020,-0.692162,-0.519720,Minor,0.927486,46881,813,56881,210,No,...,1,0,1,1,0,1,0,0,110,-70
3,2019,-0.247899,0.601719,Minor,0.193425,82509,1149,92509,180,Yes,...,1,0,1,1,0,1,0,0,246,66
4,2015,1.529153,0.398341,Major,-0.540636,73001,642,83001,180,No,...,1,1,0,0,1,0,1,0,207,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1.529153,1.174563,Major,-0.907667,99377,838,109377,180,Yes,...,1,1,0,0,1,0,1,0,156,-24
996,2016,1.084890,0.923895,Minor,-0.907667,90539,860,100539,180,No,...,1,1,0,0,1,0,0,1,281,-135
997,2021,-1.136425,-0.416348,Minor,-0.907667,47837,1076,57837,180,No,...,1,1,0,1,0,0,0,1,99,-111
998,2015,1.529153,0.031489,Minor,1.294517,64525,549,74525,180,No,...,1,1,0,1,0,0,0,1,330,150


In [15]:
# 🔹 One-hot encode every object-dtype column except the target
# NOTE(review): the frame displayed after this step has one column per distinct
# feedback_date value, so date-like text columns are being one-hot encoded as
# well — confirm that is intended.
object_cols = df.select_dtypes(include='object').columns
categorical_cols = object_cols.drop('pending_service')  # keep the label out of the dummies
df = pd.get_dummies(data=df, columns=categorical_cols)

In [16]:
# 🔹 Encode target variable
# Map the 'Yes'/'No' labels to 1/0.
# - Labels outside the mapping (including missing ones) would silently become
#   NaN under a plain .map(); they are reported here instead.
# - A target that is already numeric is left untouched, so re-running this cell
#   does not turn the encoded 0/1 values into NaN.
target_codes = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['pending_service']):
    encoded_target = df['pending_service'].map(target_codes)
    unknown_labels = df.loc[encoded_target.isna(), 'pending_service'].unique()
    if len(unknown_labels):
        raise ValueError(
            f"Unexpected pending_service labels: {list(unknown_labels)}"
        )
    df['pending_service'] = encoded_target.astype(int)

In [17]:
# Show the frame after one-hot encoding and target encoding.
df

Unnamed: 0,year_of_purchase,age_of_vehicle,odometer_reading,number_of_services,last_service_kms,avg_kms_per_month,next_service_due_kms,next_service_due_days,pending_service,call_duration_sec,...,feedback_date_2025-09-11,feedback_date_2025-09-12,feedback_date_2025-10-01,feedback_date_2025-10-04,feedback_date_2025-10-05,feedback_date_2025-10-06,feedback_date_2025-10-08,feedback_date_2025-10-09,feedback_date_2025-10-10,feedback_date_2025-10-11
0,2019,-0.247899,-0.206041,0.560456,56063,811,66063,357,0,197,...,0,0,0,0,0,0,0,0,0,0
1,2019,-0.247899,-1.082204,-0.907667,30554,443,40554,296,1,138,...,0,0,0,0,0,0,0,0,0,0
2,2020,-0.692162,-0.519720,0.927486,46881,813,56881,210,0,297,...,0,0,0,0,0,0,0,0,0,0
3,2019,-0.247899,0.601719,0.193425,82509,1149,92509,180,1,233,...,0,0,0,0,0,0,0,0,0,0
4,2015,1.529153,0.398341,-0.540636,73001,642,83001,180,0,278,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1.529153,1.174563,-0.907667,99377,838,109377,180,1,77,...,0,0,0,0,0,0,0,0,0,0
996,2016,1.084890,0.923895,-0.907667,90539,860,100539,180,0,215,...,0,0,0,0,0,0,0,0,0,0
997,2021,-1.136425,-0.416348,-0.907667,47837,1076,57837,180,0,62,...,0,0,0,0,0,0,0,0,0,0
998,2015,1.529153,0.031489,1.294517,64525,549,74525,180,0,53,...,0,0,0,0,0,0,0,0,0,0


In [18]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any convergence/deprecation warnings raised by
# the models fitted below — confirm that is wanted.
warnings.filterwarnings('ignore')

# 🔹 Separate the label from the feature matrix
y = df['pending_service']
X = df.drop(columns=['pending_service'])

# 🔹 Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# 1. Random Forest Classifier
# Fit a default-parameter forest on the training split and report per-class
# precision/recall/F1 on the held-out split.
print("🌲 Random Forest Classifier")
# Seeded so the forest (bootstrap samples, feature sub-sampling) and therefore
# the report below are identical on every run; 42 matches the split seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print(classification_report(y_test, rf_preds))

🌲 Random Forest Classifier
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       103
           1       0.90      0.88      0.89        97

    accuracy                           0.90       200
   macro avg       0.90      0.89      0.89       200
weighted avg       0.90      0.90      0.89       200



In [20]:
# 2. XGBoost Classifier
# Fit a gradient-boosted tree classifier on the training split and report
# per-class precision/recall/F1 on the held-out split.
#
# No try/except ModuleNotFoundError here: xgboost is imported in the notebook's
# first cell, so a missing package fails there. Inside this cell it could only
# surface as a NameError, which that handler would never have caught.
# `use_label_encoder` is omitted because it is deprecated in current xgboost
# releases and the target is already encoded as 0/1.
print("⚡ XGBoost Classifier")
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
print(classification_report(y_test, xgb_preds))

⚡ XGBoost Classifier
              precision    recall  f1-score   support

           0       0.92      0.89      0.91       103
           1       0.89      0.92      0.90        97

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.90       200
weighted avg       0.91      0.91      0.91       200



In [21]:
# 3. Logistic Regression
# Linear baseline: fit on the training split, then score the held-out split.
print("📈 Logistic Regression")
# max_iter is raised to 1000 to give the solver more iterations to converge.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_preds = logreg.predict(X_test)
logreg_report = classification_report(y_test, log_preds)
print(logreg_report)

📈 Logistic Regression
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       103
           1       0.84      0.87      0.85        97

    accuracy                           0.85       200
   macro avg       0.85      0.86      0.85       200
weighted avg       0.86      0.85      0.86       200



In [22]:
# Fit the classifier that gets persisted: a default-parameter XGBoost model
# trained on the training split only.
clf = XGBClassifier().fit(X_train, y_train)
# Leave the estimator as the last expression so its repr is rendered.
clf

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [23]:
import pickle

# Persist the fitted classifier to disk so it can be reloaded without retraining
model_path = "final_classification_model.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(clf, model_file)

In [24]:
# Restore the pickled classifier from disk.
# Only unpickle files you produced yourself: pickle.load can run arbitrary code.
with open("final_classification_model.pkl", mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

# Smoke-test the restored model on the first five held-out rows
preinput = X_test.iloc[:5]
result = loaded_model.predict(preinput)

print("Prediction:", result)

Prediction: [1 0 1 0 1]
