In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# Read the workbook and discard the extra index column in a single step.
df = pd.read_excel("data.xlsx").drop(columns=['Unnamed: 0'])

# Both columns are forced to numeric: entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by the median of that same column.
for column in ('day.charge', 'eve.mins'):
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df.loc[:, column] = df[column].fillna(df[column].median())


# Remove outliers
def remove_outliers(df, feature_cols):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    The columns are handled one after another, and each pass filters the rows
    that survived the previous passes; the quartiles of a later column are
    therefore computed on the already-filtered frame. A row whose value in the
    current column is NaN fails the range test and is dropped as well.

    Args:
        df: DataFrame to filter. It is not modified.
        feature_cols: Iterable of column labels to apply the filter to.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    kept = df
    for name in feature_cols:
        q_low, q_high = kept[name].quantile([0.25, 0.75])
        spread = q_high - q_low
        # between() is inclusive on both ends, i.e. lower <= value <= upper.
        in_range = kept[name].between(q_low - 1.5 * spread,
                                      q_high + 1.5 * spread)
        kept = kept[in_range]
    return kept

# Snapshot the numeric columns first; the IQR filter is then run over each of
# them. df_numeric itself keeps the rows as they were before filtering.
df_numeric = df.select_dtypes(include=np.number)
numeric_feature_names = df_numeric.columns
df = remove_outliers(df, numeric_feature_names)

# Remove multicollinear features
def find_multicollinear_features(dataset, threshold=0.7):
    """Return the columns that correlate strongly with an earlier column.

    The pairwise correlation matrix comes from ``dataset.corr()`` with its
    default settings. A column is reported when the absolute correlation with
    at least one column positioned before it is strictly greater than
    ``threshold``; the first column of each correlated pair is never reported
    on account of that pair. NaN correlations never exceed the threshold.

    Args:
        dataset: DataFrame whose columns are compared.
        threshold: Absolute-correlation cut-off (exclusive).

    Returns:
        A set with the labels of the flagged columns.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    magnitudes = corr_matrix.abs().to_numpy()
    # Row `row`, columns [0, row) is the strict lower triangle: the
    # correlations of this column with every column that precedes it.
    return {
        names[row]
        for row in range(1, len(names))
        if (magnitudes[row, :row] > threshold).any()
    }

# Flag the redundant columns and remove them from df in place.
# The correlations are measured on df_numeric, not on df itself.
high_corr_features = find_multicollinear_features(df_numeric)
df.drop(labels=list(high_corr_features), axis="columns", inplace=True)

# Numerical features that get standardized (after the train/test split below)
num_cols = ['account.length', 'voice.messages', 'intl.mins', 'intl.calls',
            'day.mins', 'day.calls', 'eve.mins', 'eve.calls', 'night.mins',
            'night.calls', 'customer.calls']

# Label encoding for binary columns.
# The single encoder is refitted on every column, so once the loop finishes it
# only holds the classes of the last column in binary_cols.
binary_cols = ['voice.plan', 'intl.plan', 'churn']
encoder = LabelEncoder()
for col in binary_cols:
    df[col] = encoder.fit_transform(df[col])

# One-hot encoding for 'state' (first level dropped)
df = pd.get_dummies(df, columns=['state'], drop_first=True)

# One-hot encoding for 'area.code', with an underscore-style column prefix
df = pd.get_dummies(df, columns=['area.code'], prefix="area_code")

# Save feature names to maintain consistency during inference
training_columns = df.drop(columns=['churn']).columns

# Split BEFORE scaling so the held-out rows stay unseen by every fitted step.
X = df.drop(columns=['churn'])
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize numerical features.
# The scaler is fitted on the training rows only; fitting it on the full
# dataset would leak the test rows' mean/variance into preprocessing. The test
# rows are transformed with the statistics learned from the training rows.
# Explicit copies make the column assignments write to independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Train Gradient Boosting Model
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Evaluate on the held-out split so it actually serves its purpose.
print(f"Held-out accuracy: {gb_model.score(X_test, y_test):.4f}")

# Save model, scaler & feature names
joblib.dump(gb_model, "gb.pkl")
joblib.dump(scaler, "scaler0.pkl")
joblib.dump(list(training_columns), "feature_names.pkl")  # column order seen by the model

print("✅ Model, Scaler & Feature Names Saved!")


✅ Model, Scaler & Feature Names Saved!
