In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load preprocessed data
file_path = '/Users/rajhabib/customer-churn-prediction/data/preprocessed_data.csv'
df = pd.read_csv(file_path)

# Separate features (X) and target variable (y)
X = df.drop(['customerID', 'Churn'], axis=1)  # Exclude non-numeric and target columns

# Encode categorical features using OneHotEncoder or get_dummies
X_encoded = pd.get_dummies(X, drop_first=True)  # Adjust drop_first as needed

# Encode target variable 'Churn' into numeric labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Churn'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define parameters for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]  # Removed 'auto'
}

# Perform grid search
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate best model on test data
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Accuracy: {accuracy:.2f}")

# Cross-validation with best model
scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean CV Accuracy:", scores.mean())


Best Model Accuracy: 0.81
Cross-Validation Scores: [0.82342502 0.81543922 0.79236912 0.79680568 0.79928952]
Mean CV Accuracy: 0.8054657124259851
