In [20]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Load the dataset
data = pd.read_csv("customer_churn.csv")

# Check for and remove duplicate rows
data.drop_duplicates(inplace=True)

# Save the 'Names' column
names = data['Names']

# Drop the 'Names' column
data.drop(columns=['Names'], inplace=True)

# Convert all categorical features to numeric using Label Encoding
label_encoder = LabelEncoder()
for col in data.select_dtypes(include=['object']):
    data[col] = label_encoder.fit_transform(data[col])

# Separate features and target variable
X = data.drop(columns=['Churn'])
y = data['Churn']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define preprocessing steps
numeric_features = X.columns.tolist()

# Pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Impute missing values with mean
    ('scaler', StandardScaler()), # Scale features
     # Select top k features
    ('pca', PCA(n_components=X.shape[1])) # Perform PCA
])

# Preprocessor for numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Define the pipeline with preprocessing and classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', GradientBoostingClassifier())])

# Define hyperparameters to tune
param_grid = {
    'classifier__learning_rate': [0.1, 0.05, 0.01],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7]
}

# Perform randomized search to find best hyperparameters
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, cv=StratifiedKFold(n_splits=5), 
                                   scoring='accuracy', verbose=1, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get best model
best_model = random_search.best_estimator_

# Fit best model on full training data
best_model.fit(X_train, y_train)

# Predict on test set
y_pred = best_model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Add the 'Names' column back to y_test
y_test_with_names = pd.concat([names[X_test.index], y_test], axis=1)

# Export results to CSV
y_test_with_names.to_csv("predicted_churn.csv", index=False)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy: 0.8888888888888888
