# Codtech — Data Analyst Internship

 **Task 2 — Predictive analysis using machine learning**

**Objective:** Build a classification model to predict customer churn. This notebook demonstrates feature selection, model training, and evaluation. Dataset: synthetic, ~10,000 rows.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, RocCurveDisplay
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Read the churn dataset from disk, then report its dimensions and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/mnt/data/codtech_task2_dataset.csv')
print('Dataset shape:', df.shape)
display(df.head(n=5))

In [None]:
# Quick EDA: missing values per column, class balance of the target, and summary statistics.
print('Missing values per column:\n', df.isna().sum())
print('\nTarget distribution (churn = 1):\n', df['churn'].value_counts(normalize=True))
display(df.describe().transpose())

# Histograms of two numeric columns.
# Each entry is (column name, number of bins, plot title, x-axis label).
for column, bins, title, xlabel in [
    ('age', 25, 'Age distribution', 'Age'),
    ('balance_usd', 30, 'Balance distribution', 'Balance (USD)'),
]:
    plt.figure()
    df[column].hist(bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Prepare features and target.
# `churn` is the label. X keeps every other column, but only the columns listed in
# numeric_features / categorical_features reach the model (remainder='drop' below).
X = df.drop(columns=['churn'])
y = df['churn']

numeric_features = ['credit_score', 'age', 'tenure_years', 'balance_usd', 'num_products',
                    'estimated_salary_usd', 'monthly_spend_usd', 'transactions_last_month',
                    'mobile_app_usage_min_per_day', 'campaigns_responded']

categorical_features = ['geography', 'gender']

# Preprocessing pipeline: standardise numeric columns, one-hot encode categorical ones.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Dense output is requested so the encoded matrix can be inspected directly.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old keyword,
# so try the current name first and fall back only on releases that predate it.
try:
    categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    categorical_transformer = OneHotEncoder(drop='first', sparse=False)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

# Fit-transform to get a feature matrix for the selection demonstration.
# NOTE: this fit uses the full dataset and is for exploration only; it must not be used
# to report model performance, since no rows are held out at this point.
X_pre = preprocessor.fit_transform(X)

# Build feature names in the transformer's output order: numeric columns first, then the
# one-hot columns (the first level of each categorical is dropped by drop='first').
ohe = preprocessor.named_transformers_['cat']
cat_cols = list(ohe.get_feature_names_out(categorical_features))
feature_names = numeric_features + cat_cols
X_pre.shape, len(feature_names)

In [None]:
# Feature selection demo: score each preprocessed column against the target with
# f_classif and keep the 8 highest-scoring ones.
# The selector is fit on X_pre, so the resulting list is illustrative only.
selector = SelectKBest(score_func=f_classif, k=8).fit(X_pre, y)
mask = selector.get_support()  # boolean flag per column, True where the column is kept
selected_features = [name for name, keep in zip(feature_names, mask) if keep]
selected_features

In [None]:
# Hold out a stratified 20% test split. The pipelines below are fit on the training
# split only, so scaling, encoding and feature selection never see the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

# Two end-to-end pipelines with the same preprocessing and top-8 selection steps;
# they differ only in the final classifier.
pipe_lr = Pipeline(steps=[
    ('pre', preprocessor),
    ('sel', SelectKBest(score_func=f_classif, k=8)),
    ('clf', LogisticRegression(max_iter=1000)),
])
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('sel', SelectKBest(score_func=f_classif, k=8)),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Fit Logistic Regression on the training split and score it on the held-out test split.
y_pred_lr = pipe_lr.fit(X_train, y_train).predict(X_test)
y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]  # probability of the positive class
print('Logistic Regression - Accuracy:', accuracy_score(y_test, y_pred_lr))
print('Logistic Regression - ROC AUC:', roc_auc_score(y_test, y_proba_lr))
print('\nClassification report:\n', classification_report(y_test, y_pred_lr))

In [None]:
# Tune the Random Forest pipeline with a small grid search:
# 2 x 2 parameter combinations, 3-fold cross-validation, ranked by ROC AUC.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [6, 12],
}
grid = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,  # use all available cores
)
grid.fit(X_train, y_train)
print('Best params:', grid.best_params_)

# Evaluate the best pipeline found by the search on the held-out test split.
best_rf = grid.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]  # probability of the positive class
print('Random Forest - Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Random Forest - ROC AUC:', roc_auc_score(y_test, y_proba_rf))
print('\nClassification report:\n', classification_report(y_test, y_pred_rf))

In [None]:
# ROC curves for both fitted pipelines on the held-out test split.
# RocCurveDisplay.from_estimator creates its own figure when no `ax` is given, so a
# preceding bare plt.figure() would be left empty. Create the axes explicitly and pass
# them in so that each curve and its title land on the same figure.
fig, ax = plt.subplots()
RocCurveDisplay.from_estimator(pipe_lr, X_test, y_test, ax=ax)
ax.set_title('ROC - Logistic Regression')
plt.show()

fig, ax = plt.subplots()
RocCurveDisplay.from_estimator(best_rf, X_test, y_test, ax=ax)
ax.set_title('ROC - Random Forest')
plt.show()

# Confusion matrix for best_rf (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_test, y_pred_rf)
print('Confusion matrix:\n', cm)

In [None]:
# Persist the tuned Random Forest pipeline to disk as an example.
# NOTE: joblib files are pickle-based; only load them from sources you trust.
import joblib

model_path = '/mnt/data/codtech_task2_best_model.pkl'
joblib.dump(value=best_rf, filename=model_path)
print('Saved best model to', model_path)

## Conclusion & Next steps

- Random Forest (after a small grid search) generally achieved a higher ROC AUC than Logistic Regression.
- Next steps: more thorough hyperparameter tuning, cross-validation, calibration, testing on real data, and feature importance analysis.
