In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Load the raw customer churn table from the working directory.
df = pd.read_csv('customer_churn.csv')

# Initial exploration: first rows, column dtypes / non-null counts, and
# summary statistics of the numeric columns.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.describe())


FileNotFoundError: [Errno 2] No such file or directory: 'customer_churn.csv'

In [None]:
# Report how many missing entries each column has.
print(df.isna().sum())

# Fill gaps in two passes: numeric columns receive their column median first,
# then whatever is still missing (the non-numeric columns) is set to 'Unknown'.
df = df.fillna(df.median(numeric_only=True)).fillna('Unknown')


In [None]:
# Object-dtype columns are treated as the categorical features.
cat_cols = df.select_dtypes(include=['object']).columns

# Two-valued categoricals are replaced in place by integer codes; the single
# encoder instance is simply re-fitted for every such column.
label_enc = LabelEncoder()
for col in cat_cols:
    if df[col].nunique() != 2:
        continue
    df[col] = label_enc.fit_transform(df[col])

# Every categorical column that is still non-numeric is expanded into
# indicator columns, dropping the first level of each.
df = pd.get_dummies(df, drop_first=True)


In [None]:
# Feature matrix: every column except the target (assumed to be 'Churn').
X = df.drop('Churn', axis=1)

# Binary target: 1 for churners, 0 otherwise. The comparison accepts both the
# raw 'Yes' label and an already-encoded 1, so it works whether or not the
# column was label-encoded earlier.
# Fix: the method name was garbled ('apphkfyfly'), which raised AttributeError;
# Series.apply is the intended element-wise call.
y = df['Churn'].apply(lambda x: 1 if x == 'Yes' or x == 1 else 0)


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Baseline: a random forest with default hyper-parameters and a fixed seed.
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test partition.
y_pred = model.predict(X_test)
print("Initial Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# 5-fold cross-validated accuracy of the baseline estimator, computed on the
# training partition only.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))


In [None]:
# Exhaustive search space: 2 x 2 x 2 x 2 = 16 candidate forests.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    criterion=['gini', 'entropy'],
)

# Each candidate is scored by 5-fold cross-validated accuracy on the
# training partition.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best Params (GridSearch):", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


In [None]:
# Wider search space than the grid above; only a random subset is evaluated.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# Draw 10 parameter combinations (seeded, so the draw is repeatable) and
# score each with 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Params (Random Search):", random_search.best_params_)
print("Best Score:", random_search.best_score_)


In [None]:
# Importance of each feature according to the best grid-search forest,
# labelled by column name and ordered from most to least important.
importances = pd.Series(
    grid_search.best_estimator_.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Horizontal bar chart: one bar per feature.
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=importances, y=importances.index)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.tight_layout()
plt.show()


In [None]:
# L2-penalised (ridge) logistic regression; score() reports mean accuracy
# on the test partition.
ridge_model = LogisticRegression(solver='liblinear', penalty='l2').fit(X_train, y_train)
print("Ridge Logistic Regression Accuracy:", ridge_model.score(X_test, y_test))

# L1-penalised (lasso) logistic regression, same solver and evaluation.
lasso_model = LogisticRegression(solver='liblinear', penalty='l1').fit(X_train, y_train)
print("Lasso Logistic Regression Accuracy:", lasso_model.score(X_test, y_test))


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Keras needs a single homogeneous numeric array. In recent pandas versions
# pd.get_dummies emits bool columns, and a frame mixing bool and numeric
# columns converts to an object-dtype array that cannot become a tensor, so
# cast features and labels to float32 NumPy arrays explicitly.
X_train_nn = X_train.to_numpy(dtype='float32')
y_train_nn = y_train.to_numpy(dtype='float32')

# Bound to `nn_model` rather than `model`: `model` already refers to the
# fitted RandomForest baseline, which the cross-validation cell still uses.
# Rebinding it here would break re-running that cell.
nn_model = Sequential()
# One hidden layer of 64 ReLU units, sized to the number of feature columns.
nn_model.add(Dense(64, input_dim=X_train_nn.shape[1], activation='relu'))
# Dropout regularisation: half of the hidden activations are dropped
# during training.
nn_model.add(Dropout(0.5))
# Single sigmoid unit -> probability of the positive (churn) class.
nn_model.add(Dense(1, activation='sigmoid'))

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# 20% of the training rows are held back by Keras as a validation set.
nn_model.fit(X_train_nn, y_train_nn, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# The final model is the best forest found by the grid search; evaluate it
# once on the held-out test partition.
final_model = grid_search.best_estimator_
final_pred = final_model.predict(X_test)

# Headline metrics, printed in a fixed order, then the per-class report.
for label, metric in (
    ("Final Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, final_pred))
print("Classification Report:\n", classification_report(y_test, final_pred))
