# Capstone Project - Modelling
**Hazırlayan:** Zeynep Sarı

In [1]:
import pandas as pd

In [2]:
# Read the preprocessed dataset from its Parquet file into a DataFrame.
data_path = "preprocessed_data.parquet"
df = pd.read_parquet(path=data_path)

# Preview the first rows as a quick sanity check of the loaded columns.
df.head()


Unnamed: 0_level_0,age,tenure,avg_call_duration,data_usage,roaming_usage,monthly_charge,overdue_payments,auto_payment,avg_top_up_count,call_drops,...,satisfaction_score,churn,CüzdanX,HızlıPazar,Konuşalım,RitimGo,İzleGo,service_type_Broadband,service_type_Postpaid,service_type_Prepaid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6549e0e5-8e85-4037-b52e-31dd3dec6624,23,45.0,14.27,146.62,4.7,1761.84,0,0.499828,74,13.0,...,5.76,0,0,0,0,0,0,0,0,1
e7091181-b875-4e74-bc34-992cd37e7dee,32,15.0,60.511764,104.61,29.991167,366.11,1,0.0,0,10.002964,...,7.67,0,0,0,0,0,0,1,0,0
26e3bdf7-0b20-4790-b7a8-5b626b80033a,33,16.0,31.66,28.5,19.75,582.74,4,1.0,0,20.0,...,3.59,0,0,0,1,0,1,0,1,0
78f92350-7189-44be-8304-3ece87c88de7,19,14.0,60.511764,100.065695,29.991167,425.17,1,0.0,0,10.002964,...,9.71,0,0,0,0,1,0,1,0,0
95aa23c8-68ec-4c33-8d10-5ed0678ff4c6,44,249.0,77.84,100.065695,15.15,843.2,5,0.0,0,17.0,...,9.34,0,0,0,0,0,0,0,1,0


## Data Prep

In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [4]:
# Separate the label from the predictors: `churn` is the target and every
# remaining column is treated as a feature.
y = df.loc[:, 'churn']
X = df.drop('churn', axis=1)

# Show both shapes so the row counts can be checked against each other.
X.shape, y.shape

((10000000, 20), (10000000,))

In [5]:
# Split the data, normalize the features, then balance the training classes.
train_test_split_ratio = 0.2
# Fixed seed so the split and the SMOTE samples are identical on every
# Restart & Run All; without it every run trains and tests on different rows.
random_seed = 42

# Stratify on the target so train and test keep the same churn rate. The
# recorded output of this cell shows the positive class is rare (7,893,447 of
# 8,000,000 training rows were class 0), so an unstratified split can shift
# the minority share between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=train_test_split_ratio,
    stratify=y,
    random_state=random_seed,
)

# Normalization: the scaler is fitted on the training split only and then
# applied to the test split, so test statistics never influence training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training split only; the test split keeps its original
# class distribution so the evaluation reflects real-world imbalance.
smote = SMOTE(random_state=random_seed)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
# Check new class distribution
print(f"Class distribution after SMOTE: {Counter(y_train_resampled)}")

# Last expression: shapes of the resampled training data and the test data.
X_train_resampled.shape, X_test_scaled.shape, y_train_resampled.shape, y_test.shape

Class distribution after SMOTE: Counter({0: 7893447, 1: 7893447})


((15786894, 20), (2000000, 20), (15786894,), (2000000,))

## Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [7]:
# Train the model on the SMOTE-balanced, standardized training data.
# `saga` is a stochastic solver (it shuffles the data), so its seed is pinned
# to make the fitted coefficients and the metrics below repeatable.
model = LogisticRegression(solver="saga", C=0.1, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Evaluate on Test Set. The test split was scaled but not resampled, so these
# scores are measured on the original class distribution.
y_pred = model.predict(X_test_scaled)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report all four metrics rounded to two decimals.
print("\nTest Set Evaluation:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy Score: {accuracy:.2f}")


Test Set Evaluation:
Precision: 0.02
Recall: 0.82
F1 Score: 0.05
Accuracy Score: 0.55


## Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# fit decision tree
# The seed is fixed because scikit-learn permutes the candidate features at
# every split; without it, ties between equally good splits can be broken
# differently on each run, changing the selected tree and its importances.
model = DecisionTreeClassifier(random_state=42)

# hyperparameters
param_grid = {
    'max_depth': [5, 10],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 5],
}
num_fold = 5

# Score candidates by F1 on the positive class.
f1_scorer = make_scorer(f1_score)

# Set up GridSearchCV to find the best parameters based on F1 score
# NOTE(review): the search is fitted on the SMOTE-resampled training set, so
# the validation folds are class-balanced and contain synthetic rows. The CV
# F1 is therefore not comparable to the test-set F1 below; consider moving the
# resampling inside the cross-validation loop -- TODO confirm.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=num_fold, n_jobs=-1, verbose=2)

# Fit the model with grid search
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")
best_model = grid_search.best_estimator_

# Evaluate on the Test Set (scaled, but not resampled)
y_pred = best_model.predict(X_test_scaled)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nTest Set Evaluation:")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy Score: {accuracy:.2f}")

# Plot feature importances
feature_importances = best_model.feature_importances_

# Feature names come from the DataFrame `X_train`: the scaled and resampled
# matrices the model was trained on are plain arrays without column labels,
# but they keep the column order of `X_train`.
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=X_train.columns)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

Fitting 5 folds for each of 8 candidates, totalling 40 fits


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# fit Random Forest model
# The seed is fixed so the bootstrap samples and per-split feature subsets,
# and therefore the selected model and its importances, are repeatable.
model = RandomForestClassifier(random_state=42)

# Hyperparameters for GridSearch
param_grid = {
    'n_estimators': [200, 1000],  # Number of trees in the forest
    'max_depth': [10, 20],
    'min_samples_leaf': [1, 5],
}

num_fold = 5

# Score candidates by F1 on the positive class.
f1_scorer = make_scorer(f1_score)

# NOTE(review): the search is fitted on the SMOTE-resampled training set, so
# the validation folds are class-balanced and contain synthetic rows. The CV
# F1 is therefore not comparable to the test-set F1 below -- TODO confirm
# whether resampling should move inside the cross-validation loop.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=num_fold, n_jobs=-1, verbose=2)

grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")
best_model = grid_search.best_estimator_

# Evaluate on the Test Set (scaled, but not resampled)
y_pred = best_model.predict(X_test_scaled)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nTest Set Evaluation:")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy Score: {accuracy:.2f}")

# Plot feature importances
feature_importances = best_model.feature_importances_

# Take the labels from `X_train`, not `X_train_scaled`: StandardScaler returns
# a NumPy array by default, which has no `.columns` attribute, so the original
# lookup raised AttributeError. The scaled matrix keeps X_train's column order.
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=X_train.columns)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
