# Modelling
## 1. Prepare for Modelling
### 1.1. Import libraries

In [27]:
import pandas as pd

# Logistic regression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


### 1.2. Import data

In [3]:
# TODO: replace with the DataFrame produced by the feature-selection step
all_data = (
    pd.read_csv("df_clustered.csv")
    .set_index('Customer ID')  # use the customer identifier as the row index
)

In [4]:
# Display the column labels of the loaded frame
all_data.columns

Index(['Age', 'Under 30', 'Senior Citizen', 'Married', 'Dependents',
       'Number of Dependents', 'City', 'Referred a Friend',
       'Number of Referrals', 'Tenure in Months', 'Tenure Category',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Avg Monthly GB Download', 'Online Security',
       'Online Backup', 'Device Protection Plan', 'Premium Tech Support',
       'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data',
       'Paperless Billing', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Satisfaction Score', 'New Customer', 'Churn Label',
       'Churn Score', 'CLTV', 'Churn Reason', 'Population', 'Engagement Score',
       'Gender_Male', 'Offer_Offer A', 'Offer_Offer B', 'Offer_Offer C',
       'Offer_Offer D', 'Offer_Offer E', 'Internet Type_Cable',
       'Internet Type_DSL', 'Internet Type_Fiber Optic',

In [5]:
# Discard the cluster columns (names prefixed with 'Cluster_'), which are treated as irrelevant here
is_cluster_column = all_data.columns.str.startswith('Cluster_')
columns_to_remove = all_data.columns[is_cluster_column].tolist()
all_data.drop(columns=columns_to_remove, inplace=True, errors='ignore')

### 1.3. Feature Selection

In [6]:
# Scenario 1: a fixed set of predictor columns plus the target column ('Churn Label')
scenario1_columns = [
    'Senior Citizen',
    'Dependents',
    'Referred a Friend',
    'Internet Service',
    'Internet Type_Fiber Optic',
    'Online Security',
    'Offer_Offer E',
    'Offer_Offer A',
    'Premium Tech Support',
    'Unlimited Data',
    'Contract_Month-to-Month',
    'Paperless Billing',
    'Payment Method_Credit Card',
    'Churn Label',
]
scenario1 = all_data[scenario1_columns]

### 1.4. Split data into predictors (X) and target variable (y)

In [20]:
# Work on a copy of the scenario under test so the scenario frame itself is not modified
df = scenario1.copy()

# Separate the target column from the predictors
y = df['Churn Label']
X = df.drop(columns='Churn Label')


In [21]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so the class proportions of y are not
# guaranteed to match between train and test — confirm whether stratifying on y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Baseline Model (Logistic Regression)
> Logistic Regression:
Simple and interpretable.
Provides probabilities for class membership.

In [22]:
# Baseline classifier with scikit-learn's default settings, fitted on the training split
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [24]:
# Predict class labels for the held-out test predictors
y_pred = model.predict(X_test)

In [26]:
# Score the test-set predictions: overall accuracy, confusion matrix and per-class report
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{matrix}')
print(f'Classification Report:\n{report}')



Accuracy: 0.8097941802696949
Confusion Matrix:
[[883 126]
 [142 258]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1009
           1       0.67      0.65      0.66       400

    accuracy                           0.81      1409
   macro avg       0.77      0.76      0.76      1409
weighted avg       0.81      0.81      0.81      1409



## 3. Candidate Models to Compare Against the Baseline

> Decision Trees:
Intuitive and easy to understand.
Can capture complex relationships in the data.

> Random Forest:
Ensemble method built on decision trees.
Generally more robust and accurate than individual trees.

> Support Vector Machines (SVM):
Effective in high-dimensional spaces.
Works well when there is a clear margin of separation between classes.

> Naive Bayes:
Assumes independence between features.
Fast and can perform well on certain types of data.

> K-Nearest Neighbors (KNN):
Instance-based learning.
Simple and easy to understand.

> Gradient Boosting (e.g., XGBoost, LightGBM, AdaBoost):
Builds a strong predictive model by combining weak models.
Often produces very accurate results.