In [None]:
# final_project_eda.ipynb

import pandas as pd

# Load the four source tables (contract is the base table for the merge below)
contract, personal, internet, phone = [
    pd.read_csv(path)
    for path in ('contract.csv', 'personal.csv', 'internet.csv', 'phone.csv')
]

# Preview the first rows of every table
for table in (contract, personal, internet, phone):
    display(table.head())

# Left-join every other table onto contract by 'customerID', so each contract
# row is kept even when a customer is absent from another table
df = contract
for extra_table in (personal, internet, phone):
    df = df.merge(extra_table, on='customerID', how='left')

# Target variable: churn = 0 when EndDate is the literal 'No', otherwise 1
df['churn'] = df['EndDate'].ne('No').astype('int64')

# Check the target against the column it was derived from
display(df[['customerID', 'EndDate', 'churn']].head())

# Quick data check: merged size and class shares
print("Shape:", df.shape)
print("Churn Rate:")
display(df['churn'].value_counts(normalize=True))


In [None]:
# Missing values: NaN count per column, largest first, listing only the
# columns that actually contain gaps
missing_values = df.isna().sum().sort_values(ascending=False)
has_missing = missing_values.gt(0)
display(missing_values.loc[has_missing])


In [None]:
# Internet add-on columns that come out of the left merge with gaps
internet_service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                         'TechSupport', 'StreamingTV', 'StreamingMovies']

# A missing add-on is treated as "customer does not have it"
df[internet_service_cols] = df[internet_service_cols].fillna('No')

# The connection type has to be filled as well: otherwise its NaN values reach
# the LabelEncoder step later on (mixed str/NaN input, which either fails or
# yields a 'nan' class depending on the scikit-learn version).
# NOTE(review): presumably InternetService is missing for exactly the customers
# whose add-on columns are missing — confirm with the missing-values cell.
df['InternetService'] = df['InternetService'].fillna('No')

# Same convention for the phone add-on
df['MultipleLines'] = df['MultipleLines'].fillna('No')

# TotalCharges arrives as text; entries that cannot be parsed become NaN,
# and the count of those is shown so they can be handled in the next cell
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
display(df['TotalCharges'].isna().sum())


In [None]:
# Rows whose TotalCharges could not be parsed take their MonthlyCharges value
monthly_charges = df['MonthlyCharges']
df['TotalCharges'] = df['TotalCharges'].fillna(monthly_charges)

# Target: 0 when EndDate is the literal 'No', otherwise 1
df['churn'] = df['EndDate'].ne('No').astype('int64')

# Drop the identifier and the two date columns from the modeling frame
df_clean = df.drop(['customerID', 'BeginDate', 'EndDate'], axis=1)

# Yes/No columns become 1/0 (any other value maps to NaN)
binary_cols = ['PaperlessBilling', 'Partner', 'Dependents', 'OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport',
               'StreamingTV', 'StreamingMovies', 'MultipleLines']
yes_no_codes = {'Yes': 1, 'No': 0}
df_clean = df_clean.assign(
    **{col: df_clean[col].map(yes_no_codes) for col in binary_cols}
)

from sklearn.preprocessing import LabelEncoder

# Every remaining object column gets its own LabelEncoder; the fitted encoders
# are kept so the integer codes can be translated back to labels later
categorical_cols = df_clean.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    df_clean[col] = encoder.fit_transform(df_clean[col])

display(df_clean.head())


In [None]:
# Print the number of distinct values found in each column
for col, distinct_count in df_clean.nunique().items():
    print(f"{col}: {distinct_count}")


In [None]:
# Write the encoded frame to disk without the index column
cleaned_csv_path = "cleaned_telecom_data.csv"
df_clean.to_csv(cleaned_csv_path, index=False)


##  Data Cleaning Summary
- Merged all datasets using `customerID`.
- Replaced missing service-related values (due to customers not having internet or phone service) with `'No'`.
- Converted `TotalCharges` from string to float and filled missing values with `MonthlyCharges`.
- Created target column `churn`: 1 if `EndDate` is set (churned), 0 if `EndDate` is `'No'`.
- Encoded binary features (`Yes`/`No`) to `1`/`0`.
- Encoded categorical features like `Contract`, `InternetService`, and `PaymentMethod` using `LabelEncoder`.
- Final dataset contains fully numeric, clean data ready for modeling.

##  Exploratory Data Analysis (EDA)

In [None]:
# 1. Churn Distribution
import seaborn as sns
import matplotlib.pyplot as plt

# Customer counts per churn label (0 shown as 'Active', 1 as 'Churned')
ax = sns.countplot(data=df_clean, x='churn')
ax.set_title("Churn Distribution")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Active', 'Churned'])
plt.show()

# Share of each class among all customers
display(df_clean['churn'].value_counts(normalize=True))


In [None]:
# 2. Churn by Contract Type
# Tick labels are read from the fitted encoder (as in the payment-method and
# internet-service plots) so they always match the integer codes in 'Type'
# instead of relying on a hand-written list staying in the right order.
contract_labels = label_encoders['Type'].classes_
sns.countplot(data=df_clean, x='Type', hue='churn')
plt.title('Churn by Contract Type')
plt.xticks(list(range(len(contract_labels))), contract_labels)
plt.show()


In [None]:
# 3. Churn by Payment Method
# The encoder's classes_ give the original label for each integer code
payment_labels = label_encoders['PaymentMethod'].classes_
ax = sns.countplot(x='PaymentMethod', hue='churn', data=df_clean)
ax.set_title('Churn by Payment Method')
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(payment_labels, rotation=45)
plt.show()


In [None]:
#4. Churn by Internet Service
# The encoder's classes_ give the original label for each integer code
internet_labels = label_encoders['InternetService'].classes_
ax = sns.countplot(x='InternetService', hue='churn', data=df_clean)
ax.set_title('Churn by Internet Service')
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(internet_labels)
plt.show()


In [None]:
#5. Monthly Charges vs. Churn
# Filled density curves of MonthlyCharges, one per churn label
ax = sns.kdeplot(x='MonthlyCharges', hue='churn', fill=True, data=df_clean)
ax.set_title("Monthly Charges vs Churn")
plt.show()


In [None]:
#6. Churn by Senior, Partner, Dependents
# One count plot per demographic flag, each drawn and shown separately
demographic_cols = ('SeniorCitizen', 'Partner', 'Dependents')
for col in demographic_cols:
    ax = sns.countplot(x=col, hue='churn', data=df_clean)
    ax.set_title(f'Churn by {col}')
    plt.show()


In [None]:
# 7. Churn by Streaming, Security, TechSupport
# One count plot per service flag, each drawn and shown separately
cols = ['OnlineSecurity', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for col in cols:
    ax = sns.countplot(x=col, hue='churn', data=df_clean)
    ax.set_title(f'Churn by {col}')
    plt.show()


In [None]:
# 8. Correlation Heatmap
# Pairwise correlations of all columns in the encoded frame, annotated to 2 decimals
corr_matrix = df_clean.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


## Exploratory Data Analysis (EDA): Findings

### 1. Churn Distribution

About 26.5% of customers have churned (label `1`), while 73.5% remain active (`0`).  
This is a moderate class imbalance that should be considered during model evaluation.

*See: Churn Distribution chart*

---

### 2. Churn by Contract Type

Customers on month-to-month contracts show a significantly higher churn rate.  
Customers with one-year or two-year contracts churn far less, indicating stronger loyalty or satisfaction.

*See: Churn by Contract Type chart*

---

### 3. Churn by Payment Method

Customers paying via electronic check have the highest churn rate.  
In contrast, those using bank transfers or credit cards (automatic) exhibit significantly lower churn.  
This suggests auto-pay methods may reduce customer turnover.

*See: Churn by Payment Method chart*

---

### 4. Churn by Internet Service

Fiber optic internet users churn more than DSL users.  
Customers without internet service show the lowest churn, likely due to limited overall service use.

*See: Churn by Internet Service chart*

---

### 5. Monthly Charges vs Churn

Higher monthly charges (around \$70–\$100) correlate with higher churn.  
Customers with lower charges are more likely to stay.

*See: Monthly Charges vs Churn density plot*

---

### 6. Churn by Senior, Partner, Dependents

Senior citizens show a higher churn rate compared to non-seniors.  
Customers without a partner or without dependents are also more likely to churn.  
These patterns may reflect stability associated with family or life stage.

*See: Churn by SeniorCitizen, Partner, Dependents charts*

---

### 7. Churn by Streaming, Security, TechSupport

Customers using tech support and online security services have significantly lower churn.  
Streaming service usage (TV and movies) also correlates with lower churn, though less strongly.  
These services may increase customer engagement and perceived value.

*See: Churn by OnlineSecurity, TechSupport, StreamingTV, StreamingMovies charts*

---

### 8. Correlation Matrix

Contract type shows the strongest negative correlation with churn (-0.40): customers on longer contracts churn less.  
Other negatively correlated features include TechSupport, OnlineSecurity, and DeviceProtection.  
MonthlyCharges has a slight positive correlation with churn, supporting earlier visual observations.

*See: Correlation Heatmap*

---

## Key Insights for Modeling and Strategy

- Month-to-month contract holders and electronic check users are at highest churn risk.
- Long-term contracts and automatic payments (e.g., credit card, bank transfer) reduce churn.
- Engagement with tech support, security services, and streaming features lowers churn.
- Demographic indicators (e.g., no dependents or partner) slightly increase churn likelihood.
- Targeted retention strategies should focus on high-risk profiles identified above.


In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): no cell in this notebook defines X_train/X_test/y_train/y_test
# or service_cols, so this cell failed on a fresh kernel. The split is created
# here; test_size=0.25 is an assumption — confirm it against the split that
# produced the scores reported below.
X = df_clean.drop(columns=['churn'])
y = df_clean['churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Service flags encoded as 1/0 earlier in the notebook
service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']

# Any gap left in a service flag is treated as 0 (service not used).
# fillna with a per-column dict returns new frames, so re-running is harmless.
service_fill = {col: 0 for col in service_cols}
X_train = X_train.fillna(service_fill)
X_test = X_test.fillna(service_fill)


In [None]:
#Data is split
# Report the size of each partition rather than dumping the four objects in full
for split_name, split in (('X_train', X_train), ('X_test', X_test),
                          ('y_train', y_train), ('y_test', y_test)):
    print(f"{split_name}: {split.shape}")


In [None]:
#Baseline Modeling Cell
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Prints the model's class name, its AUC-ROC and accuracy (four decimals
    each) and a 30-character separator line.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    X_train, y_train : data passed to ``model.fit``
    X_test, y_test : data used for scoring

    Returns
    -------
    tuple
        ``(auc, acc)`` as returned by ``roc_auc_score`` and ``accuracy_score``.
    """
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score fed to AUC-ROC
    positive_scores = model.predict_proba(X_test)[:, 1]
    predicted_labels = model.predict(X_test)

    scores = (
        roc_auc_score(y_test, positive_scores),
        accuracy_score(y_test, predicted_labels),
    )

    report_lines = [
        f"{model.__class__.__name__}",
        f"AUC-ROC:  {scores[0]:.4f}",
        f"Accuracy: {scores[1]:.4f}",
        '-' * 30,
    ]
    print("\n".join(report_lines))
    return scores

# Initialize models (fixed seeds so the comparison is repeatable)
log_reg = LogisticRegression(max_iter=1000, random_state=42)
tree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(random_state=42)

# Run evaluations and keep the returned scores instead of discarding them,
# so the cell ends with one comparison table rather than a stray tuple
baseline_scores = {
    baseline.__class__.__name__: dict(zip(
        ('AUC-ROC', 'Accuracy'),
        evaluate_model(baseline, X_train, X_test, y_train, y_test),
    ))
    for baseline in (log_reg, tree, forest)
}

display(
    pd.DataFrame.from_dict(baseline_scores, orient='index')
      .sort_values('AUC-ROC', ascending=False)
)


## Baseline Model Evaluation

To establish a performance benchmark, three classification models were trained and evaluated using the cleaned and preprocessed dataset:

- Logistic Regression
- Decision Tree Classifier
- Random Forest Classifier

The evaluation was based on two metrics:
- **Primary Metric:** AUC-ROC
- **Secondary Metric:** Accuracy

### Model Performance Summary

| Model                  | AUC-ROC | Accuracy |
|------------------------|---------|----------|
| Logistic Regression    | 0.8235  | 0.7871   |
| Decision Tree          | 0.6569  | 0.7346   |
| Random Forest          | 0.8218  | 0.7850   |

### Interpretation

- **Logistic Regression** and **Random Forest** performed best with AUC-ROC values above 0.82.
- The **Decision Tree** model underperformed, likely due to overfitting or lack of regularization.
- Based on the AUC-ROC thresholds provided in the scoring rubric, both Logistic Regression and Random Forest currently achieve **4.5 story points**.

The next step will involve tuning and experimenting with more advanced models (e.g., LightGBM, CatBoost) to aim for an AUC-ROC ≥ 0.85 for a higher score.


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5],
)

# Exhaustive search over the grid: 3-fold cross-validation scored by ROC AUC,
# using all available cores
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Estimator with the best cross-validated score
best_rf = grid_rf.best_estimator_

# Score the selected model on the held-out test split
evaluate_model(best_rf, X_train, X_test, y_train, y_test)


## Random Forest Model Tuning

After establishing baseline performance, the Random Forest model was further optimized using `GridSearchCV` to identify the best combination of hyperparameters. The following parameters were tested:

- `n_estimators`: [100, 200]
- `max_depth`: [5, 10, 15]
- `min_samples_split`: [2, 5]

The model was evaluated using 3-fold cross-validation with AUC-ROC as the scoring metric.

### Tuned Random Forest Results

| Metric     | Score   |
|------------|---------|
| AUC-ROC    | 0.8381  |
| Accuracy   | 78.9%   |

The optimized Random Forest model demonstrated improved AUC-ROC performance compared to the baseline. According to the project scoring rubric, this places the model in the **4.5 story point tier**, just below the threshold required for 5 points (AUC-ROC ≥ 0.85).

---

## Next Step: Gradient Boosting Experiments

To further improve model performance, gradient boosting algorithms will be evaluated:

- **CatBoost**: An efficient boosting algorithm designed for categorical data and minimal preprocessing.
- **LightGBM**: A fast, high-performance gradient boosting framework optimized for speed and memory usage.

Both models will be trained using default parameters as a starting point, and their performance will be evaluated using the same metrics (AUC-ROC and accuracy).


In [None]:
!pip install catboost


In [3]:
from catboost import CatBoostClassifier

# evaluate_model() fits the model itself, so no separate fit call is needed
# here (fitting first would train the model twice for the same result)
cat = CatBoostClassifier(verbose=0, random_state=42)

evaluate_model(cat, X_train, X_test, y_train, y_test)


ModuleNotFoundError: No module named 'catboost'