<a href="https://colab.research.google.com/github/setigogoli/ML-project/blob/main/customer_churn_prediction%20(5).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**<center> <span style="color:#0F52BA;font-family:serif; font-size:34px;">
ML Project\
Setayesh Heydari 40104073\
Amir Hossein Shahrabi 401104208\
Amir Abbas Donyadideh 401104113
</span> </center>**

# Loading libraries and Dataset

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset that should be loaded
file_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load the latest version of that file through the pandas adapter.
# The result is bound to `df`, which every later cell works on.
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "blastchar/telco-customer-churn",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Plain-text preview of the first five rows
print("First 5 records:", df.head())

# **Phase 1**

# Understanding the Data(Data Shape)

Each row represents a customer, each column contains customer’s attributes described on the column Metadata.

In [None]:
df.head()

**The data set includes information about:**
* **Customers who left within the last month** – the column is called Churn

* **Services that each customer has signed up for** – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies

* **Customer account information** - how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges

* **Demographic info about customers** – gender, age range, and if they have partners and dependents

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.columns.values

In [None]:
df.dtypes


* The target that we will use to guide the exploration is **Churn**

# Find duplicate rows



In [None]:
# ===== Duplicate-row check =====

# Boolean mask: True for every row that exactly repeats an earlier row.
duplicate_rows = df.duplicated(keep='first')

print("Number of duplicate rows:", duplicate_rows.sum())

# Display the flagged rows (an empty frame when nothing is duplicated).
df.loc[duplicate_rows]


In [None]:
# Keep the first occurrence of every row and discard later repeats.
df = df.drop_duplicates(keep='first')

print("Shape after removing duplicate rows:", df.shape)


# Find & remove duplicate variables (columns)


In [None]:
# ===== Detect and drop duplicated variables (columns) =====

# Step 1 - repeated column *labels*: every occurrence after the first is flagged.
dup_name_mask = df.columns.duplicated()
dup_name_cols = list(df.columns[dup_name_mask])

if not dup_name_cols:
    print("No duplicate column names found.")
else:
    print("Duplicate column names found:", dup_name_cols)
    # Only the first column carrying each label survives.
    df = df.loc[:, ~dup_name_mask]

# Step 2 - repeated column *contents*: transposing turns columns into rows,
# so the row-wise duplicate check compares whole columns with each other.
dup_content_mask = df.T.duplicated()
dup_content_cols = list(df.columns[dup_content_mask])

if not dup_content_cols:
    print("No duplicate columns by content found.")
else:
    print("Duplicate columns by content found:", dup_content_cols)
    df = df.drop(columns=dup_content_cols)

print("Shape after removing duplicates:", df.shape)


# Histograms with seaborn (`histplot`)

In [None]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Size the grid from the number of numeric columns instead of a fixed 2x2 layout,
# which raises as soon as a fifth numeric column exists.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(numeric_cols) // n_grid_cols))  # ceiling division

fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(15, 5 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, numeric_cols):
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')

# Hide the unused panels left over when the column count does not fill the grid.
for ax in flat_axes[len(numeric_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


# Histograms with pandas (`DataFrame.hist`)

In [None]:
# Integer and float columns are picked by dtype rather than listed by hand.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# One histogram panel per numeric column, drawn by pandas.
numeric_frame = df[numeric_cols]
numeric_frame.hist(bins=30, figsize=(12, 8))

plt.tight_layout()
plt.show()


# Histograms with Plotly


In [None]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# One interactive histogram per numeric column.
for col in numeric_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=30,
        title='Distribution of {}'.format(col),
    )
    fig.show()


# bar plot


In [None]:
# Select the text (object-dtype) columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Exclude columns that are object-typed but not categorical:
#   - customerID is an identifier;
#   - TotalCharges holds numbers stored as text at this stage (it is converted
#     with pd.to_numeric further down), so a count plot of it would draw one
#     bar per distinct amount.
# errors='ignore' keeps the cell re-runnable after either column is removed.
categorical_cols = categorical_cols.drop(['customerID', 'TotalCharges'], errors='ignore')

categorical_cols


In [None]:
plt.figure(figsize=(18, 25))

# Two panels per row; the row count is derived from the number of columns.
grid_rows = len(categorical_cols)//2 + 1

for position, col in enumerate(categorical_cols):
    plt.subplot(grid_rows, 2, position + 1)
    sns.countplot(data=df, x=col)
    plt.title('Distribution of {}'.format(col))
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


# Bar plot other way

In [None]:
plt.figure(figsize=(18, 25))

# Same two-column grid as the count plots, but the bars come from
# pandas' value_counts(), so categories are ordered by frequency.
grid_rows = len(categorical_cols)//2 + 1

for position, col in enumerate(categorical_cols):
    plt.subplot(grid_rows, 2, position + 1)

    category_counts = df[col].value_counts()
    category_counts.plot(kind='bar')

    plt.title('Distribution of {}'.format(col))
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# One interactive bar chart per categorical column.
for col in categorical_cols:
    fig = px.histogram(
        df,
        x=col,
        title='Distribution of {}'.format(col),
    )
    fig.show()


<a id = "7" ></a>
# <span style="font-family:serif; font-size:28px;"> 4. Visualize missing values </span>
<a id = "missingvalue" ></a>

In [None]:
# Visualize missing values as a matrix
msno.matrix(df);

> Using this matrix we can very quickly find the pattern of missingness in the dataset.
* From the above visualisation we can observe that it has no peculiar pattern that stands out. In fact there is no missing data.

***

# Data Manipulation(finding missing values)

In [None]:
df = df.drop(['customerID'], axis = 1)
df.head()

* On closer analysis, we can find some indirect missingness in our data (in the form of blank strings). Let's look at it!

In [None]:
# Values that cannot be parsed as numbers become NaN (errors='coerce').
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Missing-value count per column after the conversion.
df.isna().sum()

* Here we see that the TotalCharges has 11 missing values. Let's check this data.

In [None]:
df[np.isnan(df['TotalCharges'])]

* It can also be noted that the Tenure column is 0 for these entries even though the MonthlyCharges column is not empty.

Let's see if there are any other 0 values in the tenure column.

In [None]:
df[df['tenure'] == 0].index

* There are no additional missing values in the Tenure column.

Let's delete the rows where `tenure` is 0 (the same rows whose TotalCharges is missing), since there are only 11 of them and removing them will not noticeably affect the data.

In [None]:
# Remove, in place, every row whose tenure is 0 ...
df.drop(labels=df[df['tenure'] == 0].index, axis=0, inplace=True)
# ... then display the index of any remaining tenure == 0 rows as a check.
df[df['tenure'] == 0].index

> To solve the problem of missing values in TotalCharges column, I decided to fill it with the mean of TotalCharges values.

In [None]:
# fillna() returns a new object, so the result has to be assigned back for the
# imputation to take effect. Restricting it to the TotalCharges column also
# avoids writing the TotalCharges mean into NaNs of unrelated columns.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].mean())

# Number of missing TotalCharges values left after the fill
df["TotalCharges"].isnull().sum()

In [None]:
df.isnull().sum()

In [None]:
# Turn the 0/1 flag into "No"/"Yes" labels.
# replace() leaves values that are not keys of the mapping untouched, so
# re-running this cell keeps the labels; map() would turn them into NaN.
df["SeniorCitizen"] = df["SeniorCitizen"].replace({0: "No", 1: "Yes"})
df.head()

In [None]:
df["InternetService"].describe(include=['object', 'bool'])

In [None]:
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols].describe()

In [None]:
print(df.isnull().sum())

# Distribution analysis

In [None]:
# Take the slice labels from the same value_counts() result that supplies the
# slice sizes. value_counts() orders by frequency, so hard-coded labels are only
# correct while that order happens to match.
gender_counts = df['gender'].value_counts()
churn_counts = df['Churn'].value_counts()
g_labels = gender_counts.index.tolist()
c_labels = churn_counts.index.tolist()

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=g_labels, values=gender_counts.values, name="Gender"),
              1, 1)
fig.add_trace(go.Pie(labels=c_labels, values=churn_counts.values, name="Churn"),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)

fig.update_layout(
    title_text="Gender and Churn Distributions",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),
                 dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])
fig.show()

* 26.6 % of customers switched to another firm.
* Customers are 49.5 % female and 50.5 % male.

In [None]:
df["Churn"][df["Churn"]=="No"].groupby(by=df["gender"]).count()

In [None]:
df["Churn"][df["Churn"]=="Yes"].groupby(by=df["gender"]).count()

In [None]:
plt.figure(figsize=(6, 6))

# Count customers per (Churn, gender) pair from the data instead of pasting the
# numbers in by hand, so the chart stays correct when the cleaning steps change.
# reindex() fixes the order as rows Yes/No and columns Female/Male, and inserts
# 0 for any combination that does not occur.
churn_by_gender = (
    pd.crosstab(df["Churn"], df["gender"])
    .reindex(index=["Yes", "No"], columns=["Female", "Male"], fill_value=0)
)

labels =["Churn: Yes","Churn:No"]
values = churn_by_gender.sum(axis=1).tolist()            # [churned, retained]
labels_gender = ["F","M","F","M"]
sizes_gender = churn_by_gender.to_numpy().ravel().tolist()  # [Yes-F, Yes-M, No-F, No-M]
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0','#ffb3e6', '#c2c2f0','#ffb3e6']
explode = (0.3,0.3)
explode_gender = (0.1,0.1,0.1,0.1)
textprops = {"fontsize":15}
# Outer ring: churn split; inner ring: gender split within each churn group
plt.pie(values, labels=labels,autopct='%1.1f%%',pctdistance=1.08, labeldistance=0.8,colors=colors, startangle=90,frame=True, explode=explode,radius=10, textprops =textprops, counterclock = True, )
plt.pie(sizes_gender,labels=labels_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=7, textprops =textprops, counterclock = True, )
# White circle in the middle turns the pies into a donut
centre_circle = plt.Circle((0,0),5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Gender: Male(M), Female(F)', fontsize=15, y=1.1)

# show plot

plt.axis('equal')
plt.tight_layout()
plt.show()

* There is a negligible difference in the percentage/count of customers who changed their service provider. Both genders behaved in a similar fashion when it comes to migrating to another service provider/firm.

In [None]:
# Churn counts grouped by contract type.
# The title's bold tag is now closed with </b>; the original repeated <b>.
fig = px.histogram(df, x="Churn", color="Contract", barmode="group", title="<b>Customer contract distribution</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* About 75% of customers with a Month-to-Month Contract opted to move out, compared to 13% of customers with a One Year Contract and 3% with a Two Year Contract.

In [None]:
# Labels and sizes must come from the same ordering: unique() returns values in
# order of first appearance while value_counts() sorts by frequency, so pairing
# the two can attach a count to the wrong payment method.
payment_counts = df['PaymentMethod'].value_counts()
labels = payment_counts.index
values = payment_counts.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

In [None]:
# Churn counts, coloured by payment method.
fig = px.histogram(
    df,
    x="Churn",
    color="PaymentMethod",
    title="<b>Customer Payment Method distribution w.r.t. Churn</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Major customers who moved out were having Electronic Check as Payment Method.
* Customers who opted for Credit-Card automatic transfer or Bank Automatic Transfer and Mailed Check as Payment Method were less likely to move out.  

In [None]:
df["InternetService"].unique()

In [None]:
df[df["gender"]=="Male"][["InternetService", "Churn"]].value_counts()

In [None]:
df[df["gender"]=="Female"][["InternetService", "Churn"]].value_counts()

In [None]:
# Count customers per (InternetService, Churn, gender) from the data rather
# than hard-coding the numbers, so the chart follows the current `df`.
service_counts = df.groupby(["InternetService", "Churn", "gender"]).size().to_dict()

# Two-level x axis: churn status on the outer level, gender on the inner level.
x_groups = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
            ["Female", "Male", "Female", "Male"]]
# (Churn value, gender value) of each bar, in the same order as x_groups.
bar_keys = [("No", "Female"), ("No", "Male"), ("Yes", "Female"), ("Yes", "Male")]

fig = go.Figure()

# (value stored in the InternetService column, legend name of the trace)
for service_value, trace_name in [("DSL", "DSL"),
                                  ("Fiber optic", "Fiber optic"),
                                  ("No", "No Internet")]:
    fig.add_trace(go.Bar(
      x = x_groups,
      # a combination that does not occur in the data is plotted as 0
      y = [int(service_counts.get((service_value, churn, gender), 0))
           for churn, gender in bar_keys],
      name = trace_name,
    ))

fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

fig.show()

* A lot of customers choose the Fiber optic service and it's also evident that the customers who use Fiber optic have high churn rate, this might suggest a dissatisfaction with this type of internet service.
* Customers with DSL service are fewer in number than Fiber optic customers and have a lower churn rate than those with Fiber optic service.

In [None]:
# Fixed colours for the two Dependents categories.
color_map = dict([("Yes", "#FF97FF"), ("No", "#AB63FA")])
fig = px.histogram(
    df,
    x="Churn",
    color="Dependents",
    barmode="group",
    title="<b>Dependents distribution</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Customers without dependents are more likely to churn

In [None]:
# Churn counts grouped by whether the customer has a partner.
# The chart title spelling is corrected ("Chrun" -> "Churn").
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="Partner", barmode="group", title="<b>Churn distribution w.r.t. Partners</b>", color_discrete_map=color_map)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Customers who don't have partners are more likely to churn

In [None]:
# Churn counts coloured by the SeniorCitizen label.
# The chart title spelling is corrected ("Chrun" -> "Churn").
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="SeniorCitizen", title="<b>Churn distribution w.r.t. Senior Citizen</b>", color_discrete_map=color_map)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* It can be observed that the fraction of senior citizens is very small.
* Most of the senior citizens churn.

In [None]:
# Explicit colours are given for "Yes" and "No" only.
color_map = dict([("Yes", "#FF97FF"), ("No", "#AB63FA")])
fig = px.histogram(
    df,
    x="Churn",
    color="OnlineSecurity",
    barmode="group",
    title="<b>Churn w.r.t Online Security</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Most customers churn in the absence of online security.

In [None]:
# Churn counts coloured by paperless-billing status.
# The chart title spelling is corrected ("Chrun" -> "Churn").
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="PaperlessBilling",  title="<b>Churn distribution w.r.t. Paperless Billing</b>", color_discrete_map=color_map)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Customers with Paperless Billing are most likely to churn.

In [None]:
# Churn counts grouped by TechSupport value.
# The chart title spelling is corrected ("Chrun" -> "Churn").
fig = px.histogram(df, x="Churn", color="TechSupport",barmode="group",  title="<b>Churn distribution w.r.t. TechSupport</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Customers with no TechSupport are most likely to migrate to another service provider.

In [None]:
# Churn counts coloured by PhoneService value.
# The chart title spelling is corrected ("Chrun" -> "Churn").
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="PhoneService", title="<b>Churn distribution w.r.t. Phone Service</b>", color_discrete_map=color_map)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

* Very small fraction of customers don't have a phone service and out of that, 1/3rd Customers are more likely to churn.

In [None]:
sns.set_context("paper",font_scale=1.1)

# Density of MonthlyCharges for retained vs churned customers on one axes.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 'No') ],
                color="Red", fill=True);
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 'Yes') ],
                ax =ax, color="Blue", fill=True);
ax.legend(["Not Churn","Churn"],loc='upper right');
ax.set_ylabel('Density');
ax.set_xlabel('Monthly Charges');
ax.set_title('Distribution of monthly charges by churn');


* Customers with higher Monthly Charges are also more likely to churn

In [None]:
# Density of TotalCharges for retained vs churned customers on one axes.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot, and the
# legend entry is corrected from "Not Chu0rn" to "Not Churn".
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'No') ],
                color="Gold", fill=True);
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'Yes') ],
                ax =ax, color="Green", fill=True);
ax.legend(["Not Churn","Churn"],loc='upper right');
ax.set_ylabel('Density');
ax.set_xlabel('Total Charges');
ax.set_title('Distribution of total charges by churn');

# Heat Map

In [None]:
plt.figure(figsize=(25, 10))

# Integer-encode only the non-numeric columns. Numeric columns keep their real
# values: pd.factorize numbers values by order of first appearance, which would
# destroy the ordering that a correlation coefficient relies on.
encoded = df.copy()
for col in encoded.columns:
    if not pd.api.types.is_numeric_dtype(encoded[col]):
        encoded[col] = pd.factorize(encoded[col])[0]

corr = encoded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

ax = sns.heatmap(corr, mask=mask, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, linewidths=.2, cmap='coolwarm', vmin=-1, vmax=1)

___

# Box Plot


Tenure vs Churn


In [None]:
# Tenure distribution, one box per churn class.
fig = px.box(df, y='tenure', x='Churn', color='Churn')

fig.update_xaxes(title_text='Churn')
fig.update_yaxes(title_text='Tenure (Months)')

# Keyword order is kept as in the original call.
fig.update_layout(autosize=True, width=750, height=600,
                  title_font=dict(size=25, family='Courier'),
                  title='<b>Tenure vs Churn</b>')

fig.show()


MonthlyCharges vs Churn

In [None]:
# Monthly charges distribution, one box per churn class.
fig = px.box(df, y='MonthlyCharges', x='Churn', color='Churn')

fig.update_xaxes(title_text='Churn')
fig.update_yaxes(title_text='Monthly Charges')

# Keyword order is kept as in the original call.
fig.update_layout(autosize=True, width=750, height=600,
                  title_font=dict(size=25, family='Courier'),
                  title='<b>Monthly Charges vs Churn</b>')

fig.show()


TotalCharges vs Churn

In [None]:
# TotalCharges has to be numeric for the box plot; unparseable entries become
# NaN and the affected rows are removed.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df[df["TotalCharges"].notna()]

# Total charges distribution, one box per churn class.
fig = px.box(df, y='TotalCharges', x='Churn', color='Churn')

fig.update_xaxes(title_text='Churn')
fig.update_yaxes(title_text='Total Charges')

# Keyword order is kept as in the original call.
fig.update_layout(autosize=True, width=750, height=600,
                  title_font=dict(size=25, family='Courier'),
                  title='<b>Total Charges vs Churn</b>')

fig.show()



all in one code

In [None]:
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Same box plot for each numeric column, split by churn class.
for col in numeric_cols:
    plot_title = '<b>{} vs Churn</b>'.format(col)
    fig = px.box(df, y=col, x='Churn', color='Churn', title=plot_title)

    fig.update_layout(height=600, width=750)
    fig.show()


* New customers are more likely to churn

# **Phase 2**

# Implement Imputation


In [None]:

# TotalCharges as numbers; anything unparseable (e.g. a blank string) turns into NaN
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Column groups by dtype: numbers on one side, text/category/bool on the other
cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns
num_cols = df.select_dtypes(include=["int64", "float64"]).columns


In [None]:
# Numeric columns: gaps are filled with the column median
for numeric_name in num_cols:
    column_median = df[numeric_name].median()
    df[numeric_name] = df[numeric_name].fillna(column_median)

# Categorical columns: gaps are filled with the first mode (most frequent value)
for categorical_name in cat_cols:
    most_frequent = df[categorical_name].mode()[0]
    df[categorical_name] = df[categorical_name].fillna(most_frequent)

# Report only the columns that still contain missing values
remaining_missing = df.isnull().sum()
print("Missing values after imputation:\n", remaining_missing[remaining_missing > 0])


another way

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# TotalCharges as numbers (unparseable entries become NaN)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Target and feature matrix
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Feature columns grouped by dtype
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Numeric branch: median imputation only
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# 80/20 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# The preprocessor is fitted on the training part only and then applied to both parts
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

print("Done. Shapes:", X_train_prepared.shape, X_test_prepared.shape)


# Label Encoding

In [None]:
# Select categorical columns
cat_cols = df.select_dtypes(include=['object']).columns

# Find binary columns (exactly 2 unique values)
binary_cols = [col for col in cat_cols if df[col].nunique() == 2]

print("Binary columns:")
print(binary_cols)


In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder object is refitted for every binary column in turn.
le = LabelEncoder()

for binary_name in binary_cols:
    encoded_values = le.fit_transform(df[binary_name])
    df[binary_name] = encoded_values

df.head()


In [None]:
df[binary_cols].head()

# One Hot Encoding

In [None]:
# Select categorical columns
cat_cols = df.select_dtypes(include=['object']).columns

# Select columns with more than 2 unique values
multi_cols = [col for col in cat_cols if df[col].nunique() > 2]

print("Multi-category columns:")
print(multi_cols)


In [None]:
# One-hot encode the multi-category columns; drop_first=True omits the first
# level of each column.
df = pd.get_dummies(df, columns=multi_cols, drop_first=True)

print("New shape after One-Hot Encoding:", df.shape)

# Only the last expression of a cell is displayed, so the preview goes at the
# end (a df.head() in the middle of the cell shows nothing).
df.head()


another way

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# 1) Object-dtype columns of the CURRENT df that have more than two distinct values
multi_cols = [
    name for name in df.columns
    if df[name].dtype == "object" and df[name].nunique() > 2
]

print("Multi-category columns to one-hot encode:", multi_cols)

if multi_cols:
    # 2) Dense one-hot encoding with the first level of every column dropped
    ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
    encoded = ohe.fit_transform(df[multi_cols])

    # Wrap the array in a frame that shares df's index so the concat lines up
    encoded_df = pd.DataFrame(
        encoded,
        index=df.index,
        columns=ohe.get_feature_names_out(multi_cols),
    )

    # 3) Swap the original columns for their encoded counterparts
    df = pd.concat([df.drop(columns=multi_cols), encoded_df], axis=1)

    print("Done. New shape:", df.shape)
else:
    # Nothing to encode: leave df untouched
    print("No multi-category object columns left to encode.")


In [None]:
# Object-dtype columns that still have two or more distinct values.
# NOTE(review): `multi_cols` is not read again in the cells that follow here,
# so this assignment looks like leftover code - confirm before relying on it.
multi_cols = [c for c in df.columns if df[c].dtype == "object" and df[c].nunique() >= 2]


## train test split

In [None]:
def object_to_int(dataframe_series):
    """Label-encode a column of dtype object and pass any other column through.

    Parameters
    ----------
    dataframe_series : a pandas Series (one DataFrame column).

    Returns
    -------
    The output of ``LabelEncoder().fit_transform`` when the dtype is
    ``object``; otherwise the input itself, unchanged.
    """
    if dataframe_series.dtype != 'object':
        return dataframe_series
    return LabelEncoder().fit_transform(dataframe_series)

In [None]:
df = df.apply(lambda x: object_to_int(x))
df.head()

In [None]:
# Correlation of every column with Churn, strongest positive first.
churn_corr = df.corr()['Churn'].sort_values(ascending = False)

# The original cell opened a figure but never drew on it, leaving an empty
# canvas in the output. Draw the correlations as a bar chart instead
# (Churn's correlation with itself is left out).
fig, ax = plt.subplots(figsize=(14,7))
churn_corr.drop('Churn').plot(kind='bar', ax=ax)
ax.set_title('Correlation of each feature with Churn')
ax.set_ylabel('Correlation with Churn')
plt.tight_layout()
plt.show()

churn_corr

In [None]:
X = df.drop(columns = ['Churn'])
y = df['Churn'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 40, stratify=y)

In [None]:
def distplot(feature, frame, color='r'):
    """Plot the distribution of one column as a density histogram with a KDE curve.

    Parameters
    ----------
    feature : column name in ``frame``; also used in the plot title.
    frame   : DataFrame holding the column.
    color   : matplotlib colour for the bars and the curve (default ``'r'``).

    ``sns.distplot`` is deprecated in seaborn; ``sns.histplot`` with
    ``kde=True`` and ``stat="density"`` is the documented replacement for a
    histogram-plus-density plot.
    """
    plt.figure(figsize=(8,3))
    plt.title("Distribution for {}".format(feature))
    ax = sns.histplot(frame[feature], color=color, kde=True, stat="density")

In [None]:
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols: distplot(feat, df)

Since the numerical features are distributed over different value ranges, I will use `StandardScaler` to bring them to a common scale (zero mean and unit variance).

# Standard scaler

In [None]:
# Standardise the numeric columns for plotting only (df itself is not changed).
# index=df.index keeps the row labels of df; without it the new frame gets a
# fresh 0..n-1 index that no longer matches df after rows were dropped.
df_std = pd.DataFrame(StandardScaler().fit_transform(df[num_cols].astype('float64')),
                       columns=num_cols,
                       index=df.index)

# Iterate over the same list used to build df_std (num_cols), not the separate
# `numerical_cols` list defined in an earlier cell.
for feat in num_cols: distplot(feat, df_std, color='c')

In [None]:
# Divide the columns into 3 categories: one for standardisation, one for label encoding and one for one-hot encoding
# NOTE(review): neither list is read again in the cells that follow here, and
# the three columns named below were already expanded by pd.get_dummies in an
# earlier cell, so they are presumably no longer in X_train - verify.

cat_cols_ohe =['PaymentMethod', 'Contract', 'InternetService'] # those that need one-hot encoding
cat_cols_le = list(set(X_train.columns)- set(num_cols) - set(cat_cols_ohe)) #those that need label encoding

In [None]:
# Standardise the numeric columns of the split.
scaler= StandardScaler()

# Mean and standard deviation are learned from the training rows only ...
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
# ... and the same fitted scaler is applied to the test rows.
X_test[num_cols] = scaler.transform(X_test[num_cols])

# **Phase 3**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

# Cast every boolean column to int (the chi2 scoring used later needs numeric input)
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({bool_name: int for bool_name in bool_cols})

## Feature Engineering — Creating 2 new meaningful features


In [None]:
# ── Feature 1: Tenure Group (binning of tenure) ─────────────
# Tenure in months is mapped to one of four loyalty segments
def tenure_group(tenure):
    """Return the loyalty-segment code for a tenure given in months.

    0 = New Customer (up to 12 months), 1 = Developing (13-24),
    2 = Established (25-48), 3 = Loyal (more than 48 months).
    A value that is not <= any bound falls into segment 3.
    """
    upper_bounds = (12, 24, 48)  # inclusive upper limit of segments 0, 1 and 2
    for segment, bound in enumerate(upper_bounds):
        if tenure <= bound:
            return segment
    return len(upper_bounds)

# Apply the binning function to every row's tenure
df['Tenure_Group'] = df['tenure'].apply(tenure_group)

# Number of customers per tenure segment, in segment order
print("Tenure Group distribution:")
print(df['Tenure_Group'].value_counts().sort_index())
print()

# ── Feature 2: Charges per Month Ratio ──────────────────────
# TotalCharges divided by (tenure + 1): an approximate average spend per month.
# NOTE(review): because of the +1 this is not exactly TotalCharges / tenure;
# the gap is largest for short tenures (tenure = 1 divides by 2).
df['Avg_Monthly_Spend'] = df['TotalCharges'] / (df['tenure'] + 1)
# (+1 avoids division by zero for tenure=0; an earlier cell already drops
#  the tenure == 0 rows, so the guard may no longer be needed - verify)

# ── Feature 3: Service Count ────────────────────────────────
# Number of add-on services per customer: the row-wise sum of the "_Yes"
# dummy columns listed below.
# Hypothesis: more services → higher switching cost → lower churn probability
service_cols = [
    'MultipleLines_Yes',
    'OnlineSecurity_Yes',
    'OnlineBackup_Yes',
    'DeviceProtection_Yes',
    'TechSupport_Yes',
    'StreamingTV_Yes',
    'StreamingMovies_Yes'
]
df['Service_Count'] = df[service_cols].sum(axis=1)

# Preview the new columns next to the columns they were derived from
print("New features added:")
print(df[['tenure', 'Tenure_Group', 'TotalCharges',
          'Avg_Monthly_Spend', 'Service_Count']].head(10))
print()
# Correlation of each engineered feature with the target
print("Correlation of new features with Churn:")
print(df[['Tenure_Group', 'Avg_Monthly_Spend',
          'Service_Count', 'Churn']].corr()['Churn'])


## Filter-Based Selection (Chi-Squared + ANOVA)

In [None]:
# Separate features and target
X = df.drop(columns=['Churn'])
y = df['Churn']

# ── Scale numerical features to [0,1] for Chi2 ──────────────
# Chi2 requires non-negative values
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns
)


## Chi-Squared Test (for categorical/binary features)

In [None]:
# k='all' keeps every feature: the selector is used only to obtain scores and p-values
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X_scaled, y)

# One row per feature, highest chi-squared score first
chi2_table = pd.DataFrame({
    'Feature': X.columns,
    'Chi2_Score': chi2_selector.scores_,
    'P_Value': chi2_selector.pvalues_
})
chi2_scores = chi2_table.sort_values('Chi2_Score', ascending=False).reset_index(drop=True)

separator = "=" * 55
print(separator)
print("Chi-Squared Scores (Top 15):")
print(separator)
print(chi2_scores.head(15).to_string(index=False))
print()

# Horizontal bar chart of the 15 highest-scoring features
plt.figure(figsize=(12, 7))
sns.barplot(
    data=chi2_scores.head(15),
    y='Feature',
    x='Chi2_Score',
    palette='Blues_r'
)
plt.title('Top 15 Features — Chi-Squared Test', fontsize=14, fontweight='bold')
plt.ylabel('Feature')
plt.xlabel('Chi² Score')
plt.tight_layout()
plt.show()

## ANOVA F-Test (better for continuous numerical features)

In [None]:
# k='all' keeps every feature: the selector is used only to obtain F-scores and p-values
anova_selector = SelectKBest(score_func=f_classif, k='all')
anova_selector.fit(X_scaled, y)

# One row per feature, highest F-score first
anova_table = pd.DataFrame({
    'Feature': X.columns,
    'ANOVA_F_Score': anova_selector.scores_,
    'P_Value': anova_selector.pvalues_
})
anova_scores = anova_table.sort_values('ANOVA_F_Score', ascending=False).reset_index(drop=True)

separator = "=" * 55
print(separator)
print("ANOVA F-Scores (Top 15):")
print(separator)
print(anova_scores.head(15).to_string(index=False))
print()

# Horizontal bar chart of the 15 highest-scoring features
plt.figure(figsize=(12, 7))
sns.barplot(
    data=anova_scores.head(15),
    y='Feature',
    x='ANOVA_F_Score',
    palette='Greens_r'
)
plt.title('Top 15 Features — ANOVA F-Test', fontsize=14, fontweight='bold')
plt.ylabel('Feature')
plt.xlabel('ANOVA F-Score')
plt.tight_layout()
plt.show()

## Combined Filter Score

In [None]:
# Put both filter scores side by side, one row per feature
chi2_part = chi2_scores[['Feature', 'Chi2_Score']]
anova_part = anova_scores[['Feature', 'ANOVA_F_Score']]
filter_combined = chi2_part.merge(anova_part, on='Feature')

# Divide each score by its column maximum, then average the two scaled scores
chi2_max = filter_combined['Chi2_Score'].max()
anova_max = filter_combined['ANOVA_F_Score'].max()
filter_combined['Chi2_norm'] = filter_combined['Chi2_Score'] / chi2_max
filter_combined['ANOVA_norm'] = filter_combined['ANOVA_F_Score'] / anova_max
filter_combined['Combined_Score'] = (
    filter_combined['Chi2_norm'] + filter_combined['ANOVA_norm']
) / 2

# Highest combined score first
filter_combined = (
    filter_combined
    .sort_values('Combined_Score', ascending=False)
    .reset_index(drop=True)
)

top15_filter = filter_combined['Feature'].head(15).tolist()
print("Top 15 Features by Combined Filter Score:")
print(top15_filter)

## Lasso Regression (L1 — drives irrelevant features to 0)

In [None]:
# Re-define X and y after all feature engineering and encoding
X = df.drop(columns = ['Churn'])
y = df['Churn'].values

# Re-perform train-test split (70/30, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 40, stratify=y)

# Re-identify numerical columns (they might have changed with new features)
# NOTE(review): this selects every int64/float64 column, which presumably also
# covers the 0/1 dummy columns after the earlier bool -> int cast - confirm
# that standardising those is intended.
num_cols_updated = X_train.select_dtypes(include=['int64', 'float64']).columns

# Re-apply StandardScaler to the numerical columns of the new X_train and X_test
# (fitted on the training rows only, then applied to the test rows)
scaler= StandardScaler()
X_train[num_cols_updated] = scaler.fit_transform(X_train[num_cols_updated])
X_test[num_cols_updated] = scaler.transform(X_test[num_cols_updated])


# Lasso with the penalty strength chosen by 5-fold cross-validation.
# NOTE(review): LassoCV is a regression estimator; here it is fitted on the
# Churn target, so its coefficients serve only as a feature-relevance signal.
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train, y_train)

# Absolute coefficient per feature, largest first
lasso_importance = pd.DataFrame({
    'Feature': X.columns,
    'Lasso_Coefficient': np.abs(lasso.coef_)
}).sort_values('Lasso_Coefficient', ascending=False).reset_index(drop=True)

print("=" * 55)
print(f"Lasso best alpha: {lasso.alpha_:.6f}")
print("=" * 55)
print("Lasso — Non-zero features (selected):")
# Features whose coefficient was shrunk to exactly zero are treated as dropped
lasso_selected = lasso_importance[lasso_importance['Lasso_Coefficient'] > 0]
print(lasso_selected.to_string(index=False))
print()

# Plot Lasso (at most the 15 largest non-zero coefficients)
plt.figure(figsize=(12, 7))
sns.barplot(
    data=lasso_selected.head(15),
    x='Lasso_Coefficient',
    y='Feature',
    palette='Oranges_r'
)
plt.title('Feature Importance — Lasso (L1) Regression', fontsize=14, fontweight='bold')
plt.xlabel('|Coefficient|')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## Random Forest Feature Importance

In [None]:
# 200-tree forest with balanced class weights, trained on all available cores
rf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    n_estimators=200
)
rf.fit(X_train, y_train)

# Importance per feature as reported by the fitted forest, largest first
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'RF_Importance': rf.feature_importances_
})
rf_importance = importance_table.sort_values('RF_Importance', ascending=False).reset_index(drop=True)

separator = "=" * 55
print(separator)
print("Random Forest — Top 15 Feature Importances:")
print(separator)
print(rf_importance.head(15).to_string(index=False))
print()

# Horizontal bar chart of the 15 most important features
plt.figure(figsize=(12, 7))
sns.barplot(
    data=rf_importance.head(15),
    y='Feature',
    x='RF_Importance',
    palette='Purples_r'
)
plt.title('Feature Importance — Random Forest', fontsize=14, fontweight='bold')
plt.ylabel('Feature')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

## Final Feature Subset — Combining all methods

In [None]:
# Rank features across the three rankings (combined filter score, Lasso, Random Forest)
all_features = X.columns.tolist()

# Rank by each method (lower rank = more important); ranks start at 0
# NOTE(review): filter_rank covers only the 15 features in top15_filter, while
# the Lasso and RF rankings cover every feature - see the fallback rank below.
filter_rank = {f: i for i, f in enumerate(top15_filter)}
lasso_rank  = {f: i for i, f in
               enumerate(lasso_importance['Feature'].tolist())}
rf_rank     = {f: i for i, f in
               enumerate(rf_importance['Feature'].tolist())}

# A feature missing from a ranking gets the worst possible rank, len(all_features).
# For the filter ranking this applies to every feature outside its top 15.
ranking_df = pd.DataFrame({'Feature': all_features})
ranking_df['Filter_Rank'] = ranking_df['Feature'].map(
    lambda f: filter_rank.get(f, len(all_features))
)
ranking_df['Lasso_Rank']  = ranking_df['Feature'].map(
    lambda f: lasso_rank.get(f, len(all_features))
)
ranking_df['RF_Rank']     = ranking_df['Feature'].map(
    lambda f: rf_rank.get(f, len(all_features))
)
# Unweighted mean of the three ranks
ranking_df['Avg_Rank']    = ranking_df[
    ['Filter_Rank', 'Lasso_Rank', 'RF_Rank']
].mean(axis=1)

# Best (lowest) average rank first
ranking_df = ranking_df.sort_values('Avg_Rank').reset_index(drop=True)

print("=" * 60)
print("Final Feature Ranking (All Methods Combined):")
print("=" * 60)
print(ranking_df.head(15).to_string(index=False))

# ── Select Top 12 Final Features ─────────────────────────────
final_features = ranking_df.head(12)['Feature'].tolist()
print("\n✅ Final Selected Features:")
for i, f in enumerate(final_features, 1):
    print(f"  {i:2}. {f}")

# ── Final dataframe ready for modeling ───────────────────────
# Copy of the selected columns plus the target, taken from df
df_model = df[final_features + ['Churn']].copy()
print(f"\ndf_model shape: {df_model.shape}")
print(df_model.head())


In [None]:
# ============================================================
# E. Textual Justification for Final Feature Subset
# ============================================================
# The text below is printed verbatim as the written justification.
# NOTE(review): some statements in it are not backed by code in the preceding
# cells and should be confirmed or reworded:
#   - section 2 says features with p > 0.05 were down-ranked, but the combined
#     filter score is built from the scores only, not from the p-values;
#   - section 4 says multicollinear dummy pairs were deduplicated, but the
#     final list is simply the 12 best average ranks.

justification = """
╔══════════════════════════════════════════════════════════════╗
║         PHASE 3 — FINAL FEATURE SUBSET JUSTIFICATION        ║
╚══════════════════════════════════════════════════════════════╝

1. FEATURE ENGINEERING
   ─────────────────────────────────────────────────────────
   • Tenure_Group: Binning tenure into 4 loyalty segments
     (New/Developing/Established/Loyal) captures non-linear
     churn behavior — new customers churn at much higher rates.

   • Avg_Monthly_Spend: TotalCharges / (tenure+1) captures
     whether a customer's spending is rising or falling over
     time, which is more informative than raw TotalCharges alone.

   • Service_Count: The total number of add-on services acts
     as a proxy for switching cost — customers with more
     services face higher friction when leaving.

2. FILTER-BASED SELECTION (Chi2 + ANOVA)
   ─────────────────────────────────────────────────────────
   • Chi-Squared identified categorical features most
     statistically dependent on Churn (p < 0.05).
   • ANOVA F-Test confirmed continuous features (tenure,
     MonthlyCharges, TotalCharges) with highest group
     mean differences between churned/non-churned customers.
   • Features failing both tests (p > 0.05 in both) were
     considered statistically insignificant and down-ranked.

3. MODEL-BASED SELECTION
   ─────────────────────────────────────────────────────────
   • Lasso (L1): By penalizing coefficients toward zero,
     Lasso automatically eliminated multicollinear and
     redundant features. Only features surviving L1
     shrinkage carry independent predictive signal.
   • Random Forest: Impurity-based importance scores
     capture non-linear relationships and interactions
     that linear methods like Lasso may miss.

4. FINAL SELECTION RATIONALE
   ─────────────────────────────────────────────────────────
   The final 12 features were chosen by averaging ranks
   across all three methods. This ensemble approach is more
   robust than relying on any single method:
   - It avoids overfitting to one selection criterion.
   - Features consistently ranked high across methods
     are genuinely predictive, not method-specific artifacts.
   - Multicollinear OHE dummy pairs (e.g.,
     'InternetService_No' vs 'InternetService_Fiber optic')
     were deduplicated keeping only the higher-ranked one.
"""
print(justification)
