# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('../')
from src.utils import *

In [2]:

df=pd.read_csv("../data/processed/cleaned_subscriptions_churn.csv")
df.head()

Unnamed: 0,customer_id,gender,months_subscribed,streaming_quality,subscription_type,payment_mode,monthly_plan_cost,total_revenue,subscription_canceled,app_usage_hours,last30d_usage_hours,customer_rating,promo_email_clicks,device_type,num_profiles,auto_renew,support_tickets_last6m,nps_score,is_active_last30d
0,7590-VHVEG,Female,1,HD,Month-to-month,Electronic check,29.85,29.85,0,4.1,125.9,3,3,Mobile,1,1,0,9,1
1,5575-GNVDE,Male,34,HD,One year,Mailed check,56.95,1889.5,0,5.1,177.9,4,2,Mobile,1,0,0,7,1
2,3668-QPYBK,Male,2,HD,Month-to-month,Mailed check,53.85,108.15,1,2.4,88.5,5,4,Desktop,1,1,0,1,1
3,7795-CFOCW,Male,45,HD,One year,Bank transfer (automatic),42.3,1840.75,0,3.8,124.0,5,5,SmartTV,1,0,0,6,1
4,9237-HQITU,Female,2,4K,Month-to-month,Electronic check,70.7,151.65,1,6.3,182.8,5,2,Mobile,2,1,0,3,1


In [3]:
# Saperation of Numerical and Categorical.

num_data= df.select_dtypes(include=[np.number])
num_col=list(num_data.columns)

cat_data= df.select_dtypes(include=[object]) #since numpy has removed the np.objects and pandas can inbult detect the categorical/text as object, hence we used "object" instead of "np.object"
cat_col=list(cat_data.columns)

In [4]:
#MISSSING VALUES Treatment
df.isnull().sum()
# since the cleaned dataset has no missing values or already treated out hence we dont have to work on missing value treatment

customer_id               0
gender                    0
months_subscribed         0
streaming_quality         0
subscription_type         0
payment_mode              0
monthly_plan_cost         0
total_revenue             0
subscription_canceled     0
app_usage_hours           0
last30d_usage_hours       0
customer_rating           0
promo_email_clicks        0
device_type               0
num_profiles              0
auto_renew                0
support_tickets_last6m    0
nps_score                 0
is_active_last30d         0
dtype: int64

In [5]:
#Dealing with Categorical Columns: 
cat_col

# ------------------------------
# 1. Drop customer_id (unique identifier, not a feature)
# ------------------------------
df = df.drop(columns=['customer_id'])

# ------------------------------
# 2. Label / Ordinal Encoding for ordered categories
# ------------------------------
# subscription_type: Month-to-month < One year < Two year
subscription_order = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
df['subscription_type'] = df['subscription_type'].map(subscription_order)

# streaming_quality: SD < HD < 4K (example)
streaming_order = {'SD': 0, 'HD': 1, '4K': 2}  
df['streaming_quality'] = df['streaming_quality'].map(streaming_order)

# ------------------------------
# 3. One-Hot Encoding for nominal categories
# ------------------------------
one_hot_cols = ['gender', 'payment_mode', 'device_type']
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)
for col in df.select_dtypes('bool').columns:
    df[col] = df[col].astype(int)
# ------------------------------
# 4. Verify
# ------------------------------

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   months_subscribed                     7043 non-null   int64  
 1   streaming_quality                     7043 non-null   int64  
 2   subscription_type                     7043 non-null   int64  
 3   monthly_plan_cost                     7043 non-null   float64
 4   total_revenue                         7043 non-null   float64
 5   subscription_canceled                 7043 non-null   int64  
 6   app_usage_hours                       7043 non-null   float64
 7   last30d_usage_hours                   7043 non-null   float64
 8   customer_rating                       7043 non-null   int64  
 9   promo_email_clicks                    7043 non-null   int64  
 10  num_profiles                          7043 non-null   int64  
 11  auto_renew       

Unnamed: 0,months_subscribed,streaming_quality,subscription_type,monthly_plan_cost,total_revenue,subscription_canceled,app_usage_hours,last30d_usage_hours,customer_rating,promo_email_clicks,...,auto_renew,support_tickets_last6m,nps_score,is_active_last30d,gender_Male,payment_mode_Credit card (automatic),payment_mode_Electronic check,payment_mode_Mailed check,device_type_Mobile,device_type_SmartTV
0,1,1,0,29.85,29.85,0,4.1,125.9,3,3,...,1,0,9,1,0,0,1,0,1,0
1,34,1,1,56.95,1889.5,0,5.1,177.9,4,2,...,0,0,7,1,1,0,0,1,1,0
2,2,1,0,53.85,108.15,1,2.4,88.5,5,4,...,1,0,1,1,1,0,0,1,0,0
3,45,1,1,42.3,1840.75,0,3.8,124.0,5,5,...,0,0,6,1,1,0,0,0,0,1
4,2,2,0,70.7,151.65,1,6.3,182.8,5,2,...,1,0,3,1,0,0,1,0,1,0


In [6]:
# Derived features already created on Day 2

In [7]:
#Outliers IDENTIFICATION

#Detection - will go one by one on each numerical feature
# we will start Outliers detection by simple z-score or IQR Box plots
# Before going forward we need to check skewness first, since high skewed data sometimes would not get captured by IQR or z-score test.
sk = df.skew()

high_skew_cols = sk[round(sk,1) > 0.5].index.tolist()
low_skew_cols  = sk[round(sk,1) <= 0.5].index.tolist()
high_skew_cols.remove("subscription_canceled")
print("Columns with skewness > 0.5:")
print(high_skew_cols)

print("\nColumns with skewness ≤ 0.5:")
print(low_skew_cols)


# by EDA we get to know cols that are skewed .

def mad_outliers(s, t=3.5):
    m, mad = np.median(s), np.median(np.abs(s - np.median(s)))
    return np.abs(0.6745 * (s - m) / mad) > t if mad != 0 else np.zeros(len(s), bool)

# Detect outliers for high-skewed columns
outliers = {col: mad_outliers(df[col]) for col in high_skew_cols}

# Filter + count outliers
cols_with_outliers = {col: mask.sum() for col, mask in outliers.items() if mask.sum() > 0}
print(f"\n\n\nThe following colmns has been Flagged out by the MAD score test :\n {cols_with_outliers}")

# Hence we will do MAD score test for these and do z-score test for rest.
def z_outliers(s, t=3):
    z = (s - s.mean()) / s.std()
    return np.abs(z) > t

# Detect outliers for low-skewed columns
z_outliers_dict = {col: z_outliers(df[col]) for col in low_skew_cols}

# Filter + count outliers
low_skew_outliers = {col: mask.sum() for col, mask in z_outliers_dict.items() if mask.sum() > 0}
print(f"\n\n\nThe following colmns has been Flagged out by the Z score test :\n {low_skew_outliers}")


Columns with skewness > 0.5:
['subscription_type', 'total_revenue', 'promo_email_clicks', 'num_profiles', 'support_tickets_last6m', 'payment_mode_Credit card (automatic)', 'payment_mode_Electronic check', 'payment_mode_Mailed check', 'device_type_SmartTV']

Columns with skewness ≤ 0.5:
['months_subscribed', 'streaming_quality', 'monthly_plan_cost', 'app_usage_hours', 'last30d_usage_hours', 'customer_rating', 'auto_renew', 'nps_score', 'is_active_last30d', 'gender_Male', 'device_type_Mobile']



The following colmns has been Flagged out by the MAD score test :
 {'total_revenue': np.int64(137), 'promo_email_clicks': np.int64(25)}



The following colmns has been Flagged out by the Z score test :
 {'last30d_usage_hours': np.int64(15)}


In [8]:
#OUTLIERS Treatment:
# only Total Revenue cols outliers has to be treated 

# Apply log(1 + x) transformation to total_revenue
df['total_revenue_log'] = np.log1p(df['total_revenue'])
df.drop('total_revenue',axis=1)

Unnamed: 0,months_subscribed,streaming_quality,subscription_type,monthly_plan_cost,subscription_canceled,app_usage_hours,last30d_usage_hours,customer_rating,promo_email_clicks,num_profiles,...,support_tickets_last6m,nps_score,is_active_last30d,gender_Male,payment_mode_Credit card (automatic),payment_mode_Electronic check,payment_mode_Mailed check,device_type_Mobile,device_type_SmartTV,total_revenue_log
0,1,1,0,29.85,0,4.1,125.900000,3,3,1,...,0,9,1,0,0,1,0,1,0,3.429137
1,34,1,1,56.95,0,5.1,177.900000,4,2,1,...,0,7,1,1,0,0,1,1,0,7.544597
2,2,1,0,53.85,1,2.4,88.500000,5,4,1,...,0,1,1,1,0,0,1,0,0,4.692723
3,45,1,1,42.30,0,3.8,124.000000,5,5,1,...,0,6,1,1,0,0,0,0,1,7.518471
4,2,2,0,70.70,1,6.3,182.800000,5,2,2,...,0,3,1,0,0,1,0,1,0,5.028148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,24,1,1,84.80,0,4.1,160.071624,5,3,1,...,1,0,1,1,0,0,1,1,0,7.596643
7039,72,2,1,103.20,0,2.8,113.070996,3,6,1,...,2,9,1,0,1,0,0,1,0,8.904345
7040,11,1,0,29.60,0,1.6,52.900000,3,2,1,...,0,0,1,0,0,1,0,0,1,5.850621
7041,4,2,0,74.40,1,4.8,153.700000,3,3,2,...,1,7,1,1,0,0,1,0,1,5.728800


In [9]:
#Saving the Treated DATA

save_csv(df,'../data/processed/Treated_subscriptions_churn.csv')