In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the merged churn data from the working directory into `dataset`,
# the frame every later cell reads and rebinds.
dataset = pd.read_csv('merged_churn_dataset.csv')

# Print the column/dtype summary, then show the first rows as the cell result.
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110000 entries, 0 to 109999
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  110000 non-null  int64  
 1   CreditScore                 110000 non-null  float64
 2   Balance                     110000 non-null  float64
 3   NumOfProducts               110000 non-null  float64
 4   HasCrCard                   110000 non-null  float64
 5   IsActiveMember              110000 non-null  float64
 6   EstimatedSalary             110000 non-null  float64
 7   Exited                      110000 non-null  float64
 8   GameGenre                   110000 non-null  object 
 9   PlayTimeHours               110000 non-null  float64
 10  InGamePurchases             110000 non-null  float64
 11  GameDifficulty              110000 non-null  object 
 12  SessionsPerWeek             110000 non-null  float64
 13  AvgSessionDura

Unnamed: 0,CustomerID,CreditScore,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,GameGenre,PlayTimeHours,...,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,15634602,619.0,0.0,1.0,1.0,1.0,101348.88,1.0,Sports,12.024365,...,Easy,9.471774,94.792252,49.655568,24.526477,Medium,12.4901,65.053197,274.39365,0.49779
1,15647311,608.0,83807.86,1.0,0.0,1.0,112542.58,0.0,Sports,12.024365,...,Easy,9.471774,94.792252,49.655568,24.526477,Medium,12.4901,65.053197,274.39365,0.49779
2,15619304,502.0,159660.8,3.0,1.0,0.0,113931.57,1.0,Sports,12.024365,...,Easy,9.471774,94.792252,49.655568,24.526477,Medium,12.4901,65.053197,274.39365,0.49779
3,15701354,699.0,0.0,2.0,0.0,0.0,93826.63,0.0,Sports,12.024365,...,Easy,9.471774,94.792252,49.655568,24.526477,Medium,12.4901,65.053197,274.39365,0.49779
4,15737888,850.0,125510.82,1.0,1.0,1.0,79084.1,0.0,Sports,12.024365,...,Easy,9.471774,94.792252,49.655568,24.526477,Medium,12.4901,65.053197,274.39365,0.49779


# Handle Missing Values

In [3]:
# Per-column count of missing values.
print(dataset.isnull().sum())

CustomerID                    0
CreditScore                   0
Balance                       0
NumOfProducts                 0
HasCrCard                     0
IsActiveMember                0
EstimatedSalary               0
Exited                        0
GameGenre                     0
PlayTimeHours                 0
InGamePurchases               0
GameDifficulty                0
SessionsPerWeek               0
AvgSessionDurationMinutes     0
PlayerLevel                   0
AchievementsUnlocked          0
EngagementLevel               0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64


In [4]:
# Keep only the columns whose non-null count reaches 60% of the row count.
min_non_null = int(0.6 * len(dataset))
dataset = dataset.dropna(axis=1, thresh=min_non_null)

In [5]:
# Numeric gaps are filled with each column's mean.
numeric_means = dataset.select_dtypes(include=np.number).mean()
dataset = dataset.fillna(numeric_means)

# Whatever is still missing is filled with the column's most frequent value
# (first row of DataFrame.mode()), computed after the numeric fill above.
column_modes = dataset.mode().iloc[0]
dataset = dataset.fillna(column_modes)

# Remove Duplicates

In [6]:
# Record the row count first: counting duplicates *after* drop_duplicates()
# always yields 0 and says nothing about how many rows were removed.
rows_before = len(dataset)
dataset = dataset.drop_duplicates()
print("Duplicates removed: ", rows_before - len(dataset))

Duplicates removed:  0


# Fix Data Types

In [7]:
# Optional date handling: warn when the frame has no 'date_column'; otherwise
# parse it to datetime, turning unparseable entries into NaT (errors='coerce').
if 'date_column' not in dataset.columns:
    print("Warning: 'date_column' not found in the DataFrame.")
else:
    dataset['date_column'] = pd.to_datetime(dataset['date_column'], errors='coerce')



In [8]:
# Column dtypes after the cleaning steps above.
print(dataset.dtypes)

CustomerID                      int64
CreditScore                   float64
Balance                       float64
NumOfProducts                 float64
HasCrCard                     float64
IsActiveMember                float64
EstimatedSalary               float64
Exited                        float64
GameGenre                      object
PlayTimeHours                 float64
InGamePurchases               float64
GameDifficulty                 object
SessionsPerWeek               float64
AvgSessionDurationMinutes     float64
PlayerLevel                   float64
AchievementsUnlocked          float64
EngagementLevel                object
Subscription_Length_Months    float64
Monthly_Bill                  float64
Total_Usage_GB                float64
Churn                         float64
dtype: object


In [9]:
# Names of all object-dtype columns; later cells use `object_columns`
# to report cardinality and to convert these columns to 'category'.
object_columns = dataset.select_dtypes(include=['object']).columns
print("Object Columns:", object_columns)

Object Columns: Index(['GameGenre', 'GameDifficulty', 'EngagementLevel'], dtype='object')


In [10]:
# Report how many distinct values each object column holds.
for col, n_unique in dataset[object_columns].nunique().items():
    print(f"{col}: {n_unique} unique values")

GameGenre: 5 unique values
GameDifficulty: 3 unique values
EngagementLevel: 3 unique values


In [11]:
# Convert every object column to pandas' 'category' dtype in a single astype call.
dataset = dataset.astype({name: 'category' for name in object_columns})

print(dataset.dtypes)

CustomerID                       int64
CreditScore                    float64
Balance                        float64
NumOfProducts                  float64
HasCrCard                      float64
IsActiveMember                 float64
EstimatedSalary                float64
Exited                         float64
GameGenre                     category
PlayTimeHours                  float64
InGamePurchases                float64
GameDifficulty                category
SessionsPerWeek                float64
AvgSessionDurationMinutes      float64
PlayerLevel                    float64
AchievementsUnlocked           float64
EngagementLevel               category
Subscription_Length_Months     float64
Monthly_Bill                   float64
Total_Usage_GB                 float64
Churn                          float64
dtype: object


# Feature Engineering

**Time-Based**

In [12]:
# Subscription length in months divided by 12.
dataset['tenure_years'] = dataset['Subscription_Length_Months'].div(12)

**Behavioral Features**

Gaming Intensity

In [13]:
# Denominator: four times the weekly session count, plus 1 so that a zero
# session count does not cause a division by zero.
session_denominator = dataset['SessionsPerWeek'].mul(4).add(1)
dataset['playtime_per_session'] = dataset['PlayTimeHours'].div(session_denominator)

In-Game Spending

In [14]:
# In-game purchases relative to play time; the +1 keeps zero play time
# from producing a division by zero.
hours_plus_one = dataset['PlayTimeHours'].add(1)
dataset['spending_per_hour'] = dataset['InGamePurchases'].div(hours_plus_one)

Avg Monthly Usage

In [15]:
# Total usage divided by subscription length in months.
# NOTE(review): unlike the other ratio features in this notebook, the denominator
# has no "+ 1" guard, so a Subscription_Length_Months of 0 would yield inf/NaN
# here — confirm the column is strictly positive.
dataset['avg_usage_per_month'] = dataset['Total_Usage_GB'] / dataset['Subscription_Length_Months']

Avg Monthly Bill per Product

In [16]:
# Monthly bill spread over the product count; the +1 keeps a zero
# product count from producing a division by zero.
products_plus_one = dataset['NumOfProducts'].add(1)
dataset['avg_bill_per_product'] = dataset['Monthly_Bill'].div(products_plus_one)

**Engagement & Retention Features**

Gaming Engagement Score

In [17]:
# Element-wise product of play time and achievements unlocked.
dataset['gaming_engagement_score'] = dataset['PlayTimeHours'].mul(dataset['AchievementsUnlocked'])

Weekly Session Time (avg session duration × sessions per week)

In [18]:
# Average session duration multiplied by the weekly session count.
# NOTE(review): despite the "avg_" prefix this is a product, i.e. presumably the
# total session minutes per week — confirm the column name is what is intended.
dataset['avg_session_length_per_week'] = dataset['AvgSessionDurationMinutes'].mul(dataset['SessionsPerWeek'])

Activity Score

In [19]:
# Weekly session count multiplied by the IsActiveMember column.
dataset['activity_score'] = dataset['SessionsPerWeek'].mul(dataset['IsActiveMember'])

**Interaction Features**

Credit Utilization Rate

In [20]:
# Balance relative to estimated salary; the +1 keeps a zero salary
# from producing a division by zero.
salary_plus_one = dataset['EstimatedSalary'].add(1)
dataset['credit_utilization'] = dataset['Balance'].div(salary_plus_one)

Spending Behavior Score

In [21]:
# In-game purchases relative to the monthly bill; the +1 keeps a zero bill
# from producing a division by zero.
bill_plus_one = dataset['Monthly_Bill'].add(1)
dataset['spending_behavior'] = dataset['InGamePurchases'].div(bill_plus_one)

Game Engagement Index

In [22]:
# Plain (unweighted) sum of play time, weekly sessions and achievements.
dataset['game_engagement_index'] = (
    dataset['PlayTimeHours']
    .add(dataset['SessionsPerWeek'])
    .add(dataset['AchievementsUnlocked'])
)

**Aggregated Features**

Churn Rate per Game Genre

In [23]:
# Broadcast the mean of 'Churn' within each GameGenre back onto every row.
# NOTE(review): 'Churn' is the prediction target used later in this notebook, and
# this mean is computed over the whole frame before any train/test split, so the
# feature carries target information from rows that end up in the test set.
# Consider computing it from the training rows only.
dataset['game_genre_churn_rate'] = dataset.groupby('GameGenre', observed=False)['Churn'].transform('mean')

Churn Rate per Engagement Level

In [24]:
# Broadcast the mean of 'Churn' within each EngagementLevel back onto every row.
# NOTE(review): 'Churn' is the prediction target used later in this notebook, and
# this mean is computed over the whole frame before any train/test split, so the
# feature carries target information from rows that end up in the test set.
# Consider computing it from the training rows only.
dataset['engagement_churn_rate'] = dataset.groupby('EngagementLevel', observed=False)['Churn'].transform('mean')

**Verification**

In [25]:
# Preview the engineered columns, then write the frame (still unencoded and
# unscaled at this point) to CSV without the index.
print(dataset.head())  # Check new features
dataset.to_csv("enhanced_churn_dataset.csv", index=False)  # Save processed dataset

   CustomerID  CreditScore    Balance  NumOfProducts  HasCrCard  \
0    15634602        619.0       0.00            1.0        1.0   
1    15647311        608.0   83807.86            1.0        0.0   
2    15619304        502.0  159660.80            3.0        1.0   
3    15701354        699.0       0.00            2.0        0.0   
4    15737888        850.0  125510.82            1.0        1.0   

   IsActiveMember  EstimatedSalary  Exited GameGenre  PlayTimeHours  ...  \
0             1.0        101348.88     1.0    Sports      12.024365  ...   
1             1.0        112542.58     0.0    Sports      12.024365  ...   
2             0.0        113931.57     1.0    Sports      12.024365  ...   
3             0.0         93826.63     0.0    Sports      12.024365  ...   
4             1.0         79084.10     0.0    Sports      12.024365  ...   

   avg_usage_per_month avg_bill_per_product  gaming_engagement_score  \
0            21.968891            32.526598               294.915327

In [26]:
# Column dtypes including the newly engineered features.
print(dataset.dtypes)

CustomerID                        int64
CreditScore                     float64
Balance                         float64
NumOfProducts                   float64
HasCrCard                       float64
IsActiveMember                  float64
EstimatedSalary                 float64
Exited                          float64
GameGenre                      category
PlayTimeHours                   float64
InGamePurchases                 float64
GameDifficulty                 category
SessionsPerWeek                 float64
AvgSessionDurationMinutes       float64
PlayerLevel                     float64
AchievementsUnlocked            float64
EngagementLevel                category
Subscription_Length_Months      float64
Monthly_Bill                    float64
Total_Usage_GB                  float64
Churn                           float64
tenure_years                    float64
playtime_per_session            float64
spending_per_hour               float64
avg_usage_per_month             float64


Label Encoding

In [27]:
from sklearn.preprocessing import LabelEncoder

# Make sure the three text columns are 'category' dtype before encoding.
categorical_cols = ['GameGenre', 'GameDifficulty', 'EngagementLevel']
dataset[categorical_cols] = dataset[categorical_cols].astype('category')

# Integer-encode the two columns treated as ordinal, keeping each fitted
# encoder so the codes can be mapped back to labels later.
# NOTE(review): LabelEncoder assigns codes by sorted label order, so the integers
# follow the alphabetical order of the labels rather than a low-to-high ranking —
# confirm that is acceptable for columns treated as ordinal.
ordinal_cols = ['GameDifficulty', 'EngagementLevel']
label_encoders = {}
for col in ordinal_cols:
    label_encoders[col] = LabelEncoder().fit(dataset[col])
    dataset[col] = label_encoders[col].transform(dataset[col])

# One-hot encode GameGenre; the first category is dropped as the baseline.
dataset = pd.get_dummies(dataset, columns=['GameGenre'], drop_first=True)

In [28]:
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
dataset.info()  # Verify new columns
print(dataset.head())  # View transformed data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110000 entries, 0 to 109999
Data columns (total 37 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   CustomerID                   110000 non-null  int64  
 1   CreditScore                  110000 non-null  float64
 2   Balance                      110000 non-null  float64
 3   NumOfProducts                110000 non-null  float64
 4   HasCrCard                    110000 non-null  float64
 5   IsActiveMember               110000 non-null  float64
 6   EstimatedSalary              110000 non-null  float64
 7   Exited                       110000 non-null  float64
 8   PlayTimeHours                110000 non-null  float64
 9   InGamePurchases              110000 non-null  float64
 10  GameDifficulty               110000 non-null  int64  
 11  SessionsPerWeek              110000 non-null  float64
 12  AvgSessionDurationMinutes    110000 non-null  float64
 13 

# Feature Scaling

Identify Numerical Columns

In [29]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Every float64/int64 column is selected for scaling, then the target 'Churn'
# is taken out of the list (list.remove raises ValueError if it is absent).
# NOTE(review): this dtype-based selection also includes the identifier
# 'CustomerID', the 'Exited' flag and the integer-encoded category columns —
# confirm these are meant to be scaled.
numerical_cols = dataset.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols.remove('Churn')

Normalization

In [30]:
# Fit a MinMaxScaler on the selected numeric columns and overwrite them in place
# in `dataset` with the scaled values.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split, so test rows influence the fitted statistics. A later cell also rebinds
# `scaler` to a StandardScaler fitted on these already-scaled columns, which
# presumably makes this step redundant for the final values — confirm both
# scalings are intended.
scaler = MinMaxScaler()
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])

# Check scaled values
print(dataset.head())

   CustomerID  CreditScore   Balance  NumOfProducts  HasCrCard  \
0    0.988550        0.538  0.000000       0.000000        1.0   
1    0.989354        0.516  0.334031       0.000000        0.0   
2    0.987583        0.304  0.636357       0.666667        1.0   
3    0.992771        0.698  0.000000       0.333333        0.0   
4    0.995081        1.000  0.500246       0.000000        1.0   

   IsActiveMember  EstimatedSalary  Exited  PlayTimeHours  InGamePurchases  \
0             1.0         0.506735     1.0       0.501021         0.200854   
1             1.0         0.562709     0.0       0.501021         0.200854   
2             0.0         0.569654     1.0       0.501021         0.200854   
3             0.0         0.469120     0.0       0.501021         0.200854   
4             1.0         0.395400     0.0       0.501021         0.200854   

   ...  activity_score  credit_utilization  spending_behavior  \
0  ...        0.967801            0.000000           0.094295   
1  .

 Standardization

In [31]:
# Rebind `scaler` to a StandardScaler, fit it on the selected numeric columns as
# they currently stand in `dataset`, and overwrite them with the standardized values.
# NOTE(review): the fit uses the full frame, before the train/test split, so test
# rows influence the fitted mean and variance.
scaler = StandardScaler()
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])

# Check standardized values
print(dataset.head())

   CustomerID  CreditScore   Balance  NumOfProducts  HasCrCard  \
0    3.149652    -1.081954 -4.065677      -3.023380   2.142844   
1    3.152479    -1.459434  0.389206      -3.023380  -5.133366   
2    3.146250    -5.096970  4.421238       8.381299   2.142844   
3    3.164497     1.663356 -4.065677       2.678959  -5.133366   
4    3.172622     6.845128  2.605965      -3.023380   2.142844   

   IsActiveMember  EstimatedSalary    Exited  PlayTimeHours  InGamePurchases  \
0        3.217930         0.072589  6.557513  -2.890867e-10     1.999495e-10   
1        3.217930         0.718161 -1.677465  -2.890867e-10     1.999495e-10   
2       -3.418346         0.798268  6.557513  -2.890867e-10     1.999495e-10   
3       -3.418346        -0.361240 -1.677465  -2.890867e-10     1.999495e-10   
4        3.217930        -1.211483 -1.677465  -2.890867e-10     1.999495e-10   

   ...  activity_score  credit_utilization  spending_behavior  \
0  ...        2.005464           -0.034436           -0.0

In [32]:
# Store the four one-hot GameGenre indicator columns as plain integers.
genre_dummy_cols = ['GameGenre_RPG', 'GameGenre_Simulation', 'GameGenre_Sports', 'GameGenre_Strategy']
dataset[genre_dummy_cols] = dataset[genre_dummy_cols].astype(int)

print(dataset[genre_dummy_cols].head())


   GameGenre_RPG  GameGenre_Simulation  GameGenre_Sports  GameGenre_Strategy
0              0                     0                 1                   0
1              0                     0                 1                   0
2              0                     0                 1                   0
3              0                     0                 1                   0
4              0                     0                 1                   0


In [33]:
# Summary statistics and a preview of the encoded, standardized frame.
print(dataset.describe())
print(dataset.head())


         CustomerID   CreditScore       Balance  NumOfProducts     HasCrCard  \
count  1.100000e+05  1.100000e+05  1.100000e+05   1.100000e+05  1.100000e+05   
mean  -4.134067e-17 -1.627789e-17  3.217251e-16  -5.176512e-16 -2.325413e-18   
std    1.000005e+00  1.000005e+00  1.000005e+00   1.000005e+00  1.000005e+00   
min   -3.273376e-01 -1.031306e+01 -4.065677e+00  -3.023380e+00 -5.133366e+00   
25%   -3.212219e-01  0.000000e+00  9.665092e-12  -4.748151e-16  0.000000e+00   
50%   -3.151062e-01  0.000000e+00  9.665092e-12  -4.748151e-16  0.000000e+00   
75%   -3.089905e-01  0.000000e+00  9.665092e-12  -4.748151e-16  0.000000e+00   
max    3.189925e+00  6.845128e+00  9.271039e+00   1.408364e+01  2.142844e+00   

       IsActiveMember  EstimatedSalary        Exited  PlayTimeHours  \
count    1.100000e+05     1.100000e+05  1.100000e+05   1.100000e+05   
mean    -1.461679e-15     1.420402e-15 -2.325413e-18  -1.543481e-15   
std      1.000005e+00     1.000005e+00  1.000005e+00   1.000005e+0

In [34]:
# Write the encoded and standardized frame to CSV without the index.
dataset.to_csv("new_churn_dataset.csv", index=False)

# Splitting Of Dataset

Feature and Target

In [52]:
# Features are every column except the target; the target is 'Churn'.
# NOTE(review): X therefore still contains the identifier 'CustomerID', the
# 'Exited' flag, and the 'game_genre_churn_rate' / 'engagement_churn_rate' columns
# that were computed from 'Churn' itself — confirm these belong in the feature set.
X = dataset.drop(columns=['Churn'])
y = dataset['Churn']

Splitting into training and testing sets

In [53]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. The feature matrix is named `X` (capital) and the cells
# below read `X_train` / `X_test`, so those exact names are used here — the
# lowercase `x` was never defined and fails on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
# Shape of the training feature matrix.
print(X_train.shape)

(88000, 36)


In [44]:
# Shape of the training target vector.
print(y_train.shape)

(88000,)


In [46]:
# Shape of the test feature matrix.
print(X_test.shape)

(22000, 36)


In [47]:
# Shape of the test target vector.
print(y_test.shape)

(22000,)
