# Data Collection and Preprocessing
This notebook focuses on data collection, cleaning, and preprocessing steps for the customer churn analysis.

## 1. Import Libraries

In [21]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 2. Data Collection

In [22]:
# Root data folder. Set the CHURN_DATA_DIR environment variable to run this
# notebook on another machine; the default is the original author's location.
DATA_DIR = Path(os.environ.get('CHURN_DATA_DIR', r'C:\Users\saife\OneDrive\Desktop\Graduation\data'))
raw_csv = DATA_DIR / 'raw' / 'Customer-Churn-Records.csv'

# Load the raw customer churn records into the working frame.
df = pd.read_csv(raw_csv)

In [23]:
# Preview the first rows of the freshly loaded frame to sanity-check the read
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


## 3. Data Quality Checks

In [24]:
# info() prints each column's dtype and non-null count;
# the trailing expression displays the (rows, columns) shape.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RowNumber           10000 non-null  int64  
 1   CustomerId          10000 non-null  int64  
 2   Surname             10000 non-null  object 
 3   CreditScore         10000 non-null  int64  
 4   Geography           10000 non-null  object 
 5   Gender              10000 non-null  object 
 6   Age                 10000 non-null  int64  
 7   Tenure              10000 non-null  int64  
 8   Balance             10000 non-null  float64
 9   NumOfProducts       10000 non-null  int64  
 10  HasCrCard           10000 non-null  int64  
 11  IsActiveMember      10000 non-null  int64  
 12  EstimatedSalary     10000 non-null  float64
 13  Exited              10000 non-null  int64  
 14  Complain            10000 non-null  int64  
 15  Satisfaction Score  10000 non-null  int64  
 16  Card 

(10000, 18)

In [25]:
# Remove the row counter, customer id and surname columns before the quality
# checks and feature work below. Reassigning (rather than inplace=True) and
# naming the axis via `columns=` keeps the step explicit and chainable.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [26]:
# Flag rows that repeat an earlier row across every column, then count them
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")


Number of Duplicate Rows: 0


In [27]:
# Missing-value count per column
df.isna().sum()

CreditScore           0
Geography             0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Complain              0
Satisfaction Score    0
Card Type             0
Point Earned          0
dtype: int64

In [28]:
# Check for outliers using IQR method
def detect_outliers(df, column, factor=1.5):
    """Count the values of ``df[column]`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of a numeric column in ``df``.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    int
        Number of rows whose value is strictly below the lower fence or
        strictly above the upper fence. Missing values are never counted.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Summing the boolean mask counts the outliers without building a
    # filtered copy of the whole frame.
    outside = (values < lower_bound) | (values > upper_bound)
    return int(outside.sum())

# Columns screened for IQR outliers
numeric_columns = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
for col in numeric_columns:
    outlier_count = detect_outliers(df, col)
    print(f'Outliers in {col}: {outlier_count}')

Outliers in CreditScore: 15
Outliers in Age: 359
Outliers in Balance: 0
Outliers in EstimatedSalary: 0


## 4. Feature Engineering

In [29]:
# Ratio features, computed column-wise rather than with a row-wise
# `df.apply(..., axis=1)`, which runs a Python lambda once per row.
# NOTE(review): this cell is not idempotent - re-running it after scaling
# would rebuild the ratios from already-scaled values. Run it once per load.
salary = df['EstimatedSalary']
tenure = df['Tenure']
products = df['NumOfProducts']

# Zero denominators are masked to NaN so the division never yields inf; those
# rows then receive the fallback value (0 for the salary ratio, the raw
# product count for the tenure ratio).
df['BalanceToSalary'] = (df['Balance'] / salary.mask(salary == 0)).where(salary != 0, 0)
df['ProductsPerTenure'] = (products / tenure.mask(tenure == 0)).where(tenure != 0, products)

# Replace any infinite values present in the frame with 0
df = df.replace([np.inf, -np.inf], 0)

# Scale numeric features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row; no train/test split happens
# in the cells visible here. If the saved file is split later, the held-out
# rows will have influenced the scaling statistics - confirm this is intended.
scaler = StandardScaler()
numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary', 'BalanceToSalary', 'ProductsPerTenure']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [30]:
# One-hot encode the nominal columns; drop_first removes one level per column
# so the remaining indicators are not perfectly collinear.
# NOTE(review): 'Card Type' also holds string labels and is not encoded here -
# confirm it is handled downstream.
categorical_cols = ['Geography', 'Gender']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [31]:
# Summary statistics of the numeric columns after feature engineering and scaling
df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Point Earned,BalanceToSalary,ProductsPerTenure
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,-4.824585e-16,2.318146e-16,-1.078249e-16,-6.252776000000001e-17,1.5302,0.7055,0.5151,-2.8776980000000004e-17,0.2038,0.2044,3.0138,606.5151,4.973799e-18,1.957545e-16
std,1.00005,1.00005,1.00005,1.00005,0.581654,0.45584,0.499797,1.00005,0.402842,0.403283,1.405919,225.924839,1.00005,1.00005
min,-3.109504,-1.994969,-1.733315,-1.225848,1.0,0.0,0.0,-1.740268,0.0,0.0,1.0,119.0,-0.0358039,-0.8043445
25%,-0.6883586,-0.6600185,-0.6959818,-1.225848,1.0,0.0,0.0,-0.8535935,0.0,0.0,2.0,410.0,-0.0358039,-0.6074535
50%,0.01522218,-0.1832505,-0.004425957,0.3319639,1.0,1.0,1.0,0.001802807,0.0,0.0,3.0,605.0,-0.0289084,-0.3449322
75%,0.6981094,0.4842246,0.6871299,0.8199205,2.0,1.0,1.0,0.8572431,0.0,0.0,4.0,801.0,-0.02182812,0.3113709
max,2.063884,5.061197,1.724464,2.795323,4.0,1.0,1.0,1.7372,1.0,1.0,5.0,1000.0,97.94698,6.874403


## 5. Save Processed Data

In [33]:
# Root data folder. Set the CHURN_DATA_DIR environment variable to run this
# notebook on another machine; the default is the original author's location.
DATA_DIR = Path(os.environ.get('CHURN_DATA_DIR', r'C:\Users\saife\OneDrive\Desktop\Graduation\data'))
processed_csv = DATA_DIR / 'processed' / 'data_after_preprocessing.csv'

# Create the output folder if it is missing so the write cannot fail on a
# fresh checkout, then save without the index column.
processed_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_csv, index=False)