In [2]:
#import libraries
import pandas as pd
import io
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
#Read the dataset from drive
df = pd.read_csv("Churn_Modelling.csv")

In [4]:
# Finding Missing Values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [6]:
df = df.drop(['Surname', 'Geography','Gender'], axis=1)
#Handling Missing values
df.fillna(df.median(), inplace=True)

In [7]:
# Check for duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Remove duplicates
df.drop_duplicates(inplace=True)
s = StandardScaler()
df1= pd.DataFrame(s.fit_transform(df))

Number of duplicate rows: 0


In [8]:
# Detecting outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# Define outlier criteria
outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))
outliers_sum = outliers.sum()

print("Number of outliers per column:")
print(outliers_sum)
df1.head()

Number of outliers per column:
RowNumber             0
CustomerId            0
CreditScore          15
Age                 359
Tenure                0
Balance               0
NumOfProducts        60
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited             2037
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.731878,-0.783213,-0.326221,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,1.977165
1,-1.731531,-0.606534,-0.440036,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,-0.505775
2,-1.731185,-0.995885,-1.536794,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687,1.977165
3,-1.730838,0.144767,0.501521,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918,-0.505775
4,-1.730492,0.652659,2.063884,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,-0.505775


In [9]:

# Selecting numeric columns for normalization
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns

# Normalize the numeric columns
scaler = MinMaxScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
X = df1.iloc[:,:-1].values
X

array([[-1.73187761, -0.78321342, -0.32622142, ...,  0.64609167,
         0.97024255,  0.02188649],
       [-1.7315312 , -0.60653412, -0.44003595, ..., -1.54776799,
         0.97024255,  0.21653375],
       [-1.73118479, -0.99588476, -1.53679418, ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [ 1.73118479, -1.47928179,  0.60498839, ..., -1.54776799,
         0.97024255, -1.00864308],
       [ 1.7315312 , -0.11935577,  1.25683526, ...,  0.64609167,
        -1.03067011, -0.12523071],
       [ 1.73187761, -0.87055909,  1.46377078, ...,  0.64609167,
        -1.03067011, -1.07636976]])

In [10]:
# Splitting the dataset into input features (X) and output variable (y)
X = df.drop('Exited', axis=1)
y = df['Exited']


In [13]:
#splitting the data for training & Testing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [14]:
#Print the training data and testing data
print(f"X_train \n {X_train}")
print(f"Length of X_train - {len(X_train)}")
print(f"X_test \n {X_test}")
print(f"Length of X_test - {len(X_test)}")

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

X_train 
       RowNumber  CustomerId  CreditScore       Age  Tenure   Balance  \
6389   0.638964    0.487645        0.510  0.162162     0.9  0.539750   
7001   0.700170    0.388629        0.662  0.405405     0.5  0.556857   
299    0.029903    0.983199        0.554  0.202703     0.5  0.000000   
6816   0.681668    0.386185        0.418  0.432432     0.5  0.648480   
7375   0.737574    0.292693        0.578  0.797297     0.6  0.322545   
...         ...         ...          ...       ...     ...       ...   
6025   0.602560    0.576889        0.502  0.378378     0.3  0.391405   
2466   0.246625    0.616459        0.332  0.189189     0.3  0.578586   
1196   0.119612    0.693751        0.736  0.135135     0.4  0.261633   
3201   0.320132    0.287581        0.744  0.027027     0.6  0.000000   
5146   0.514651    0.253175        1.000  0.189189     0.2  0.512750   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
6389       0.000000        0.0             1.0         0.93