In [None]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the churn CSV from the working directory into `data`.
# NOTE(review): this is the same path the final save cell writes to, so a re-run
# reads the notebook's own previous output — confirm that is intended.
data = pd.read_csv(filepath_or_buffer='Churn_Modelling_Cleaned.csv')

In [None]:
# Print a heading, then call DataFrame.info() on the loaded frame to inspect its structure.
print("\nDataset Info:")
data.info()

In [None]:
# Count missing values per column, then express each count as a percentage of all rows.
null_counts = data.isnull().sum()
row_total = data.shape[0]

print('Missing Values Count')
print(null_counts)

print("\n Missing Values Percentage")
print((null_counts / row_total) * 100)

In [None]:
# Mean imputation, done on a copy so the loaded frame is left exactly as read.
# The copy is taken from `data`, the DataFrame bound by the load cell.
df_mean = data.copy()

# Columns whose missing entries are replaced by that column's own mean.
mean_columns = ['CreditScore', 'Age', 'EstimatedSalary']
for column in mean_columns:
    # The mean is computed on the copy immediately before the fill.
    df_mean[column] = df_mean[column].fillna(df_mean[column].mean())

# Remaining null count per imputed column.
print("After mean imputation:")
print(df_mean[mean_columns].isnull().sum())

In [None]:
# Median imputation, done on a copy so the loaded frame is left exactly as read.
# The copy is taken from `data`, the DataFrame bound by the load cell.
df_median = data.copy()

# Columns whose missing entries are replaced by that column's own median.
median_columns = ['Tenure', 'Balance']
for column in median_columns:
    # The median is computed on the copy immediately before the fill.
    df_median[column] = df_median[column].fillna(df_median[column].median())

# Remaining null count per imputed column.
print("After median imputation:")
print(df_median[median_columns].isnull().sum())

In [None]:
# Mode imputation, done on a copy so the loaded frame is left exactly as read.
# The copy is taken from `data`, the DataFrame bound by the load cell.
df_mode = data.copy()

# Columns whose missing entries are replaced by that column's mode.
mode_columns = ['Geography', 'Gender']
for column in mode_columns:
    # mode() returns a Series; [0] selects its first entry as the fill value.
    df_mode[column] = df_mode[column].fillna(df_mode[column].mode()[0])

# Remaining null count per imputed column.
print("After mode imputation:")
print(df_mode[mode_columns].isnull().sum())

In [None]:
# Build the final cleaned dataset on a copy of `data` (the frame bound by the
# load cell), applying one imputation strategy per group of columns.
df_clean = data.copy()

# Column groups, one per strategy.
mean_columns = ['CreditScore', 'Age', 'EstimatedSalary']
median_columns = ['Tenure', 'Balance']
mode_columns = ['Geography', 'Gender']

# Numerical columns - mean imputation
for column in mean_columns:
    df_clean[column] = df_clean[column].fillna(df_clean[column].mean())

# Numerical columns - median imputation
for column in median_columns:
    df_clean[column] = df_clean[column].fillna(df_clean[column].median())

# Categorical columns - mode imputation
for column in mode_columns:
    # mode() returns a Series; [0] selects its first entry as the fill value.
    df_clean[column] = df_clean[column].fillna(df_clean[column].mode()[0])

# Null count for every column of the cleaned frame, not only the imputed ones.
print("Final cleaned dataset - Missing values:")
print(df_clean.isnull().sum())

In [None]:
# Descriptive statistics of the cleaned frame, shown with rich display
print("\nStatistical Summary:")
summary_stats = df_clean.describe()
display(summary_stats)

In [None]:
# Side-by-side histograms (with KDE overlay) for two columns of the cleaned frame.
plt.figure(figsize=(15, 5))

# One entry per panel: (column, bar colour, panel title)
panels = [
    ('Age', "blue", 'Age Distribution'),
    ('Balance', "green", 'Balance Distribution'),
]
for position, (column, colour, heading) in enumerate(panels, start=1):
    # Panels are numbered from 1 in a 1x2 grid.
    plt.subplot(1, 2, position)
    hist_ax = sns.histplot(df_clean[column], kde=True, color=colour)
    hist_ax.set_title(heading)

plt.show()

In [None]:
# Two-panel figure: customer counts per Geography (left), Gender share (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
country_ax, gender_ax = ax

# Count Plot
# NOTE(review): newer seaborn releases appear to warn when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
country_plot = sns.countplot(x='Geography', data=df_clean, ax=country_ax, palette='pastel')
country_plot.set_title('Customers by Country')

# Pie Chart
gender_counts = df_clean['Gender'].value_counts()
gender_counts.plot.pie(autopct='%1.1f%%', ax=gender_ax, startangle=90, colors=['skyblue', 'pink'])
gender_ax.set_title('Gender Distribution')

plt.show()

In [None]:
# Box plot of Age grouped by the value of the Exited column
plt.figure(figsize=(8, 5))
sns.boxplot(data=df_clean, x='Exited', y='Age')
plt.title('Impact of Age on Churn (Exited)')
plt.show()

# Correlation heatmap over the numeric columns, with the RowNumber and
# CustomerId columns removed before correlating
plt.figure(figsize=(10, 8))
numeric_only = df_clean.select_dtypes(include=[np.number]).drop(columns=['RowNumber', 'CustomerId'])
correlations = numeric_only.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): this path is the same file the load cell reads, so saving
# overwrites the notebook's own input — confirm that is intended.
df_clean.to_csv(path_or_buf="Churn_Modelling_Cleaned.csv", index=False)
print("Cleaned dataset saved successfully!")