# Data Cleaning – Credit Card Churn Dataset
This notebook performs initial data cleaning on the raw credit card churn dataset.  
The goal is to prepare the dataset for EDA and modeling by:
- Removing duplicates
- Handling missing values
- Fixing data types
- Addressing outliers
- Managing high-cardinality categorical features

The cleaned dataset will be saved in `data/processed/` for use in later stages.

In [24]:
# Standard libraries
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings: show every column of this wide frame and use a white-grid plot theme
pd.set_option('display.max_columns', None)
sns.set_theme(style="whitegrid")  # set_theme is the preferred name; sns.set is only an alias of it

# Reproducibility: seed NumPy's global RNG once, from a named constant
SEED = 42
np.random.seed(SEED)

In [25]:
# Location of the raw input file (relative paths resolve against the working directory)
DATA_DIR = Path("../../data/raw")
FILE_PATH = DATA_DIR.joinpath("credit_card_attrition_dataset_mark.csv")

In [26]:
# Read the raw CSV into a DataFrame; the cells below inspect and clean this frame
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

## 1. Looking at the Dataset

In [22]:
# Dimensions of the freshly loaded frame as (rows, columns), before any cleaning.
df.shape

(101000, 63)

In [27]:
# Preview the first five rows to eyeball column names and value formats.
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income,CreditLimit,TotalTransactions,TotalSpend,Tenure,MaritalStatus,EducationLevel,CardType,Country,AttritionFlag,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Feature_11,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Feature_25,Feature_26,Feature_27,Feature_28,Feature_29,Feature_30,Feature_31,Feature_32,Feature_33,Feature_34,Feature_35,Feature_36,Feature_37,Feature_38,Feature_39,Feature_40,Feature_41,Feature_42,Feature_43,Feature_44,Feature_45,Feature_46,Feature_47,Feature_48,Feature_49
0,CUST047573,45,Female,112955.763499,18089.293599,102,7057.96176,3,Divorced,Master,Platinum,Country_36,0,-0.434767,0.119666,-0.318213,0.640795,0.274335,0.206611,0.771154,-0.307925,1.084278,0.006474,1.049323,0.273919,-0.435212,0.464209,0.818723,-0.618114,1.514308,-1.089611,1.774531,0.07106,-0.920471,1.853052,1.995785,-1.310904,-1.246411,0.614533,-1.254153,0.858942,-0.17908,-0.80715,2.178751,0.597848,-0.528956,1.184405,-0.309697,-0.737361,0.597346,-0.742293,0.726445,-1.10061,-1.79239,1.157889,1.735477,-0.240003,0.767626,0.242654,0.084097,1.035898,-1.450343,0.083688
1,CUST006615,44,Female,42980.787139,17317.747673,94,8101.572278,9,Divorced,PhD,Gold,Country_97,0,0.538034,-0.258189,-0.903829,-0.460888,1.443172,1.304155,0.937114,0.155703,0.743411,1.480021,0.375655,1.064608,-1.585891,0.562793,-1.929952,-1.191992,-0.143408,0.496699,-0.338638,0.160756,-0.335619,-0.279705,-0.179002,-0.147386,0.194051,-1.913783,-1.249456,0.241449,-0.614158,-1.274391,-0.901611,-0.182641,0.841776,-0.384364,0.252687,-1.083126,0.369728,-0.220375,-0.642129,-0.685525,-1.194323,-0.083037,1.142575,1.020381,0.981074,2.00247,-0.847103,0.084373,-0.964711,-0.759402
2,CUST032313,44,Male,114584.54989,17450.444657,87,17989.977994,16,Divorced,Bachelor,Silver,Country_34,0,-0.566592,0.01433,-0.052454,-0.890392,-0.809044,-1.342568,1.807094,0.899302,0.586818,0.21386,-0.907671,-1.131695,-1.374161,-0.232484,0.030891,0.606602,-0.117879,0.21553,0.846452,1.003204,0.363728,-0.131496,0.204634,0.955006,0.220422,-0.638782,-0.211857,-0.104988,0.04972,-0.919405,1.510323,1.29918,0.245053,0.197458,-0.420686,-2.086456,-0.390145,-2.003357,-0.092568,-1.118541,0.662366,-0.135308,0.606588,-0.207029,-0.91398,0.316399,0.065882,0.253181,-0.262693,-1.645613
3,CUST008756,40,Female,,10444.914691,90,2534.813451,11,Married,Master,Gold,Country_54,0,1.278177,0.288838,-0.992037,-0.441426,-0.139551,1.008386,1.180335,-0.083533,-0.110255,-0.262637,-0.764924,-0.684805,0.11151,-0.173965,0.051776,-0.375176,0.444438,-0.829629,-0.215294,0.249391,-0.430016,-0.363777,0.650649,0.777868,-0.602814,-0.943926,-0.922181,-0.182337,-1.071135,-2.131752,-0.402764,1.364665,-0.159471,1.302598,-0.752025,-1.457173,-0.345743,0.594889,0.253113,-0.172385,0.936482,-0.404852,-0.818307,0.559377,-1.227673,-0.066525,1.502226,1.433764,-0.556853,0.227939
4,CUST043700,47,Male,74041.28672,8022.710937,106,6290.093235,4,Single,PhD,Platinum,Country_2,0,0.311488,0.109675,0.220423,-0.401206,-2.833971,-1.418391,-1.308369,-0.649136,-1.661351,0.540208,0.21044,-0.879341,0.811985,-0.752691,-1.098107,0.724498,-0.284943,0.522194,0.232143,0.100859,0.960675,-1.495624,0.199182,0.168626,-0.983714,-0.825614,-1.556191,0.793304,-0.353874,-0.434168,0.455644,0.542132,0.121676,-0.646512,1.118082,-0.504512,0.190027,0.080314,0.719031,-2.095253,0.338365,0.646012,-0.102709,0.109845,0.752232,0.810246,-2.212616,-1.19841,0.206907,0.35936


In [28]:
# Column dtypes and non-null counts; a count below the row total means the column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101000 entries, 0 to 100999
Data columns (total 63 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         101000 non-null  object 
 1   Age                101000 non-null  int64  
 2   Gender             101000 non-null  object 
 3   Income             95945 non-null   float64
 4   CreditLimit        95958 non-null   float64
 5   TotalTransactions  101000 non-null  int64  
 6   TotalSpend         95949 non-null   float64
 7   Tenure             101000 non-null  int64  
 8   MaritalStatus      101000 non-null  object 
 9   EducationLevel     101000 non-null  object 
 10  CardType           101000 non-null  object 
 11  Country            101000 non-null  object 
 12  AttritionFlag      101000 non-null  int64  
 13  Feature_0          101000 non-null  float64
 14  Feature_1          101000 non-null  float64
 15  Feature_2          101000 non-null  float64
 16  Fe

In [29]:
# Summary statistics for the numeric columns; scan the min/max rows for
# implausible values (e.g. an Age of 0 or negative monetary amounts).
df.describe()

Unnamed: 0,Age,Income,CreditLimit,TotalTransactions,TotalSpend,Tenure,AttritionFlag,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Feature_11,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Feature_21,Feature_22,Feature_23,Feature_24,Feature_25,Feature_26,Feature_27,Feature_28,Feature_29,Feature_30,Feature_31,Feature_32,Feature_33,Feature_34,Feature_35,Feature_36,Feature_37,Feature_38,Feature_39,Feature_40,Feature_41,Feature_42,Feature_43,Feature_44,Feature_45,Feature_46,Feature_47,Feature_48,Feature_49
count,101000.0,95945.0,95958.0,101000.0,95949.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0
mean,44.50995,76254.9,14980.997877,99.991386,5201.604187,10.000911,0.049653,0.004445,-0.001509,-0.000433,-0.003364,0.001091,0.003347,-0.000486,-0.001209,-5.5e-05,0.001868,0.000451,-0.000793,0.003673,0.002061,0.0016,-0.004433,0.006977,0.001224,0.001864,-0.003364,-0.00139,0.000391,-0.001782,0.003754,-0.001593,-0.000698,-0.004076,0.001845,0.001113,0.001956,0.004017,0.002574,0.003429,-0.001529,-0.001956,0.001086,-0.000883,-0.005357,-0.005749,0.005036,0.003072,-0.00343,-0.005083,0.002089,-4.4e-05,0.005287,-1.3e-05,0.002317,-0.004427,-0.000285
std,10.012043,68258.21,5007.463431,9.991337,2976.315536,5.465092,0.217229,1.001087,0.999155,0.999672,1.000221,1.000308,0.997863,0.997961,1.001292,0.998548,1.002099,0.999451,1.000412,0.996937,0.999164,1.000958,0.997082,0.998138,0.999985,0.997962,0.997884,0.998456,1.002722,1.000446,1.00137,1.001474,1.004238,1.001315,0.998026,0.995752,1.001293,1.000746,1.001084,1.002706,1.000886,1.00272,1.000312,1.000654,0.999845,1.001273,0.997777,0.999517,1.002376,1.000198,0.999677,1.001804,0.999114,0.998145,1.001604,1.004311,0.998173
min,0.0,-18277.72,-7021.069804,61.0,-4005.132671,1.0,0.0,-4.357991,-4.231399,-4.153703,-4.427714,-4.727021,-4.164801,-4.143513,-4.697303,-4.687258,-4.643908,-4.527162,-4.342673,-4.809884,-4.860939,-4.376318,-4.301017,-4.496043,-4.378182,-4.21409,-4.342635,-4.225562,-4.463381,-3.943763,-4.418059,-4.214805,-4.127572,-4.544189,-4.268144,-4.293951,-4.324396,-4.249953,-4.466161,-4.061376,-4.806442,-4.0656,-4.203655,-3.959446,-4.423069,-4.585496,-4.393201,-4.113981,-4.849129,-4.486861,-4.34273,-4.395101,-4.312835,-4.304543,-4.36101,-4.287676,-4.279205
25%,38.0,56651.76,11613.898089,93.0,3663.273783,5.0,0.0,-0.672617,-0.674875,-0.670246,-0.681486,-0.673785,-0.668619,-0.675938,-0.677606,-0.672425,-0.669203,-0.671448,-0.674292,-0.666714,-0.674289,-0.670739,-0.674491,-0.663523,-0.672147,-0.670623,-0.676225,-0.671293,-0.678259,-0.678879,-0.668729,-0.678912,-0.67937,-0.680173,-0.672904,-0.673706,-0.672666,-0.668438,-0.669711,-0.6735,-0.676807,-0.677714,-0.671245,-0.675645,-0.676689,-0.678116,-0.670749,-0.670877,-0.673067,-0.679959,-0.670084,-0.674954,-0.668517,-0.675626,-0.67642,-0.683136,-0.673436
50%,45.0,70272.28,14963.49461,100.0,5029.493748,10.0,0.0,0.003476,-0.005582,0.000868,-0.002586,0.002828,0.000973,-0.00235,-0.000201,0.002073,0.003377,-3.4e-05,-0.000657,0.004551,0.00249,0.002533,-0.008984,0.009884,0.003641,0.000224,-0.004021,-0.00214,0.001698,-0.000664,0.003619,-0.004674,-0.000726,-0.001905,0.005202,0.002783,0.002878,0.00393,0.006874,0.004525,-0.006021,-0.000472,-0.001032,0.003511,-0.004019,-0.002772,0.008375,0.000659,-0.005786,-0.006709,0.001274,-0.000748,0.008122,-0.000509,0.003317,-0.003529,-0.00077
75%,51.0,83979.86,18341.161845,107.0,6398.315669,15.0,0.0,0.675426,0.676809,0.674518,0.671248,0.679336,0.675395,0.675083,0.675722,0.672653,0.67797,0.677733,0.671869,0.677445,0.675937,0.678586,0.66702,0.677374,0.671902,0.67478,0.669912,0.670145,0.679172,0.676188,0.682381,0.669854,0.681309,0.66976,0.672519,0.673109,0.674488,0.679814,0.678309,0.680509,0.674189,0.673332,0.678693,0.67297,0.667375,0.669436,0.6761,0.674418,0.668481,0.671037,0.677454,0.679458,0.6775,0.671088,0.676976,0.675218,0.67151
max,89.0,1233744.0,36096.831719,146.0,53896.734185,19.0,1.0,4.441448,4.38135,4.253646,4.272105,4.365257,4.656243,4.150874,4.5357,4.338124,4.279053,4.014705,4.579337,4.145747,4.189683,4.779525,4.72334,4.27679,4.02728,4.478586,4.356622,4.249214,5.154168,4.193882,4.543464,4.256307,4.228267,4.42787,4.011503,4.446477,4.868863,4.254693,4.008486,4.841412,4.140857,4.457289,4.045581,4.31918,4.848005,5.230203,4.383227,4.338037,4.113717,4.101093,4.357028,4.522098,4.394473,4.278007,4.270151,4.301758,4.890074


## 2. Checking for Duplicates

In [30]:
# Count fully duplicated rows (every column identical to an earlier row).
df.duplicated().sum()

np.int64(1000)

In [31]:
# Remove duplicates, keeping the first occurrence of each row. ignore_index
# renumbers the result 0..n-1 so the dropped rows leave no gaps in the index
# (gaps cause silent misalignment when positional arrays are assigned back later).
df = df.drop_duplicates(ignore_index=True)

In [32]:
# Re-count duplicates to confirm the removal worked; the expected result is 0.
df.duplicated().sum()

np.int64(0)

## 3. Checking for Missing Data

In [35]:
# Per-column count of missing values. The row limit is lifted only inside the
# `with` block, so the full listing prints without altering global display options.
with pd.option_context('display.max_rows', None):
    print(df.isna().sum())

CustomerID              0
Age                     0
Gender                  0
Income               5000
CreditLimit          5000
TotalTransactions       0
TotalSpend           5000
Tenure                  0
MaritalStatus           0
EducationLevel          0
CardType                0
Country                 0
AttritionFlag           0
Feature_0               0
Feature_1               0
Feature_2               0
Feature_3               0
Feature_4               0
Feature_5               0
Feature_6               0
Feature_7               0
Feature_8               0
Feature_9               0
Feature_10              0
Feature_11              0
Feature_12              0
Feature_13              0
Feature_14              0
Feature_15              0
Feature_16              0
Feature_17              0
Feature_18              0
Feature_19              0
Feature_20              0
Feature_21              0
Feature_22              0
Feature_23              0
Feature_24              0
Feature_25  

In [36]:
# Restore the default row limit. 'display.max_columns' is deliberately left
# alone: the setup cell sets it to None for the whole notebook, and resetting
# it here would truncate the display of wide frames in every later cell.
pd.reset_option('display.max_rows')