# Credit Card Fraud Dataset Cleaning
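This notebook cleans the credit card fraud dataset for downstream ML and forecasting: it removes duplicate rows, checks for missing values, filters IQR outliers on `Amount` and `Time`, and saves the cleaned data to CSV.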

In [6]:
import pandas as pd
# Read the raw transactions CSV into a DataFrame and preview it
raw_csv_path = "/content/credit_card_fraud.csv"
df = pd.read_csv(raw_csv_path)
print(df)
df.head()

            Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.363787  ... -0.01830

In [7]:
# Step 1: Drop fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")
print(f"Shape after removing duplicates: {df.shape}")

Shape after removing duplicates: (283726, 31)


In [8]:
# Step 2: Count missing/null cells (per column first, then summed over columns)
nulls_per_column = df.isna().sum()
missing = nulls_per_column.sum()
print(f"Total missing values: {missing}")

Total missing values: 0


In [9]:
# Step 3: Remove outliers using IQR
def remove_outliers_iqr(dataframe, columns, factor=1.5):
    """Drop rows whose value in any of ``columns`` lies outside the IQR fences.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). Every fence is computed from the
    unfiltered ``dataframe``, so the result does not depend on the order in
    which ``columns`` are listed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Names of the numeric columns to screen for outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` that fall inside the fences of every listed
        column. Rows holding NaN in a listed column are dropped, because NaN
        fails both range comparisons. If ``columns`` is empty, ``dataframe``
        is returned unchanged.
    """
    keep = None
    for col in columns:
        values = dataframe[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr
        # Positional boolean array: avoids index alignment and keeps the
        # quartiles of later columns independent of earlier filtering.
        in_range = ((values >= lower) & (values <= upper)).to_numpy()
        keep = in_range if keep is None else keep & in_range
    if keep is None:
        return dataframe
    return dataframe[keep]

# Apply the IQR filter to the transaction amount and elapsed-time columns.
# NOTE(review): trimming 'Amount' may discard legitimate high-value or
# fraudulent transactions — confirm this is intended for the downstream model.
outlier_columns = ['Amount', 'Time']
df = remove_outliers_iqr(df, outlier_columns)
print(f"Shape after outlier removal: {df.shape}")


Shape after outlier removal: (252041, 31)


In [10]:

# Step 4: Write the cleaned frame to CSV (row index omitted)
cleaned_csv_path = "credit_card_fraud_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved as 'credit_card_fraud_cleaned.csv'")


Cleaned dataset saved as 'credit_card_fraud_cleaned.csv'


In [11]:
# Reload the cleaned file from the same relative path it was written to in
# Step 4, instead of a hard-coded absolute path that only matches when the
# working directory happens to be /content.
cdf = pd.read_csv("credit_card_fraud_cleaned.csv")
print(cdf)
cdf.head()

            Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
3            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
4            2.0  -0.425966   0.960523  1.141109 -0.168252  0.420987   
...          ...        ...        ...       ...       ...       ...   
252036  172785.0   0.120316   0.931005 -0.546012 -0.745097  1.130314   
252037  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
252038  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
252039  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
252040  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.363787  ... -0.01830

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
3,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
4,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
