In [None]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt

# 🧭 Step 1: Understand the Data You’re Working With
Hint: Before doing anything else, load the dataset and try to get a feel for the data.
Ask questions like:

1. What columns are there?

2. What types of values do they hold?

3. Any weird values you notice right away?

4. Are there NaNs?

In [None]:
# Load the raw, uncleaned transactions; the bare file name is resolved against the current working directory.
df=pd.read_csv('dirty_credit_card_fraud_dataset.csv')

In [None]:
# Snapshot of the raw frame taken before any cleaning step mutates `df`.
# NOTE(review): `df1` is not referenced again in the visible cells.
df1=df.copy()

In [None]:
# Display the raw frame to eyeball its rows and columns.
df

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Frequency of each IsFraud value.
df['IsFraud'].value_counts()

In [None]:
# Frequency of each Merchant value (value_counts leaves NaN out by default).
df['Merchant'].value_counts()

# 🧭 Step 2: Get Specific About the Missing Data
Hint:
Start answering questions like:

1. How many missing values in each column?

2. What percentage of values are missing?

3. Which columns are the dirtiest?

4. Are there rows where too many things are missing?

In [None]:
# Percentage of missing values per column: mean of the NaN mask, scaled to 0-100.
df.isna().mean().mul(100)

In [None]:
## The 10 rows with the most missing columns: NaNs are counted per row (axis=1),
## sorted in descending order, and the top 10 counts are shown.
df.isnull().sum(axis=1).sort_values(ascending=False).head(10)

# 🧭 Step 3: Plan Your Cleaning Strategy (Column-Wise)
Hint:
Now that you know which columns and rows are messy — ask yourself:

🔍 For each dirty column:
1. Is it important for ML? (e.g., TransactionAmount, CardType)

2.  Can missing values be filled?

    Mean/median for numerical

    Mode for categorical

    "Unknown"/"Other" if the category isn't important

    Or should it be dropped? (if too many missing or not useful)

3. 🔍 For dirty rows:
    If only 1–2 columns are missing, maybe fill them.

    If 4–5+ columns are missing, maybe drop the row.

In [None]:
## TRANSACTION AMOUNT

# Number of missing TransactionAmount values.
df['TransactionAmount'].isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of TransactionAmount.
df['TransactionAmount'].describe()

In [None]:
# Per-column non-null counts over the rows whose TransactionAmount exceeds 700.
# NOTE(review): 700 is a hand-picked cut-off — confirm it against the describe() output.
df[df['TransactionAmount']>700].count()
## the author reads the rows above this cut-off as outliers

In [None]:
## Fill missing amounts with the median, which is less sensitive to the outliers than the mean.
median_value = df['TransactionAmount'].median()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning on pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is active (the pandas 3 default).
df['TransactionAmount'] = df['TransactionAmount'].fillna(median_value)

In [None]:
## Transaction Type

# Inspect the raw labels to spot casing and spelling variants before normalising them.
df['TransactionType'].value_counts()


In [None]:
# Collapse the casing/spelling variants of TransactionType onto one label each:
#   'ATM' -> 'atm', 'swpie' -> 'swipe', 'Onlne' -> 'online'

transaction_type_fixes = {'ATM': 'atm', 'swpie': 'swipe', 'Onlne': 'online'}
df['TransactionType'] = df['TransactionType'].replace(transaction_type_fixes)

In [None]:
# Re-check the labels after normalisation.
df['TransactionType'].value_counts()

In [None]:
# Missing transaction types become an explicit 'missing' category instead of being imputed.
df['TransactionType']=df['TransactionType'].fillna('missing')

In [None]:
# Inspect the raw CardType labels.
df['CardType'].value_counts()

In [None]:
## Normalise every CardType spelling onto the two labels 'credit' and 'debit'.

# Credit variants must map to 'credit'. They were previously mapped to 'online',
# which is a TransactionType label and split credit cards across two categories.
df['CardType'] = df['CardType'].replace(['Credit','credt'], 'credit')
df['CardType'] = df['CardType'].replace(['Debit','dbit'], 'debit')

In [None]:
# Re-check the CardType labels after normalisation.
df['CardType'].value_counts()


In [None]:
# NOTE(review): identical to the previous cell with no change to `df` in between — candidate for deletion.
df['CardType'].value_counts()

In [None]:
# Number of missing CardType values still to be filled.
df['CardType'].isna().sum()

In [None]:
# NOTE(review): repeats the value_counts() shown above; `df` has not changed since — candidate for deletion.
df['CardType'].value_counts()

In [None]:
# Impute missing card types with the most frequent label (first mode when there are ties).
most_common_card_type = df['CardType'].mode().iloc[0]
df['CardType'] = df['CardType'].fillna(most_common_card_type)

In [None]:
# CardType distribution after the mode imputation.
df['CardType'].value_counts()

In [None]:
# Preview the first rows after the CardType clean-up.
df.head()

In [None]:
# Number of missing Merchant values.
df['Merchant'].isna().sum()

In [None]:
# Missing merchants get the explicit placeholder label 'unknown_merchant'.
df['Merchant'] = df['Merchant'].fillna('unknown_merchant')

In [None]:
# Preview the first rows after filling Merchant.
df.head()

In [None]:
# Inspect the raw DeviceType labels.
df['DeviceType'].value_counts()

In [None]:
# Normalise DeviceType labels in one pass: 'Mobile' -> 'mobile', 'dsk' -> 'disk'.
# NOTE(review): 'disk' is not an obvious device type; 'dsk' was presumably meant to
# become a desktop label — confirm against the value_counts() output before relying on it.

device_type_fixes = {'Mobile': 'mobile', 'dsk': 'disk'}
df['DeviceType'] = df['DeviceType'].replace(device_type_fixes)

In [None]:
# Re-check the DeviceType labels after normalisation.
df['DeviceType'].value_counts()

In [None]:
# Percentage of missing values per column, re-checked after the fills performed so far.
(df.isna().mean() * 100)

In [None]:
# Most frequent TransactionType label(s); the 'missing' placeholder filled in earlier takes part in the count.
df['TransactionType'].mode()

In [None]:
# Frequency of each Location value.
df['Location'].value_counts()

In [None]:
from scipy.stats import entropy

# Shannon entropy (base 2, i.e. in bits) of the relative frequencies of the Location values.
counts = df['Location'].value_counts(normalize=True)
entropy_val = entropy(counts, base=2)
print("Entropy:", entropy_val)


In [None]:
# Checkpoint copy of the partially cleaned frame.
# NOTE(review): `df2` is not referenced again in the visible cells.
df2=df.copy()

In [None]:
# DeviceType distribution before the missing values are filled (NaN is not listed by value_counts).
df['DeviceType'].value_counts()

In [None]:
## Fill missing DeviceType values with the placeholder 'Other'.
## NOTE(review): 'Other' is capitalised while the label normalised earlier is lower-case 'mobile' — confirm the intended casing.

df['DeviceType']=df['DeviceType'].fillna('Other')

In [None]:
# Check dtypes and non-null counts after the fills.
df.info()

In [None]:
# Number of fully duplicated rows (the first occurrence of each is not counted).
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each fully duplicated row; the original index labels are retained.
df = df.drop_duplicates()

In [None]:
# Shape after removing duplicate rows.
df.shape

In [None]:
# Dtypes and non-null counts after removing duplicate rows.
df.info()

In [None]:
### Derive Year and Month from Timestamp

# Parse the Timestamp column into datetime values and store it back.
# NOTE(review): with pandas defaults, unparseable values raise here and missing
# values become NaT, which yields NaN in Year/Month — confirm the column is complete.
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

# Calendar components via the .dt accessor
df['Year'] = parsed_timestamps.dt.year
df['Month'] = parsed_timestamps.dt.month



In [None]:
# Time of day formatted as an 'HH:MM:SS' string (text, not a numeric or datetime type).
df['Time'] = df['Timestamp'].dt.strftime('%H:%M:%S')

In [None]:
# Year, Month and Time have been extracted, so the original Timestamp column is removed.
df = df.drop(columns=['Timestamp'])

In [None]:
# Display the frame after the timestamp feature extraction.
df

In [None]:
# Remove Location from the working frame.
# NOTE(review): this is an in-place drop — re-running the cell raises KeyError because the column is already gone.
df.drop(columns=['Location'],inplace=True)

In [None]:
# Independent copy of the cleaned frame; later changes to `df` do not affect `cleaned`.
cleaned=df.copy()

In [None]:
# Final check of dtypes and non-null counts after cleaning.
df.info()


## :) HERE WE GET OUR CLEANED DATASET WITHOUT MISSING VALUES

In [None]:
# Preview the first rows of the cleaned copy.
cleaned.head()

# Histogram
- It shows that Transaction Amount has mostly higher values
- This is biased (imbalanced) data because the IsFraud column has mostly 0 values

In [None]:
# One histogram per numeric column, drawn on a 15x15-inch figure.
df.hist(figsize=(15, 15))
plt.show()

# Boxplot (Finding Outliers and Spread)
- Show the spread (minimum, maximum, median, quartiles).

- Show outliers (dots outside the box).
- 📍 Small box → Feature values are very close together (low variance).

- 📍 Big box → Feature values are spread out (high variance).

- 📍 Many dots outside → Feature has many outliers.

In [None]:
import seaborn as sns

In [None]:
# Boxplot of the single TransactionAmount column (spread and outliers).
# The title previously read 'Boxplot for each feature' although only one feature is drawn.
plt.figure(figsize=(20, 10))
sns.boxplot(data=df['TransactionAmount'])
plt.xticks(rotation=90)
plt.title('Boxplot of TransactionAmount')
plt.show()

#  Correlation Matrix (Heatmap)
- Calculate correlation between all features.

- Plot a colorful heatmap where:

- - +1 → Strong positive relationship

- - -1 → Strong negative relationship

- - 0 → No relationship

- Color	      ----->>       Meaning

- - Dark Red	   ----->>      Strong Positive (features increase together)

- - Dark Blue	    ----->>     Strong Negative (one feature increases, other decreases)

- - White or Light	   ----->>  Weak/No relation

- 📍 Correlation near +1 → Features are similar (move together).

- 📍 Correlation near -1 → Features are opposite.

- 📍 Correlation near 0 → Features are independent.

In [None]:
# Correlation is only defined for numeric data, so restrict the frame first.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation between the numeric columns (pandas default method).
corr_matrix = numeric_df.corr()

# Annotated heatmap drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap (Numeric Features Only)')
plt.show()

# Count Plot

- ✅ After seeing this, you will know:

- - Whether your dataset is imbalanced (most real-world fraud datasets are).

- - Whether you need techniques like oversampling/undersampling later during model training.

In [None]:

# Count of rows per 'IsFraud' value (the column plotted on the x-axis).
# The axis label below describes the encoding as 0 = Not Fraud, 1 = Fraud.
plt.figure(figsize=(6,4))
sns.countplot(x='IsFraud', data=df)
plt.title('Count of Non-Fraud vs Fraud Cases')
plt.xlabel('Class (0 = Not Fraud, 1 = Fraud)')
plt.ylabel('Count')
plt.show()


# Pairplot (Scatterplots between features)
- ✅ What this code does:

- Picks only a few important features (because pairplot becomes very heavy if we use full dataset).

- Plots scatterplots for each feature pair.

- Colors points based on 'IsFraud' (0 = Non-fraud, 1 = Fraud).

- Observation ------>	Meaning
- - Fraud and non-fraud points are separated ------>	These features are good for model training
- - Fraud and non-fraud points mixed ------>	These features may not help much

- ✅ After pairplot, you will visually know:

- - Which features help in separating fraud cases.

- - Which ones are not so helpful.

In [None]:
# Restrict the pairplot to a few columns to keep it light.
# Columns used here: TransactionAmount, Month and the IsFraud label.
selected_columns = ['TransactionAmount', 'Month','IsFraud']

# Keep only the columns that actually exist in df.
# NOTE(review): if 'IsFraud' were filtered out here, hue='IsFraud' below would fail.
selected_columns = [col for col in selected_columns if col in df.columns]

# Scatterplots for every column pair, KDE curves on the diagonal, points coloured by IsFraud.
sns.pairplot(df[selected_columns], hue='IsFraud', diag_kind='kde')
plt.show()
