In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display settings.
# A max_columns of None lifts pandas' cap on how many columns are printed.
pd.options.display.max_columns = None
# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set_style(style='whitegrid')

In [None]:
# Load Datasets
# Reads the three raw CSV files into `fraud_data`, `ip_map` and `credit_card`.
# The directory is relative to the notebook's working directory and is kept in
# one place so the read calls and the error hint cannot drift apart.
RAW_DATA_DIR = '../data/raw'

try:
    fraud_data = pd.read_csv(f'{RAW_DATA_DIR}/Fraud_Data.csv')
    ip_map = pd.read_csv(f'{RAW_DATA_DIR}/IpAddress_to_Country.csv')
    credit_card = pd.read_csv(f'{RAW_DATA_DIR}/creditcard.csv')
except FileNotFoundError as e:
    print(f"Error loading datasets: {e}")
    print(f"Please ensure the data files are in the '{RAW_DATA_DIR}/' directory.")
    # Re-raise instead of swallowing: a failed read leaves one or more of the
    # frames undefined, and continuing would only surface later as a
    # NameError far from the real cause.
    raise
else:
    # Reached only when all three reads succeeded.
    print("Datasets loaded successfully!")

In [None]:
# First look at Fraud_Data.csv: column summary, leading rows, and the number
# of rows that are exact repeats of an earlier row.
print("--- Fraud_Data Info ---")
fraud_data.info()

# One print call with a newline separator emits the header and the preview
# on consecutive lines.
print("\n--- First 5 Rows ---", fraud_data.head(), sep="\n")

fraud_duplicate_count = fraud_data.duplicated().sum()
print(f"\n--- Duplicates: {fraud_duplicate_count} ---")

In [None]:
# Data Cleaning - Fraud_Data.csv
# Parse both timestamp columns into datetime values, overwriting each column
# in the existing frame.
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])
print("\n'signup_time' and 'purchase_time' converted to datetime objects.")

# Per-column count of missing entries.
fraud_missing_counts = fraud_data.isnull().sum()
print(f"\n--- Missing Values ---\n{fraud_missing_counts}")

In [None]:
# creditcard.csv at a glance: column summary, then the leading rows.
print("\n\n--- creditcard.csv Info ---")
credit_card.info()

# Header and preview go out through one print call, newline-separated.
print("\n--- First 5 Rows ---", credit_card.head(), sep="\n")


In [None]:
# Data Cleaning - creditcard.csv
# Report per-column missing-value counts, then drop exact duplicate rows.
credit_missing_counts = credit_card.isnull().sum()
print(f"\n--- Missing Values ---\n{credit_missing_counts}")

# duplicated() flags each row that repeats an earlier one; the sum is the
# number of rows that de-duplication will remove.
num_duplicates = credit_card.duplicated().sum()
print(f"\n--- Duplicates found: {num_duplicates} ---")
if num_duplicates:
    # Modifies the existing frame in place rather than rebinding the name.
    credit_card.drop_duplicates(inplace=True)
    print(f"Removed {num_duplicates} duplicates. New shape: {credit_card.shape}")


In [None]:
# First look at IpAddress_to_Country.csv: column summary and leading rows.
print("\n\n--- IpAddress_to_Country.csv Info ---")
ip_map.info()

# Header and preview go out through one print call, newline-separated.
print("\n--- First 5 Rows ---", ip_map.head(), sep="\n")