In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the transactions table; the relative path is resolved against the
# current working directory.
df = pd.read_csv('creditcard.csv')

# Preview: a heading line followed by the text rendering of the first five rows.
print("First 5 Rows:", df.head(), sep="\n")

In [None]:
# Rich (HTML) preview of the first five rows as the cell's output value.
df.head(n=5)

In [None]:
# List every column label of df as a plain Python list, under a heading.
print("Column Names:", df.columns.to_list(), sep="\n")

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
print(f"Dataset shape: {df.shape}")

# Column information: dtypes, non-null counts and memory usage.
print("\n Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after the report.
df.info()



In [None]:
# Summary statistics table produced by DataFrame.describe(), under a heading.
print("\n  Descriptive Statistics:", df.describe(), sep="\n")

In [None]:
# Count of missing entries per column (isna() is the same check as isnull()).
print("\n Missing Values:", df.isna().sum(), sep="\n")

In [None]:
# How many rows carry each 'Class' label: first as absolute counts, then as
# fractions of all rows.
class_column = df['Class']
print("\n Class Distribution:", class_column.value_counts(), sep="\n")
print(
    "\nClass Distribution (Normalized):",
    class_column.value_counts(normalize=True),
    sep="\n",
)

# Bar chart of the number of rows per 'Class' value, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Class', ax=ax)
ax.set_title("Class Distribution (0 = Not Fraud, 1 = Fraud)")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

# Pairwise correlation between all columns of df (no column subset is taken),
# rendered as an un-annotated heatmap on a diverging colour map.
corr = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# Histogram of 'Amount' in 20 bins, coloured by 'Class', with a KDE curve
# overlaid for each class.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='Amount', hue='Class', bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Transaction Amount by Class")
ax.set_xlabel("Amount")
ax.set_ylabel("Frequency")
plt.show()

# Histogram of 'Time' in 100 bins, coloured by 'Class'; no KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='Time', hue='Class', bins=100, kde=False, ax=ax)
ax.set_title("Transaction Time Distribution by Class")
ax.set_xlabel("Time (seconds)")
ax.set_ylabel("Number of Transactions")
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Read the source file again into its own frame for the preprocessing stage.
df_v1 = pd.read_csv('creditcard.csv')

# New frame without the 'Time' column; df_v1 itself is left unchanged.
df_v2 = df_v1.drop(columns='Time')

# Standardise 'Amount' with a scaler fitted on every row. The fitted scaler is
# kept in scaler_amount so the same transform can be applied again later.
scaler_amount = StandardScaler()
amount_scaled = scaler_amount.fit_transform(df_v2[['Amount']])
df_v2['Amount'] = amount_scaled

# --- Re-project V1..V28 with PCA ---
# NOTE(review): both the scaler and the PCA below are fitted on all rows; if a
# train/test split is applied downstream, confirm this is intended.

# Labels of the 28 feature columns: 'V1' ... 'V28'.
v_columns = [f'V{i}' for i in range(1, 29)]

# Standardise those columns before PCA; the fitted scaler stays in scaler_v.
scaler_v = StandardScaler()
v_scaled = scaler_v.fit_transform(df_v2[v_columns])

# All 28 components are kept, so no dimensionality reduction happens here.
pca = PCA(n_components=28)
v_pca = pca.fit_transform(v_scaled)

# Wrap the component matrix in a frame with columns 'PC1' ... 'PC28'.
pc_labels = [f'PC{i+1}' for i in range(28)]
v_pca_df = pd.DataFrame(v_pca, columns=pc_labels)

# Append 'Amount' and 'Class' to the right of the components. Their index is
# reset so the rows line up with v_pca_df's default 0..n-1 index.
tail_columns = df_v2[['Amount', 'Class']].reset_index(drop=True)
df_v2_pca = pd.concat([v_pca_df, tail_columns], axis=1)

# Report what was produced.
print("PCA applied to V1–V28.")
print("Final DataFrame shape:", df_v2_pca.shape)
print("Final columns:", df_v2_pca.columns.to_list())

# Separate the 'Class' label (y) from the remaining predictor columns (X).
y = df_v2_pca['Class']
X = df_v2_pca.drop(columns='Class')

print("\nFeatures shape:", X.shape)
print("Target shape:", y.shape)


In [10]:
# Write the PCA-transformed frame to CSV; index=False leaves the row index
# out of the file.
df_v2_pca.to_csv(path_or_buf="credit_card_PCA.csv", index=False)