In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier

df = pd.read_csv("fraudTest.csv")

print("Original data shape:", df.shape)  

df = df.drop_duplicates()
print("Data shape after dropping duplicates:", df.shape)  

imputer = SimpleImputer(strategy="most_frequent")
df[df.columns] = imputer.fit_transform(df)
print("Data shape after imputing missing values:", df.shape)  

categorical_features = ["category", "gender", "job"]  
numerical_features = ["amt", "city_pop", "lat", "long", "merch_lat", "merch_long"]
target = "is_fraud"

df[target] = pd.to_numeric(df[target], errors='coerce').fillna(0).astype(int)
print("Unique values in target column:", df[target].unique())  
print("Data shape after processing target column:", df.shape)  

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

X = df[numerical_features + categorical_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Original data shape: (555719, 23)
Data shape after dropping duplicates: (555719, 23)
Data shape after imputing missing values: (555719, 23)
Unique values in target column: [0 1]
Data shape after processing target column: (555719, 23)
Training data shape: (444575, 9) (444575,)
Test data shape: (111144, 9) (111144,)


In [12]:
log_reg = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])
log_reg.fit(X_train, y_train)

In [13]:
random_forest = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])
random_forest.fit(X_train, y_train)

In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

xgboost = XGBClassifier(n_estimators=50, random_state=42, use_label_encoder=False, eval_metric='mlogloss')

xgboost.fit(X_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:amt: object, city_pop: object, lat: object, long: object, merch_lat: object, merch_long: object, category: object, gender: object, job: object

In [None]:
log_reg.score(X_test, y_test)
random_forest.score(X_test, y_test)
xgboost.score(X_test,y_test)

In [None]:
ensemble_model = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('rf', random_forest),
    ('xg', xgboost)
], voting='soft')
ensemble_model.fit(X_train, y_train)

In [None]:
y_pred = ensemble_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

y_pred = ensemble_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', linewidths=1, linecolor='black')

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

bins = [0, 10, 50, 100, 500, 1000, 5000, np.inf]
labels = ['<10', '10-50', '50-100', '100-500', '500-1000', '1000-5000', '>5000']

df['amount_group'] = pd.cut(df['amt'], bins=bins, labels=labels)

fraud_risk = df.groupby('amount_group')['is_fraud'].mean() * 100  

plt.figure(figsize=(10, 5))
fraud_risk.plot(kind='bar', color='red', alpha=0.75)
plt.xlabel("Transaction Amount Range")
plt.ylabel("Fraud Risk (%)")
plt.title("Fraud Risk Percentage by Transaction Amount")
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import matplotlib.pyplot as plt

profession_fraud_risk = df.groupby('job')['is_fraud'].mean().sort_values(ascending=False) * 100

plt.figure(figsize=(12, 6))
profession_fraud_risk.head(10).plot(kind='bar', color='red')
plt.xlabel("Profession")
plt.ylabel("Fraud Risk (%)")
plt.title("Top 10 High-Risk Professions for Fraud")
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()


In [None]:
fraud_data = df[df['is_fraud'] == 1]

city_fraud_count = fraud_data['city'].value_counts().head(10)

plt.figure(figsize=(12, 6))
city_fraud_count.plot(kind='bar', color='blue', alpha=0.75)
plt.xlabel("City")
plt.ylabel("Number of Fraud Cases")
plt.title("Top 10 Cities with Most Fraud Cases")
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [1]:
import matplotlib.pyplot as plt

fraud_data = df[df['is_fraud'] == 1]

category_fraud_amount = fraud_data.groupby('category')['amt'].sum().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
category_fraud_amount.plot(kind='bar', color='red', alpha=0.75)
plt.xlabel("Category")
plt.ylabel("Total Fraud Amount ($)")
plt.title("Total Fraud Amount by Category")
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

NameError: name 'df' is not defined