# In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Silence all warnings for the rest of the run.
warnings.simplefilter("ignore")

# Load Data
# The CSVs are read from Google Drive, so Drive is mounted first
# (this only works inside a Colab runtime).
from google.colab import drive

drive.mount('/content/drive')
file_path_train = '/content/drive/My Drive/Colab/fraudTrain.csv'
file_path_test = '/content/drive/My Drive/Colab/fraudTest.csv'
# Both files carry their row index in the 'Unnamed: 0' column.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in (file_path_train, file_path_test)
)

# Exploratory Data Analysis
# Structural overview of the training frame: sample rows, dtypes, size,
# class balance, total missing values and duplicated rows. Apart from
# info(), these bare expressions only display output when run as the last
# statement of a notebook cell.
df_train.head(3)
df_train.info()
df_train.shape
df_train["is_fraud"].value_counts()
df_train.isna().sum().sum()
df_train.duplicated().sum()

# Gender overview: share of transactions per gender (pie, left) and
# transaction counts per gender split by the fraud label (bars, right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
pie_ax, count_ax = axes
transactions_per_gender = df_train.groupby('gender')['is_fraud'].count()
transactions_per_gender.plot.pie(explode=[0.1, 0.1], autopct="%1.1f%%", ax=pie_ax)
sns.countplot(x="gender", hue="is_fraud", data=df_train, ax=count_ax)
# Write each bar's height just above its top edge.
for bar in count_ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    count_ax.annotate(f'{bar_height}', (bar_centre, bar_height),
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

# Overall class balance of the target.
# NOTE(review): the "No"/"Yes" labels are applied positionally to the
# value_counts() order, which assumes class 0 is the more frequent one.
fraud_counts = df_train["is_fraud"].value_counts()
fraud_counts.plot.pie(labels=["No", "Yes"], autopct="%0.0f%%", figsize=(10, 6))
plt.title("is_fraud Counts")
plt.show()

# Feature Engineering
# Parse the transaction timestamp in both frames and derive the hour of day
# and the month as new columns.
for frame in (df_train, df_test):
    parsed_time = pd.to_datetime(frame['trans_date_trans_time'], format='mixed')
    frame['trans_date_trans_time'] = parsed_time
    frame['hour'] = parsed_time.dt.hour
    frame['month'] = parsed_time.dt.month

# Hour-of-day density for legitimate vs. fraudulent transactions, side by
# side on a shared y axis so the two shapes can be compared directly.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
hour_ticks = np.arange(24)
panels = (
    (ax1, 0, "orange", "Not Fraud"),
    (ax2, 1, "green", "Fraud"),
)
for panel_ax, fraud_flag, bar_color, panel_title in panels:
    subset = df_train[df_train["is_fraud"] == fraud_flag]
    sns.histplot(x='hour', data=subset, stat="density", bins=24, ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(hour_ticks)
plt.show()

# Data Pre-processing
# Drop identifier-like columns, address fields and the raw timestamp (hour
# and month were already derived from it) from both frames.
dropped_columns = ['first', 'unix_time', 'dob', 'cc_num', 'zip', 'city', 'street', 'state', 'trans_num', 'trans_date_trans_time']
df_train = df_train.drop(dropped_columns, axis=1)
df_test = df_test.drop(dropped_columns, axis=1)

# Remove the 'fraud_' marker from merchant names. The same clean-up is applied
# to df_test so both frames stay in the same format and the encoders fitted
# below see matching category values.
df_train['merchant'] = df_train['merchant'].str.replace('fraud_', '', regex=False)
df_test['merchant'] = df_test['merchant'].str.replace('fraud_', '', regex=False)

# Data Encoding
gender_codes = {'F': 0, 'M': 1}
df_train['gender'] = df_train['gender'].map(gender_codes)
df_test['gender'] = df_test['gender'].map(gender_codes)

# Weight-of-evidence encoding: each encoder is fitted on the training frame
# (it needs the is_fraud target) and then re-used to transform df_test, so
# the hold-out frame is encoded with training statistics only.
# NOTE(review): 'lat' is numeric; without an explicit cols= argument
# WOEEncoder appears to encode only object/categorical columns, so 'lat' may
# pass through unchanged - confirm the intent.
for col in ['job', 'merchant', 'category', 'lat', 'last']:
    encoder = WOEEncoder()
    df_train[col] = encoder.fit_transform(df_train[col], df_train['is_fraud'])
    df_test[col] = encoder.transform(df_test[col])

# Down-Sampling and Scaling
# One seed for both the down-sampling and the train/test split, so the
# balanced sample (and therefore every reported accuracy) is reproducible.
RANDOM_STATE = 65

# Balance the classes by sampling, without replacement, as many non-fraud
# rows as there are fraud rows.
major_class = df_train[df_train["is_fraud"] == 0]
minor_class = df_train[df_train["is_fraud"] == 1]
major_class_downsampled = resample(
    major_class,
    replace=False,
    n_samples=len(minor_class),
    random_state=RANDOM_STATE,
)
balanced_df = pd.concat([minor_class, major_class_downsampled], axis=0)

X = balanced_df.drop("is_fraud", axis=1)
y = balanced_df["is_fraud"]

# NOTE(review): the target-based encodings earlier in this file were fitted
# on all of df_train before this split, so rows that end up in X_test already
# influenced the encoded features; accuracies measured on this split are
# likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Fit the scaler on the training split only and re-use it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Machine Learning Model Training
def _fit_and_score(estimator):
    """Fit *estimator* on the training split and score it on the test split.

    Returns a ``(estimator, predictions, accuracy)`` tuple, where
    ``predictions`` are the labels predicted for ``X_test`` and ``accuracy``
    is ``accuracy_score(y_test, predictions)``.
    """
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    return estimator, predicted, accuracy_score(y_test, predicted)


# Logistic Regression
lr_model, lr_predictions, lr_accuracy = _fit_and_score(LogisticRegression())

# Support Vector Machine
svc_model, svc_predictions, svc_accuracy = _fit_and_score(LinearSVC())

# Gaussian Naive Bayes
gnb_model, gnb_predictions, gnb_accuracy = _fit_and_score(GaussianNB())

# Decision Tree (depth 1, i.e. a single split)
dt_model, dt_predictions, dt_accuracy = _fit_and_score(
    DecisionTreeClassifier(max_depth=1, random_state=0)
)

# Random Forest
rf_model, rf_predictions, rf_accuracy = _fit_and_score(
    RandomForestClassifier(n_estimators=100, random_state=0)
)

# XGBoost
xgb_model, xgb_predictions, xgb_accuracy = _fit_and_score(
    XGBClassifier(random_state=0)
)

# Results Visualization
# Collect the test-split accuracy of every model in one table; the mapping
# keeps each display name next to its score.
accuracy_by_algorithm = {
    'XGBClassifier': xgb_accuracy,
    'RandomForest': rf_accuracy,
    'DecisionTree': dt_accuracy,
    'LogisticRegression': lr_accuracy,
    'SVM': svc_accuracy,
    'GaussianNB': gnb_accuracy,
}
algorithms = list(accuracy_by_algorithm)
accuracies = list(accuracy_by_algorithm.values())
results_df = pd.DataFrame({'Algorithm': algorithms, 'Accuracy': accuracies})

# Bar chart of accuracy per algorithm, with the y axis fixed to [0, 1].
plt.figure(figsize=(7, 5))
plt.bar(results_df['Algorithm'], results_df['Accuracy'], color='skyblue')
plt.title('Accuracy of Different Algorithms')
plt.xlabel('Algorithm')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.xticks(rotation=45)
# NOTE(review): axis='x' draws vertical grid lines; horizontal ones
# (axis='y') may have been intended for reading bar heights.
plt.grid(axis='x')
plt.tight_layout()
plt.show()