<a href="https://colab.research.google.com/github/robinacharya/Fraud-Detection-Hackathon/blob/main/Fraud_Detection_ZS_Hackathon_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'zs-hackathon-dataset-eda:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4621076%2F7874773%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240713%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240713T155327Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8d1cca84c9e37392f25d195a7fc3f92ad54c12f3c84a6423228b0e01ce8e2e1f02e0e0fdc5538286624945f58802047c0c81ed9e9680c59fdcad9decde80bc934e641353d8d403f0801050fc025b22152303993ab3deb239986751032bf25ae812b26aafc74dce2f57f96ef738635dae189576c24663a6947ac66f7180dc37e17ed8b5625fbc647c4cbbefd9161290628f349d5ed9adf84979fb3adcb271d6c01fe3867019cbaa6fc51c7b01a7c36254d56b7d713f61dab146ab803a1a92b686a2aea5f4555256836442775adb4072efc3d604d50d9e5539bf80fd99d25fc3716c4ca072c828e40f30bc4fe93cd86fb914ffd91a5f97dcfbf8b6e43c863106c6'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded download URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the URL part can never break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the
            # progress bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # replace the imported module for every later iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Table of Contents<a id="back-to-table-of-contents"></a>

1. [Introduction](#introduction)
1. [Problem Description](#problem-description)
1. [Import Libraries](#import-libraries)
1. [Download Datasets](#download-dataset)
1. [Data Read](#data-read)
1. [EDA](#eda)
1. [Feature Engineering](#feature-engineering)
   - [Feature Creation](#feature-creation)   
   - [Feature Selection](#feature-selection)
   - [Null Imputation](#null-imputation)
1. [Preparing to Modeling](#preparing-to-modeling)
   - [Encoding Categorical Features](#encoding-categorical-features)
   - [Creation of Training and Validation Sets](#creation-of-training-and-validaiton-sets)
1. [Baseline models and test for all features](#baseline-models-and-test-for-all-features)
   - [Logistic Regression](#baseline-logistic-regression)
   - [Decision Tree Classifier](#baseline-decision-tree-classifier)
   - [Random Forests Classifier](#baseline-random-forests-classifier)
   - [XGB Classifier](#baseline-xgb-classifier)
   - [LGBM Classifier](#baseline-lgbm-classifier)
1. [Model with Hyperparameter Tuning](#models-with-hyperparameter-tuning)
   - [Feature Importance](#feature-importance)
1. [Final Model with Important Features](#final-models-with-important-features)
   - [K-Fold Validation](#k-fold-validation)
   - [Final Model Prediction](#final-model-prediction)
1. [Conclusion](#conclusion)

# Introduction <a id="introduction"></a>
[Back to Table of Contents](#back-to-table-of-contents)

We are living in the 21st century, and things around us are evolving swiftly. Technological
advancements have made our lives a lot easier in most respects, especially by streamlining
payments. Thanks to modern banking services, making a payment has never been easier. However,
this growth in technology also invites more fraudulent practices. With so many payments happening
all around, it is hard to track fraudulent transactions and tell them apart from legitimate ones.
This challenge aims to solve that problem. The goal is to create a solution that tracks
these payments and identifies the fraudulent transactions using the information provided.

# Problem Description <a id="problem-description"></a>
[Back to Table of Contents](#back-to-table-of-contents)

A college has introduced a smart system which can track all the transactions of each student:
transactions to each other, payments to the college canteen and shops, withdrawal of cash from
the college ATM, etc. The payments can be made using online methods (net banking, cards, UPI) or
offline (using Debit/Credit Cards at POS centers). The idea behind tracking the student payments
was to reduce the ambiguity about defaulters among the college students, making the transaction
process smooth and safe for the students. After the system was built, a few students hacked into
the system database and started misusing it for their own benefit. To tackle the problem, an
existing software was brought in which tracks the transactions, predicts the fraudulent ones, and
backtracks to the student who performed the transaction (the hacker). This software was meant to make
the system more robust against these hackers. However, it was not entirely successful,
as it only flagged the transactions where the amount was more than ₹ 200,000.
**The task is to create a model using the data provided, which takes in the student and transaction
information and predicts whether the transaction is a legitimate transaction or a fraudulent
transaction.**

# Import Libraries <a id="import-libraries"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import shap
from imblearn.combine import SMOTEENN
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, precision_recall_curve, auc, f1_score, confusion_matrix, average_precision_score
from datetime import datetime, timedelta
import math
from matplotlib_venn import venn2
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt import space_eval
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

# Download Datasets <a id="download-dataset"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# Data Read <a id="data-read"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**Reading all csv files**

In [None]:
# Load the four CSV tables from the dataset folder, one DataFrame per file.
Transactions, Students, Receiver, Sender = (
    pd.read_csv(f'/kaggle/input/zs-hackathon-dataset-eda/{table}.csv')
    for table in ('Transactions', 'Students', 'Receiver', 'Sender')
)

**Reading all excel files**

In [None]:
# Load the two hacker-check workbooks from the dataset folder.
Processed_Hacker_Check, Unprocessed_Hacker_Check = (
    pd.read_excel(f'/kaggle/input/zs-hackathon-dataset-eda/{workbook}.xlsx')
    for workbook in ('Processed_Hacker_Check', 'Unprocessed_Hacker_Check')
)

In [None]:
Sample_Submission = pd.read_csv('/kaggle/input/zs-hackathon-dataset-eda/Sample_Submission.csv')

# EDA <a id="eda"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**Transactions data profiling**

In [None]:
Transactions.head()

**The Transactions table contains every transaction, along with its sender, receiver, time, and other details about the transaction.**

In [None]:
Transactions.shape

# Let's explore each of the input data tables, starting with a **univariate analysis** of the transactions

In [None]:
# Number of transactions recorded for each transaction type.
grouped_df = Transactions.groupby(['transactionType']).size()

# Draw the counts as a bar chart and label it through the returned axes.
ax = grouped_df.plot(kind='bar', stacked=False, color=plt.cm.tab20.colors)
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
ax.set_title('Transaction Type Distribution')
ax.legend(['Count'], title='Legend')

plt.tight_layout()
plt.show()

**Insights from Transaction Type Distribution Chart:**
* The bar chart reveals significant variability in transaction types.
* "Cash Outflow" is notably predominant, indicating a high frequency of transactions.
* Conversely, "Debit to Account" emerges as the least frequent transaction type.
* "Money Payment" and "Cash Inflow" exhibit similar proportions, suggesting comparable frequencies.
* "Money Transfer" falls between these two, indicating a frequency similar to both.
* Overall, the visualization effectively showcases the distribution of transaction activities, emphasizing the varying prevalence of each type within the dataset.

# Let's do the same for transaction device

In [None]:
# Number of transactions recorded for each transaction device.
grouped_df = Transactions.groupby(['transaction_device']).size()

# Pie chart of the device shares; the equal axis keeps the pie circular.
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.pie(
    grouped_df,
    labels=grouped_df.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
)
pie_ax.set_title('Transaction Device Distribution')
pie_ax.axis('equal')

plt.show()

**Insights from Transaction Device Distribution Pie Chart:**

* The pie chart illustrates uniform distribution among transaction devices.
* ATM card, Net banking, Check, and UPI contribute equally to transaction activities.

**How about the comments?**

In [None]:
# Number of transactions recorded for each comment value.
grouped_df = Transactions.groupby(['comments']).size()

# Draw the counts as a bar chart and label it through the returned axes.
ax = grouped_df.plot(kind='bar', stacked=False, color=plt.cm.tab20.colors)
ax.set_xlabel('Comments')
ax.set_ylabel('Count')
ax.set_title('Comments Distribution')
ax.legend(['Count'], title='Legend')

plt.tight_layout()
plt.show()

**Insights from Comments Bar Chart:**

* Uniform distribution is evident among comments.
* Each comment category exhibits equal frequency.

In [None]:
Students.head()

In [None]:
Students.shape

In [None]:
Receiver.head()

In [None]:
Receiver.shape

In [None]:
Sender.head()

In [None]:
Sender.shape

In [None]:
Processed_Hacker_Check.head()

In [None]:
Processed_Hacker_Check.shape

# Let's start merging all the data into one table and **explore other relationships**

**Here, we're getting a list of unique Transaction IDs (TID) from the Transactions table, and also unique identifiers from the Sender and Receiver tables. This helps us gather all the different transactions and people involved in the system, making it easier to keep track of who's doing what.**

In [None]:
Transactions_ID=set(Transactions['TID'])
Sender_ID=set(Sender['TID'])
Receiver_ID=set(Receiver['TID'])

**Here, we're combining all Transaction IDs (TIDs) to create a comprehensive dataset. This allows us to conduct Exploratory Data Analysis (EDA) and extract valuable insights from the data. By doing this, we can enhance our feature engineering process, making it more efficient and effective.**

In [None]:
All_Given_IDs=Transactions_ID.union(Sender_ID).union(Receiver_ID)

**Here, we're making a new column that includes all Transaction IDs (TIDs) found in the Transaction, Sender, and Receiver tables.**

In [None]:
df = pd.DataFrame(All_Given_IDs, columns=['All_IDs'])

**Here, we're building a comprehensive dataframe containing all Transaction IDs (TIDs), and then connecting it with the transactions data using a left join.**

In [None]:
master_df=df.merge(Transactions,left_on="All_IDs",right_on="TID",how="left")
master_df.head()

**Now, we're putting together all the tables with our main dataframe, so that we have everything combined into one single table.**

In [None]:
# Left-join the sender, receiver and student tables onto the master frame.
# Each table's columns get a suffix so equal names from different tables stay distinct;
# the Students table is joined twice, once for the origin and once for the destination student.
master_df = (
    master_df
    .merge(Sender.add_suffix('_send'), left_on="All_IDs", right_on="TID_send", how="left")
    .merge(Receiver.add_suffix('_recieve'), left_on="All_IDs", right_on="TID_recieve", how="left")
    .merge(Students.add_suffix('_orig'), left_on="IdOfOrigStudent", right_on="StudentId_orig", how="left")
    .merge(Students.add_suffix('_dest'), left_on="IdOfDestStudent", right_on="StudentId_dest", how="left")
)

In [None]:
master_df.head()

In [None]:
# Keep only the master rows whose ID appears in each hacker-check table (inner joins).
Processed_df = pd.merge(
    master_df, Processed_Hacker_Check,
    left_on="All_IDs", right_on="TID", how="inner",
)
Unprocessed_df = pd.merge(
    master_df, Unprocessed_Hacker_Check,
    left_on="All_IDs", right_on="TID", how="inner",
)

In [None]:
# Stack both joined frames into one and drop the duplicated TID key columns
# left behind by the merges.
processed_master_df = pd.concat(
    [Processed_df, Unprocessed_df], ignore_index=True
).drop(columns=['TID_x', 'TID_y', "TID_send", "TID_recieve"])

**Exciting news! We've successfully computed our master dataframe, which now includes data from all relevant tables.**

# *Quality Check Alert !!*

**We're performing a quality check to verify the number of rows in three dataframes: Processed_df, Unprocessed_df, and processed_master_df.**

In [None]:
# Row/column counts of the two joined frames and of their concatenation.
for frame in (Processed_df, Unprocessed_df, processed_master_df):
    print(frame.shape)

In [None]:
processed_master_df.isHacker.value_counts()

**We found that in our processed_master_df we have 3554 hackers**

In [None]:
processed_master_df.head()

# **Preparing Fraud Master DataFrame for Insight Analysis:**

* Creating a fraud master dataframe to analyze fraudulent transactions.
* Aim to understand the types of transactions targeted by hackers.
* This analysis will facilitate more efficient and effective feature engineering.
* Identifying patterns and characteristics associated with fraudulent activities.
* Insights gained will help enhance fraud detection and prevention strategies.

In [None]:
fraud_master_df = processed_master_df[processed_master_df['isHacker']==1]

In [None]:
fraud_master_df.head()

**Conducting a quality check to ensure that we've accurately captured all instances of hacker activity.**

In [None]:
fraud_master_df.shape

**Next, we'll thoroughly examine the fraud_master_df to uncover insights regarding patterns in fraudulent transactions.**

**Here, we're exploring if we can glean any insights from the transaction types.**

In [None]:
# Number of fraudulent transactions for each transaction type.
grouped_df = fraud_master_df.groupby(['transactionType']).size()

# Bar chart of the counts, labelled through the returned axes.
ax = grouped_df.plot(kind='bar', stacked=False, color='skyblue')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count of Hackers')
ax.set_title('Transaction Type Distribution')
ax.legend(['Count of Hackers'], title='Legend')

# Render the figure.
plt.tight_layout()
plt.show()

**Insights from Transaction Type Analysis:**

**Fraudulent transactions are exclusively found in the "Cash Outflow" and "Money Transfer" transaction types.**
* This discovery sheds light on potential indicators for identifying fraudulent activities.
* Focusing on these transaction types can aid in pinpointing and mitigating fraudulent behavior more effectively.

**Now we're investigating the comments section to see if it holds any useful information or patterns that could provide insights.**

In [None]:
# Number of fraudulent transactions for each comment value.
grouped_df = fraud_master_df.groupby(['comments']).size()

# Pie chart of the comment shares; the equal axis keeps the pie circular.
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.pie(
    grouped_df,
    labels=grouped_df.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.tab20.colors,
)
pie_ax.set_title('Comments Distribution')
pie_ax.axis('equal')

# Render the figure.
plt.show()

**Insights from Comments Pie Chart:**

* Uniform distribution is evident among comments.
* Each comment category exhibits equal frequency.

**Now, we're going to examine the transaction devices to see if we can discover any useful insights.**

In [None]:
# Number of fraudulent transactions for each transaction device.
grouped_df = fraud_master_df.groupby(['transaction_device']).size()

# Bar chart of the counts, labelled through the returned axes.
ax = grouped_df.plot(kind='bar', stacked=False, color=plt.cm.tab20.colors)
ax.set_xlabel('Transaction Device')
ax.set_ylabel('Count of Hackers')
ax.set_title('Transaction Device Distribution')
ax.legend(['Count of Hackers'], title='Legend')

# Render the figure.
plt.tight_layout()
plt.show()

**Unfortunately, we didn't uncover any insights as all devices showed an equal contribution to fraudulent transactions.**

In [None]:
processed_master_df.head()

**Next, we'll explore the relationship between the IdOfOrigStudent and IdOfDestStudent to see if we can uncover any insights about fraudulent transactions or hacker activity.**

**1.) We observe that the origin and destination student IDs start with a letter prefix. Let's explore this further.**

In [None]:
# Work on an explicit copy: new columns are added below, and assigning into a
# selection of processed_master_df is a chained-assignment hazard in pandas.
sample_df = processed_master_df[['IdOfOrigStudent', 'IdOfDestStudent', 'isHacker']].copy()

# First character of each student ID (the letter prefix examined in this section).
sample_df['origin_alphabet'] = sample_df['IdOfOrigStudent'].str[0]
sample_df['dest_alphabet'] = sample_df['IdOfDestStudent'].str[0]

In [None]:
# Rows and flagged rows per (origin prefix, destination prefix) pair.
# The aggregations are passed as a list, not a set, so the output columns
# always come out in the order count, sum.
sample_df.groupby(['origin_alphabet', 'dest_alphabet'])['isHacker'].agg(['count', 'sum'])

**Indeed, we notice that there are only two types of transactions: C -> C (Children to Children) and C -> M (Children to Merchant). Interestingly, fraudulent transactions are exclusively of the C -> C type, indicating that Children-to-Children transactions are more susceptible to fraudulent activity.**

Let's plot the data to gain a clearer understanding.

In [None]:
import matplotlib.pyplot as plt

# Row count and isHacker sum for every (origin prefix, destination prefix) pair.
grouped_data = sample_df.groupby(['origin_alphabet', 'dest_alphabet'])['isHacker'].agg(['count', 'sum']).reset_index()

bar_positions = range(len(grouped_data))
pair_labels = grouped_data['origin_alphabet'] + ' - ' + grouped_data['dest_alphabet']

# Both series are drawn at the same x positions, so the 'sum' bars overlay the 'count' bars.
plt.figure(figsize=(6, 4))
for value_column, bar_color, bar_label in (
    ('count', 'skyblue', 'Count of isHacker'),
    ('sum', 'orange', 'Sum of isHacker'),
):
    plt.bar(bar_positions, grouped_data[value_column], color=bar_color, label=bar_label)
plt.xlabel('Combination Index')
plt.ylabel('Count/Sum')
plt.title('Count and Sum of isHacker by Origin and Destination Alphabet')
plt.xticks(bar_positions, pair_labels, rotation=90)
plt.legend()
plt.tight_layout()
plt.show()

**It is evident that Children-Children transactions contain all the fraudulent transactions, whereas Children-Merchant transactions contain only non-fraudulent ones.**

In [None]:
import matplotlib.pyplot as plt

# Sum of isHacker for every (origin prefix, destination prefix) pair.
grouped_data = sample_df.groupby(['origin_alphabet', 'dest_alphabet'])['isHacker'].sum().reset_index()

bar_positions = range(len(grouped_data))
pair_labels = grouped_data['origin_alphabet'] + ' - ' + grouped_data['dest_alphabet']

plt.figure(figsize=(6, 4))
plt.bar(bar_positions, grouped_data['isHacker'], color='red')
plt.xlabel('Combination Index')
plt.ylabel('Count of Hacker Occurrences')
plt.title('Count of Hacker Occurrences by Origin and Destination Alphabet')
plt.xticks(bar_positions, pair_labels, rotation=90)
plt.tight_layout()
plt.show()


**2.) Let's investigate whether there's any relationship between transactions involving the IdOfOrigStudent and IdOfDestStudent. If we identify a relationship, we can consider creating a graph or network to visualize the connections between fraudulent transactions or potentially form communities within the data.**

In [None]:
Orig_ID = set(processed_master_df['IdOfOrigStudent'])
Dest_ID = set(processed_master_df['IdOfDestStudent'])

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Distinct student IDs seen on the origin side and on the destination side.
venn_labels = ('IdOfOrigStudent', 'IdOfDestStudent')
orig_id, dest_id = (set(processed_master_df[id_column]) for id_column in venn_labels)

# Two-set Venn diagram showing how much the two ID sets overlap.
venn2([orig_id, dest_id], venn_labels)
plt.title('Venn Diagram of IdOfOrigStudent and IdOfDestStudent')
plt.show()



**Unfortunately, our analysis did not reveal any significant relationships or community structures between transactions involving the IdOfOrigStudent and IdOfDestStudent. Thus, this exploration did not yield any fruitful insights for our analysis.**

# Feature Engineering <a id="feature-engineering"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
processed_master_df.head()

In [None]:
processed_master_df.shape

In [None]:
processed_master_df.describe()

# Feature Creation <a id="feature-creation"></a>
[Back to Table of Contents](#back-to-table-of-contents)

# **Now, let's generate some useful features based on the insights gathered from our exploratory data analysis (EDA).**

Let's extract the first alphabet from the IdOfOrigStudent and IdOfDestStudent, as our analysis revealed a significant relationship between children-to-children (C-C) and children-to-merchant (C-M) transactions in fraudulent activities.

In [None]:
# Store the first character of the origin and destination student IDs as new columns.
for role in ('Orig', 'Dest'):
    processed_master_df[f'First_IdOf{role}Student'] = processed_master_df[f'IdOf{role}Student'].str[0]

Let's utilize the hourstep to extract various features such as 'date', 'week_number', 'month', 'day_of_month', 'day_of_week', 'hour_of_day', and 'part_of_day', categorized as {Morning, Afternoon, Evening, Night}.

In [None]:
# Reference timestamp: hourStep is treated as hours elapsed since this date.
start_date = datetime(2024, 1, 1)

# Rows without an hourStep get a sentinel beyond the observed range (max + 999).
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# operates on an intermediate object and is not guaranteed to update the frame.
processed_master_df['hourStep'] = processed_master_df['hourStep'].fillna(
    processed_master_df['hourStep'].max() + 999
)

# Vectorized equivalent of adding timedelta(hours=x) to start_date row by row.
processed_master_df['date'] = pd.Timestamp(start_date) + pd.to_timedelta(processed_master_df['hourStep'], unit='h')
processed_master_df['date'] = pd.to_datetime(processed_master_df['date'])

# Calendar features derived from the reconstructed timestamp.
processed_master_df['week_number'] = processed_master_df['date'].dt.isocalendar().week
processed_master_df['month'] = processed_master_df['date'].dt.month
processed_master_df['day_of_month'] = processed_master_df['date'].dt.day
processed_master_df['day_of_week'] = processed_master_df['date'].dt.dayofweek
processed_master_df['hour_of_day'] = processed_master_df['date'].dt.hour

def get_part_of_day(hour):
    """Return the part of the day that an hour of the day falls into.

    Args:
        hour: Hour of the day, expected in the range 0-23.

    Returns:
        'Morning' for 5-11, 'Afternoon' for 12-16, 'Evening' for 17-20,
        and 'Night' for every other hour (21-23 and 0-4).
    """
    if 5 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 17:
        # The original returned 'Night' here and 'Afternoon' in the fallback,
        # i.e. the two labels were swapped.
        return 'Afternoon'
    if 17 <= hour < 21:
        return 'Evening'
    return 'Night'

processed_master_df['part_of_day'] = processed_master_df['hour_of_day'].apply(get_part_of_day)


In [None]:
grouped_count = processed_master_df.groupby(['part_of_day'])['isHacker'].agg(['count', 'sum'])
grouped_count

**We can observe that the majority of our fraudulent transactions occur during the night, which aligns with our expectations.**

**Now, we're generating additional features based on the Old Balance and New Balance of both the Origin and Destination students.**

In [None]:
# Balance change (old minus new) on the sender side and on the receiver side.
processed_master_df["old_send_new_send"] = processed_master_df["oldBalanceOfOrigStudent_send"].sub(
    processed_master_df["newBalanceOfOrigStudent_send"]
)
processed_master_df["old_rec_new_rec"] = processed_master_df["oldBalanceOfDestStudent_recieve"].sub(
    processed_master_df["newBalanceOfDestStudent_recieve"]
)

# Difference between each balance change and the transaction amount recorded on that side.
processed_master_df["old_rec_new_rec_amt_rec"] = processed_master_df["old_rec_new_rec"].sub(
    processed_master_df["transactionAmount_recieve"]
)
processed_master_df["old_send_new_send_amt_rec"] = processed_master_df["old_send_new_send"].sub(
    processed_master_df["transactionAmount_send"]
)

**Converting hourstep into sine and cosine to get the cyclic nature of time**

In [None]:
# Map hourStep modulo 24 onto an angle in radians (one full turn per 24 hours),
# then store its cosine and sine as features.
hour_in_cycle = processed_master_df['hourStep'] % 24
processed_master_df['hourstep_radians'] = hour_in_cycle * (2 * np.pi / 24)
for trig_column, trig_fn in (('cosine', np.cos), ('sine', np.sin)):
    processed_master_df[trig_column] = trig_fn(processed_master_df['hourstep_radians'])

# Hourly Transaction Tracking

* Calculate the number of transactions that precede the current one for each 'IdOfOrigStudent' and each 'IdOfDestStudent'.
* For transactions with isHacker = 1, we create a separate flag column and take its cumulative sum per student, excluding the current transaction.

In [None]:
# The same running counts are built twice: once per origin student, once per destination student.
# NOTE(review): the hacker counts are derived from the target column isHacker — confirm
# that using it as a feature source is acceptable for the evaluation setup.
for student_col in ('IdOfOrigStudent', 'IdOfDestStudent'):
    flag_col = f'is_hacker_transaction_{student_col}'

    # Order each student's transactions by hourStep so the counts run forward in time.
    processed_master_df = processed_master_df.sort_values(by=[student_col, 'hourStep'])

    # Number of this student's transactions that come before the current row.
    processed_master_df[f'num_transactions_{student_col}'] = processed_master_df.groupby([student_col]).cumcount()

    # Running total of rows flagged isHacker == 1 for this student, minus the
    # current row's own flag so only earlier rows are counted.
    processed_master_df[flag_col] = processed_master_df['isHacker'] == 1
    flags_so_far = processed_master_df.groupby([student_col])[flag_col].cumsum()
    processed_master_df[f'num_hacker_transactions_{student_col}'] = flags_so_far - processed_master_df[flag_col]



In [None]:
processed_master_df.drop(["is_hacker_transaction_IdOfOrigStudent",'is_hacker_transaction_IdOfDestStudent'],axis=1,inplace=True)

# Feature Selection <a id="feature-selection"></a>
[Back to Table of Contents](#back-to-table-of-contents)

Here we are eliminating the Column ID as it does not significantly contribute to the model's predictive capabilities, hence simplifying the dataset.

In [None]:
processed_master_df.columns

In [None]:
columns_to_keep=['All_IDs','transactionType',
       'hourStep', 'transaction_device', 'comments',
       'oldBalanceOfOrigStudent_send', 'newBalanceOfOrigStudent_send',
       'transactionAmount_send', 'oldBalanceOfDestStudent_recieve',
       'newBalanceOfDestStudent_recieve', 'transactionAmount_recieve',
       'StudentAge_orig',
       'StudentYear_orig', 'TransactionFrequency_orig',
       'NumberOfTransactionsTillDate_orig', 'AccountType_orig',
       'StudentAge_dest',
       'StudentYear_dest', 'TransactionFrequency_dest',
       'NumberOfTransactionsTillDate_dest', 'AccountType_dest',
       'isHacker', 'First_IdOfOrigStudent',
       'First_IdOfDestStudent', 'week_number', 'month', 'day_of_month',
       'day_of_week', 'hour_of_day', 'part_of_day', 'old_send_new_send',
       'old_rec_new_rec', 'old_rec_new_rec_amt_rec',
       'old_send_new_send_amt_rec', 'hourstep_radians', 'cosine', 'sine',
       'num_transactions_IdOfOrigStudent',
       'num_hacker_transactions_IdOfOrigStudent',
       'num_transactions_IdOfDestStudent',
       'num_hacker_transactions_IdOfDestStudent']

In [None]:
# Take an explicit copy of the selected columns: df is modified in the cells
# that follow, and writing into a selection of processed_master_df is a
# chained-assignment hazard in pandas.
df = processed_master_df[columns_to_keep].copy()

In [None]:
df.head()

In [None]:
df.columns

# Null Imputation <a id="null-imputation"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**Making a copy of df, with its nulls intact, so that we can later use it with models that handle missing values by themselves, such as XGBoost.**

In [None]:
df_with_nulls = df.copy()

Here we are completing the dataset by substituting missing values with the mean in all numeric columns except for the output column, "isHacker." This ensures data integrity and prevents any potential confusion for the model during analysis.

In [None]:
# Numeric columns to impute; 'isHacker' and 'All_IDs' are excluded from the selection.
numeric_columns_except_is_Hacker = (
    df.select_dtypes(include=['number'])
    .columns
    .difference(['isHacker', 'All_IDs'])
)

# Replace every missing value with the mean of its own column.
column_means = df[numeric_columns_except_is_Hacker].mean()
df[numeric_columns_except_is_Hacker] = df[numeric_columns_except_is_Hacker].fillna(column_means)


In [None]:
df=df.drop("All_IDs",axis=1)
df.columns

In [None]:
columns_with_null = df.columns[df.isnull().any()].tolist()

In [None]:
columns_with_null

**Hence we have removed all the nulls from numeric columns**

# Encoding Categorical Features <a id="encoding-categorical-features"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**Now that we've removed all null values from the numeric columns, let's proceed to handle nulls in the categorical columns by encoding them.**

In [None]:
# One-hot encode every object/category column as integer 0/1 indicator columns.
# dummy_na keeps its default, so no separate indicator column is created for missing values.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df = pd.get_dummies(data=df, columns=categorical_cols, dtype=int)


In [None]:
df.head()

# Preparing to Modeling <a id="preparing-to-modeling"></a>
[Back to Table of Contents](#back-to-table-of-contents)

# Creation of Training and Validation Sets <a id="creation-of-training-and-validaiton-sets"></a>
[Back to Table of Contents](#back-to-table-of-contents)

* Our current df contains both the rows for which we already know the correct label and the rows for which we still need to make predictions.
* As a next step, we split the labelled rows into train and test sets, so that we can train our model and keep testing it until it performs well on the test set.
* Once the model is trained well, we can generate predictions for the unlabelled rows.

In [None]:
# Rows with a known isHacker value are used for training; the rest are the rows to predict.
has_label = df['isHacker'].notna()
given_df = df[has_label]
predict_df = df[~has_label]

for frame in (given_df, predict_df):
    print(frame.shape)

**Here we are separating our input variables, also known as features, from our target variable, which is the value we aim to predict, in order to train our model effectively. This division enables us to feed the features into the model to learn patterns and relationships, ultimately enabling it to make accurate predictions or classifications based on the provided features.**

In [None]:
X=given_df.drop('isHacker',axis=1)
y=given_df['isHacker']

In [None]:
columns_with_null = X.columns[X.isnull().any()].tolist()

In [None]:
columns_with_null

**Quality check to see if we have removed all the nulls from our features**

**Now we split our labelled data into train and test sets.**

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,stratify=y,train_size=0.8,test_size=0.2,random_state=0)
X_test = X_valid

In [None]:
y_train.value_counts()/len(y_train) * 100

# The Data is highly imbalanced. Our goal is to detect the fraudulent payments. For our model to be effective we need to balance the data first or select the model and its hyperparameters to balance the effect of class-Imbalance

# Baseline models and test for all features <a id="baseline-models-and-test-for-all-features"></a>
[Back to Table of Contents](#back-to-table-of-contents)

# Logistic Regression <a id="baseline-logistic-regression"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Initialize logistic regression model
logistic_model = LogisticRegression()

# Train the model
logistic_model.fit(X_train, y_train)

# Predict hard labels and positive-class probabilities on the validation set.
# X_valid is used for both so the scores always line up with y_valid.
y_pred_valid = logistic_model.predict(X_valid)
y_scores = logistic_model.predict_proba(X_valid)[:, 1]

# Threshold-dependent metrics, computed from the hard predictions
precision = precision_score(y_valid, y_pred_valid)
recall = recall_score(y_valid, y_pred_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
f1score = f1_score(y_valid, y_pred_valid)

# AUROC is a ranking metric: score it on probabilities, not on 0/1 labels,
# otherwise the ROC curve collapses to a single operating point.
auroc = roc_auc_score(y_valid, y_scores)

# Precision-recall curve. The curve arrays get their own names so they do not
# overwrite the scalar precision/recall reported below.
pr_precision, pr_recall, thresholds = precision_recall_curve(y_valid, y_scores)

# Compute AUC for the precision-recall curve
auc_pr = auc(pr_recall, pr_precision)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("AUROC:", auroc)
print("F1 Score:", f1score)
print("AUPRC:", auc_pr)

cm = confusion_matrix(y_valid, y_pred_valid)

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Decision Tree Classifier <a id="baseline-decision-tree-classifier"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Initialize decision tree model
decision_tree_model = DecisionTreeClassifier(random_state=0)

# Train the model
decision_tree_model.fit(X_train, y_train)

# Predict hard labels and positive-class probabilities on the validation set.
# X_valid is used for both so the scores always line up with y_valid.
y_pred_valid = decision_tree_model.predict(X_valid)
y_scores = decision_tree_model.predict_proba(X_valid)[:, 1]

# Threshold-dependent metrics, computed from the hard predictions
precision = precision_score(y_valid, y_pred_valid)
recall = recall_score(y_valid, y_pred_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
f1score = f1_score(y_valid, y_pred_valid)

# AUROC is a ranking metric: score it on probabilities, not on 0/1 labels,
# otherwise the ROC curve collapses to a single operating point.
auroc = roc_auc_score(y_valid, y_scores)

# Precision-recall curve. The curve arrays get their own names so they do not
# overwrite the scalar precision/recall reported below.
pr_precision, pr_recall, thresholds = precision_recall_curve(y_valid, y_scores)

# Compute AUC for the precision-recall curve
auc_pr = auc(pr_recall, pr_precision)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("AUROC:", auroc)
print("F1 Score:", f1score)
print("AUPRC:", auc_pr)

cm = confusion_matrix(y_valid, y_pred_valid)

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Random Forests Classifier <a id="baseline-random-forests-classifier"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Initialize Random Forest Classifier model
random_forest_model = RandomForestClassifier(random_state=0)

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict hard labels and positive-class probabilities on the validation set.
# X_valid is used for both so the scores always line up with y_valid.
y_pred_valid = random_forest_model.predict(X_valid)
y_scores = random_forest_model.predict_proba(X_valid)[:, 1]

# Threshold-dependent metrics, computed from the hard predictions
precision = precision_score(y_valid, y_pred_valid)
recall = recall_score(y_valid, y_pred_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
f1score = f1_score(y_valid, y_pred_valid)

# AUROC is a ranking metric: score it on probabilities, not on 0/1 labels,
# otherwise the ROC curve collapses to a single operating point.
auroc = roc_auc_score(y_valid, y_scores)

# Precision-recall curve. The curve arrays get their own names so they do not
# overwrite the scalar precision/recall reported below.
pr_precision, pr_recall, thresholds = precision_recall_curve(y_valid, y_scores)

# Compute AUC for the precision-recall curve
auc_pr = auc(pr_recall, pr_precision)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("AUROC:", auroc)
print("F1 Score:", f1score)
print("AUPRC:", auc_pr)

cm = confusion_matrix(y_valid, y_pred_valid)

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# XGB Classifier <a id="baseline-xgb-classifier"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Initialize XGBoost Classifier model
xgb_model = xgb.XGBClassifier()

# Train the model
xgb_model.fit(X_train, y_train)

# Predict hard labels and positive-class probabilities on the validation set.
# X_valid is used for both so the scores always line up with y_valid.
y_pred_valid = xgb_model.predict(X_valid)
y_scores = xgb_model.predict_proba(X_valid)[:, 1]

# Threshold-dependent metrics, computed from the hard predictions
precision = precision_score(y_valid, y_pred_valid)
recall = recall_score(y_valid, y_pred_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
f1score = f1_score(y_valid, y_pred_valid)

# AUROC is a ranking metric: score it on probabilities, not on 0/1 labels,
# otherwise the ROC curve collapses to a single operating point.
auroc = roc_auc_score(y_valid, y_scores)

# Precision-recall curve. The curve arrays get their own names so they do not
# overwrite the scalar precision/recall reported below.
pr_precision, pr_recall, thresholds = precision_recall_curve(y_valid, y_scores)

# Compute AUC for the precision-recall curve
auc_pr = auc(pr_recall, pr_precision)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("AUROC:", auroc)
print("F1 Score:", f1score)
print("AUPRC:", auc_pr)

cm = confusion_matrix(y_valid, y_pred_valid)

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# LGBM Classifier <a id="baseline-lgbm-classifier"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Initialize LGBM Classifier model
lgbm_model = lgb.LGBMClassifier()

# Train the model
lgbm_model.fit(X_train, y_train)

# Predict hard labels and positive-class probabilities on the validation set.
# X_valid is used for both so the scores always line up with y_valid.
y_pred_valid = lgbm_model.predict(X_valid)
y_scores = lgbm_model.predict_proba(X_valid)[:, 1]

# Threshold-dependent metrics, computed from the hard predictions
precision = precision_score(y_valid, y_pred_valid)
recall = recall_score(y_valid, y_pred_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
f1score = f1_score(y_valid, y_pred_valid)

# AUROC is a ranking metric: score it on probabilities, not on 0/1 labels,
# otherwise the ROC curve collapses to a single operating point.
auroc = roc_auc_score(y_valid, y_scores)

# Precision-recall curve. The curve arrays get their own names so they do not
# overwrite the scalar precision/recall reported below.
pr_precision, pr_recall, thresholds = precision_recall_curve(y_valid, y_scores)

# Compute AUC for the precision-recall curve
auc_pr = auc(pr_recall, pr_precision)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("AUROC:", auroc)
print("F1 Score:", f1score)
print("AUPRC:", auc_pr)

cm = confusion_matrix(y_valid, y_pred_valid)

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

Create a tabular view of all the model performances

# BEST - Model with Hyperparameter Tuning <a id="models-with-hyperparameter-tuning"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**This is used to select top features for our Final Model**

**Encoding Categorical Features**

In [None]:
# One-hot encode every object/category column; indicator columns are stored as ints.
categorical_cols = df_with_nulls.select_dtypes(include=['object', 'category']).columns
df_with_nulls = pd.get_dummies(data=df_with_nulls, columns=categorical_cols, dtype=int)


In [None]:
# from sklearn.preprocessing import StandardScaler

# columns_to_scale = df_with_nulls.select_dtypes(include=['float64', 'int64']).columns
# columns_to_exclude = ['All_IDs', 'isHacker']

# columns_to_scale = [col for col in columns_to_scale if col not in columns_to_exclude]

# # Standard scaling on numeric columns
# scaler = StandardScaler()
# df_with_nulls[columns_to_scale] = scaler.fit_transform(df_with_nulls[columns_to_scale])



In [None]:
# Preview the first rows of df_with_nulls.
df_with_nulls.head()

In [None]:
# Rows with a known isHacker value are the labelled data; the rest are the rows to score.
has_label = df_with_nulls['isHacker'].notna()
given_df = df_with_nulls[has_label]
predict_df = df_with_nulls[~has_label]

for frame in (given_df, predict_df):
    print(frame.shape)

In [None]:
# Preview the first rows of the labelled data.
given_df.head()

In [None]:
# Features exclude both the target and the All_IDs column; the target is isHacker.
y = given_df['isHacker']
X = given_df.drop(columns=['isHacker', 'All_IDs'])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Hold out 25% of the labelled rows for testing. The split is stratified on the
# target so both parts keep the same class ratio, matching the stratified split
# used for the baseline models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)

# Define the space of hyperparameters to search
space = {
    'max_depth': hp.choice('max_depth', range(3, 8)),  # Using range for max_depth
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators': 300,  # Fixed value, so not a hyperparameter to optimize
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10)
}

# Objective function to minimize
def objective(params):
    """Score one hyperparameter sample for the hyperopt search.

    Args:
        params: dict of XGBClassifier keyword arguments sampled from ``space``.

    Returns:
        dict with ``loss`` (negative F1, because fmin minimizes) and ``status``.
    """
    # n_estimators is fixed; build a new dict instead of mutating the sample.
    clf_params = {**params, 'n_estimators': 300}

    # Score on rows the classifier has not been fitted on. Scoring on the
    # fitting rows rewards the most overfitted configuration.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, stratify=y_train, random_state=42)

    clf = xgb.XGBClassifier(**clf_params)
    clf.fit(X_fit, y_fit)
    f1 = f1_score(y_val, clf.predict(X_val), average='binary')
    return {'loss': -f1, 'status': STATUS_OK}

# Search the space with TPE for 3 evaluations, recording every trial
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=3, trials=trials)

# fmin reports hp.choice entries as indices; space_eval maps them back to values
best_params = space_eval(space, best)
print("Best hyperparameters:", best_params)

# Fit a fresh classifier on the training split with the selected parameters
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)

# Hard predictions for both splits
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# The same four scorers are applied to the training and the test predictions
scorers = {
    'F1': f1_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
}
metrics_train = {label: scorer(y_train, y_pred_train) for label, scorer in scorers.items()}
metrics_test = {label: scorer(y_test, y_pred_test) for label, scorer in scorers.items()}

print("Training performance:", metrics_train)
print("Testing performance:", metrics_test)

# Positive-class probabilities drive the precision-recall curve and its area
y_scores = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
auc_pr = auc(recall, precision)
print('Area under the Precision-Recall curve:', auc_pr)

# Confusion matrix of the test predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_test)
predicted_labels = ['Predicted Negative', 'Predicted Positive']
actual_labels = ['Actual Negative', 'Actual Positive']

plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


# Feature Importance <a id="feature-importance"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**Next, we'll assess the feature importance to identify and remove unimportant features from our dataset. This will allow us to train our model more efficiently and help prevent overfitting.**

In [None]:
# Rescale the model's importances so they sum to 100
raw_importances = model.feature_importances_
feature_importances = raw_importances * 100 / sum(raw_importances)

feature_names = X_train.columns.tolist()

# One row per feature, most important first, with a running total of importance
feature_imp = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
feature_imp['Importance_cum_sum'] = feature_imp['Importance'].cumsum()

In [None]:
# Show the first 40 rows of the importance table.
feature_imp.head(40)

**Columns to select for final feature**

In [None]:
# Keep the features whose cumulative importance is at most 95
within_budget = feature_imp['Importance_cum_sum'] <= 95
important_features = feature_imp.loc[within_budget, 'Feature'].tolist()


In [None]:
# Display the full importance table.
feature_imp

# Final Model with Important Features <a id="final-models-with-important-features"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Hold out 25% of the labelled rows, restricted to the selected features. The
# split is stratified on the target so both parts keep the same class ratio,
# matching the stratified split used for the baseline models.
X_train, X_test, y_train, y_test = train_test_split(
    X[important_features], y, test_size=0.25, stratify=y, random_state=42)

# Define the space of hyperparameters to search
space = {
    'max_depth': hp.choice('max_depth', range(3, 8)),  # Using range for max_depth
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators': 300,  # Fixed value, so not a hyperparameter to optimize
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'subsample': hp.uniform('subsample', 0.3, 0.5),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 5, 12),
    'gamma': hp.uniform('gamma', 0, 5),  # Regularization term, controls node split
    'reg_lambda': hp.uniform('reg_lambda', 0, 5),  # L2 regularization term
    'reg_alpha': hp.uniform('reg_alpha', 0, 5)  # L1 regularization term
}

# Objective function to minimize
def objective(params):
    """Score one hyperparameter sample for the hyperopt search.

    Args:
        params: dict of XGBClassifier keyword arguments sampled from ``space``.

    Returns:
        dict with ``loss`` (negative F1, because fmin minimizes) and ``status``.
    """
    # n_estimators is fixed; build a new dict instead of mutating the sample.
    clf_params = {**params, 'n_estimators': 300}

    # Score on rows the classifier has not been fitted on. Scoring on the
    # fitting rows rewards the most overfitted configuration.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, stratify=y_train, random_state=42)

    clf = xgb.XGBClassifier(**clf_params)
    clf.fit(X_fit, y_fit)
    f1 = f1_score(y_val, clf.predict(X_val), average='binary')
    return {'loss': -f1, 'status': STATUS_OK}

# Search the space with TPE for 3 evaluations, recording every trial
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=3, trials=trials)

# fmin reports hp.choice entries as indices; space_eval maps them back to values
best_params = space_eval(space, best)
# A recorded parameter set, kept for reference (uncomment to use it instead of the search result):
#best_params={'colsample_bytree': 0.7318073467652191, 'learning_rate': 0.039182513095905605, 'max_depth': 7, 'n_estimators': 300, 'scale_pos_weight': 1.8857050501393158, 'subsample': 0.967149051908317}

print("Best hyperparameters:", best_params)

# Fit a fresh classifier on the training split with the selected parameters
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)

# Hard predictions for both splits
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# The same four scorers are applied to the training and the test predictions
scorers = {
    'F1': f1_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
}
metrics_train = {label: scorer(y_train, y_pred_train) for label, scorer in scorers.items()}
metrics_test = {label: scorer(y_test, y_pred_test) for label, scorer in scorers.items()}

print("Training performance:", metrics_train)
print("Testing performance:", metrics_test)

# Positive-class probabilities drive the precision-recall curve and its area.
# y_scores, precision, recall and thresholds are reused by the threshold plot.
y_scores = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
auc_pr = auc(recall, precision)
print('Area under the Precision-Recall curve:', auc_pr)

# Confusion matrix of the test predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_test)
predicted_labels = ['Predicted Negative', 'Predicted Positive']
actual_labels = ['Actual Negative', 'Actual Positive']

plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

**Feature Importance**

In [None]:
# Rescale the model's importances so they sum to 100
raw_importances = model.feature_importances_
feature_importances = raw_importances * 100 / sum(raw_importances)

feature_names = X_train.columns.tolist()

# One row per feature, most important first, with a running total of importance
feature_imp = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
feature_imp['Importance_cum_sum'] = feature_imp['Importance'].cumsum()

In [None]:
# Show the first 20 rows of the importance table.
feature_imp.head(20)

**Shap Plot**

In [None]:
# Build a SHAP explainer for the fitted model and compute SHAP values for the test features.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

# Summary plot of the SHAP values against the test features.
shap.summary_plot(shap_values, X_test)

**Precision and Recall as a function of the decision threshold**

In [None]:
# precision_recall_curve returns one more precision/recall value than thresholds.
# Pad a *copy* of the thresholds to the same length for plotting. Overwriting
# `thresholds` itself would grow it on every re-run of this cell and break the
# plot with an x/y length mismatch.
plot_thresholds = np.append(thresholds, y_scores.max())

plt.figure(figsize=(10, 6))
plt.plot(plot_thresholds, precision, label='Precision')
plt.plot(plot_thresholds, recall, label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.title('Precision and Recall as a function of the decision threshold')
plt.legend()
plt.grid(True)
plt.show()

# K-Fold Validation <a id="k-fold-validation"></a>
[Back to Table of Contents](#back-to-table-of-contents)

**Here we are trying to validate our model performance by using K-Fold Validation**

**K-fold cross-validation is like testing how well you've learned from your study notes. You divide your notes into k equal parts, then you study with all but one part, and test yourself with the part you left out. You do this k times, rotating which part you leave out each time. Finally, you average your scores to see how well you understood overall.**

In [None]:
# Restrict the feature matrix to the selected important features.
X1=X[important_features]

In [None]:
# (rows, columns) of the reduced feature matrix.
X1.shape

In [None]:


from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold cross-validation: every labelled row lands in exactly one
# test fold and each fold keeps the overall class ratio. Repeated random
# train_test_split calls would give overlapping test sets instead of folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Collect one metrics record per fold and build the DataFrame once at the end,
# rather than re-concatenating a growing frame on every iteration.
fold_rows = []

for i, (train_idx, test_idx) in enumerate(skf.split(X1, y)):
    # Fold-local names, so the final model's train/test split is left untouched.
    X_fold_train, X_fold_test = X1.iloc[train_idx], X1.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    model1 = xgb.XGBClassifier(**best_params)
    model1.fit(X_fold_train, y_fold_train)
    y_pred_train = model1.predict(X_fold_train)
    y_pred_test = model1.predict(X_fold_test)

    fold_rows.append({
        "Fold": i,
        'F1_train': f1_score(y_fold_train, y_pred_train),
        'Accuracy_train': accuracy_score(y_fold_train, y_pred_train),
        'Precision_train': precision_score(y_fold_train, y_pred_train),
        'Recall_train': recall_score(y_fold_train, y_pred_train),
        'F1_test': f1_score(y_fold_test, y_pred_test),
        'Accuracy_test': accuracy_score(y_fold_test, y_pred_test),
        'Precision_test': precision_score(y_fold_test, y_pred_test),
        'Recall_test': recall_score(y_fold_test, y_pred_test)
    })

kfold_df = pd.DataFrame(fold_rows)

In [None]:
# Show up to 10 rows of the per-iteration metrics table.
kfold_df.head(10)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reshape to long form: one row per (fold, metric) pair
melted_df = kfold_df.melt(id_vars=['Fold'], var_name='Metric_Type', value_name='Value')

# Grouped bar chart: one group per fold, one bar per metric
plt.figure(figsize=(12, 4))
sns.barplot(data=melted_df, x='Fold', y='Value', hue='Metric_Type', palette='muted')
plt.xlabel('Fold')
plt.ylabel('Value')
plt.title('Metrics Across Folds')
plt.legend(title='Metric Type')
plt.show()


# Final Model Prediction <a id="final-model-prediction"></a>
[Back to Table of Contents](#back-to-table-of-contents)

In [None]:
# Preview the first rows of the data to be scored.
predict_df.head()

In [None]:
# Sum of the model's hard predictions on the rows to be scored
model.predict(predict_df[important_features]).sum()

In [None]:

# Store the positive-class probability as the prediction. predict_df is a
# filtered selection of another frame, so assigning a column in place can raise
# SettingWithCopyWarning; assign() returns a new frame and avoids that.
predict_df = predict_df.assign(
    isHacker=model.predict_proba(predict_df[important_features])[:, 1])
predict_df.columns

In [None]:
# Inspect the isHacker column of the prediction frame.
predict_df['isHacker']

In [None]:
# List the column names of Sample_Submission (presumably the provided submission template; defined elsewhere in the notebook).
list(Sample_Submission.columns)

In [None]:
# Keep only the identifier and the prediction column for the submission.
Sample_Submission_1=predict_df[['All_IDs','isHacker']]

In [None]:
# Rename the identifier column to TID. The frame is a column selection of
# predict_df, and an in-place rename on it can raise SettingWithCopyWarning,
# so rebind the renamed copy instead.
Sample_Submission_1 = Sample_Submission_1.rename(columns={'All_IDs': 'TID'})


In [None]:
# Write the submission to result.csv without the index column.
Sample_Submission_1.to_csv("result.csv", index=False)


# Post Modelling and Prediction Analysis

# Check the total number of positives predicted on the unseen data; it should be in a similar ratio to that of the processed data

In [None]:
# Number of rows whose predicted isHacker score is at least 0.5
int((Sample_Submission_1['isHacker'] >= .5).sum())

# Hurray!! We have completed the HACKATHON