# Credit Card Fraud Detection

In [None]:
import numpy as np
import imblearn 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import os

# Import the train and test datasets

In [None]:
# Directory that holds the Kaggle CSV files. It can be overridden with the
# FRAUD_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the original location stays the default.
data_dir = os.environ.get('FRAUD_DATA_DIR', 'C:/Users/G PRANAV/Downloads')
train_df = pd.read_csv(os.path.join(data_dir, 'fraudTrain.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'fraudTest.csv'))
print("Train fraud Data size=",train_df.shape)
print("Test fraud Data size=",test_df.shape)

In [None]:
# Print the summary statistics produced by DataFrame.describe() for the training frame
print(train_df.describe())  # General statistics

In [None]:
# Show the column labels of the training frame
train_df.columns

Check the data description and null values of the train and test data

In [None]:
# Preview the first five rows of the test frame
test_df.head(5)

In [None]:
# Print the DataFrame.info() summary of the training frame (used here to check for null values)
train_df.info()

In [None]:
# Print the DataFrame.info() summary of the test frame (used here to check for null values)
test_df.info()

# Data analysis and Visualization

In [None]:
# Correlation heatmap of the numeric columns of the training data.
# Correlation is only defined for numeric data, and pandas >= 2.0 raises on
# non-numeric columns (e.g. merchant, category) instead of silently dropping
# them, so the frame is restricted to numeric dtypes explicitly.
fig, ax = plt.subplots(figsize=(10,8))
numeric_corr = train_df.select_dtypes(include='number').corr()
# Draw on the axes created above rather than on an implicit current axes
sns.heatmap(numeric_corr, cmap="YlGnBu", annot=True, ax=ax)
plt.show()

Note: According to the heatmap, the transaction amount (`amt`) is highly correlated with `is_fraud`.

(In the heatmap, a value close to 1 means the two features are highly correlated; with the `YlGnBu` colormap, higher values are drawn in darker blue.)

## Largest fraud amounts and the gender of the victims

In [None]:
# The two fraudulent transactions with the largest amount
is_fraud_mask = train_df['is_fraud'] == 1
fraud_by_amount = train_df.loc[is_fraud_mask].sort_values('amt', ascending=False)
fraud_by_amount.head(2)

In [None]:
# Gender
# seaborn's barplot draws the mean of `is_fraud` per gender (its default
# estimator), i.e. the fraud rate, so the title says "rate" and names the
# grouping that is actually plotted.
plt.figure(figsize=(9,7))
plt.title('Fraud rate by gender')
sns.barplot(x="gender", y='is_fraud' ,data=train_df)

In [None]:
# Category
# Bar plot of `is_fraud` grouped by transaction category.
# NOTE(review): seaborn's barplot aggregates with the mean by default, so the
# bars show the fraud rate per category rather than a count — confirm the title.
plt.figure(figsize=(16,8))
plt.title('Number of frauds by category')
sns.barplot(x="category", y='is_fraud' ,data=train_df)

# Pre-Processing
## Downsampling Data
Use downsampling because the target classes are imbalanced.

In [None]:
# Row count per target class in the full training data
fraud_counts = train_df['is_fraud'].value_counts()
print("Number of is_fraud data")
print(fraud_counts)

is_fraud = 0 has 1289169 data

is_fraud = 1 has 7506 data

The two classes differ greatly in size; this imbalance can bias the model toward the majority class and make it overfit to it.

In [None]:
from sklearn.utils import resample

# Split by the target column's name instead of its position (.iloc[:, 22]),
# so the split keeps working if the column order of the CSV changes.
df_majority = train_df[train_df['is_fraud'] == 0]  # non-fraud rows (the large class)
df_minority = train_df[train_df['is_fraud'] == 1]  # fraud rows (the small class)

# Downsample the majority class to the size of the minority class.
# replace=False: resample() draws WITH replacement by default, which would put
# duplicate rows (and duplicate index labels) into the training data.
# n_samples follows the minority size instead of a hard-coded 7506.
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=42)

# Combine the downsampled majority class with the full minority class
train_df_final = pd.concat([df_majority_downsampled, df_minority])

# final train data
train_df_final.info()

In [None]:
# Row count per target class after resampling
balanced_counts = train_df_final['is_fraud'].value_counts()
print("Number of is_fraud data", balanced_counts)

Note: Now is_fraud = 0 and is_fraud = 1 both have 7506 rows

# Data Transformation

### Train Data

In [None]:

# Parse the transaction timestamp once and derive the calendar features from it.
train_trans_time = pd.to_datetime(train_df_final['trans_date_trans_time'])
train_df_final['trans_date_trans_time'] = train_trans_time

# NOTE: despite its name, `week_number` stores the day of the week (.dt.dayofweek).
train_df_final['week_number'] = train_trans_time.dt.dayofweek
assert train_df_final['week_number'].max() == 6

train_df_final['month_number'] = train_trans_time.dt.month
assert train_df_final['month_number'].max() == 12

train_df_final['year'] = train_trans_time.dt.year
train_df_final.head()

### Test Data

In [None]:
# Parse the transaction timestamp once and derive the calendar features from it.
test_trans_time = pd.to_datetime(test_df['trans_date_trans_time'])
test_df['trans_date_trans_time'] = test_trans_time

# NOTE: despite its name, `week_number` stores the day of the week (.dt.dayofweek).
test_df['week_number'] = test_trans_time.dt.dayofweek
assert test_df['week_number'].max() == 6

test_df['month_number'] = test_trans_time.dt.month
assert test_df['month_number'].max() == 12

test_df['year'] = test_trans_time.dt.year
test_df.head()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

## One-Hot Encoding
Category is nominal data that the model cannot use directly, so I convert it to numerical columns with one-hot encoding.

### Train Data

In [None]:
# One dummy (indicator) column per transaction category
category_onehot = pd.get_dummies(train_df_final.category, prefix='category')
# Attach the dummy columns row by row. DataFrame.join aligns on index labels,
# and the resampled training frame keeps the original row labels, which can
# repeat when rows were drawn with replacement; joining on repeated labels
# multiplies those rows. Both frames carry the identical index here, so
# concatenating along axis=1 pairs them positionally without duplicating rows.
train_df_final = pd.concat([train_df_final, category_onehot], axis=1)
train_df_final.head()

### Test Data

In [None]:
# One dummy (indicator) column per transaction category, joined back on the index
category_onehot_test_data = pd.get_dummies(test_df['category'], prefix='category')
test_df = test_df.join(category_onehot_test_data)
test_df.head()

## Gender
Change gender from nominal to numerical

In [None]:

# Encode gender as an integer flag: 'F' -> 0, 'M' -> 1 (same mapping for both frames)
gender_codes = {'F': 0, 'M': 1}
train_df_final['gender'] = train_df_final['gender'].replace(gender_codes)
test_df['gender'] = test_df['gender'].replace(gender_codes)

for caption, frame in (('Gender of train dataset', train_df_final),
                       ('Gender of test dataset', test_df)):
    print(caption, frame['gender'].value_counts())

## Merchant
Convert Merchant to numerical data

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Fit ONE encoder on the merchants of both frames so that a given merchant
# maps to the same integer in train and test. Calling fit_transform on each
# frame separately builds two unrelated mappings whenever the sets of
# merchants differ, which makes the codes incomparable between the frames.
label_encoder.fit(pd.concat([train_df_final['merchant'], test_df['merchant']]))
x_train = train_df_final['merchant']
train_df_final['merchant_number'] = label_encoder.transform(x_train)
x_test = test_df['merchant']
test_df['merchant_number'] = label_encoder.transform(x_test)
print('Merchant Number of train dataset',train_df_final['merchant_number'])
print('Merchant Number of test dataset',test_df['merchant_number'])

## Age
Find Age from date of birth data

In [None]:
from datetime import date
def calculate_age(row, today=None):
    """Return the age in whole years of the person described by *row*.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose ``'dob'``
            entry is a date-like object exposing ``year``, ``month`` and
            ``day``.
        today: Reference date the age is measured at. Defaults to the current
            date; pass a fixed date to get reproducible ages.

    Returns:
        The number of full years between ``row['dob']`` and ``today``.
    """
    if today is None:
        today = date.today()
    dob = row['dob']
    # Subtract one year when this year's birthday has not happened yet.
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    return today.year - dob.year - birthday_pending


### Train and test Age calculate

In [None]:
# Parse the birth dates and derive an `age` column, first for the training
# frame and then for the test frame. calculate_age reads row['dob'].
for frame in (train_df_final, test_df):
    frame['dob'] = pd.to_datetime(frame['dob'])
    frame['age'] = frame.apply(calculate_age, axis=1)

print('Age of train dataset', train_df_final['age'].head(3))
print('Age of test dataset', test_df['age'].head(3))

### Job
Convert Job to numerical data

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Fit ONE encoder on the jobs of both frames so that a given job maps to the
# same integer in train and test. Calling fit_transform on each frame
# separately builds two unrelated mappings whenever the sets of jobs differ.
label_encoder.fit(pd.concat([train_df_final['job'], test_df['job']]))

x_train = train_df_final['job']
train_df_final['job_number'] = label_encoder.transform(x_train)
print(train_df_final['job_number'])
x_test = test_df['job']
test_df['job_number'] = label_encoder.transform(x_test)
print(test_df['job_number'])

# Data Split

In [None]:
#Select Train Data
# Model features followed by the target column `is_fraud`
train_selected_columns = [
    'amt',
    'category_shopping_net',
    'category_grocery_pos',
    'category_home',
    'category_misc_net',
    'category_kids_pets',
    'category_health_fitness',
    'gender',
    'age',
    'month_number',
    'category_food_dining',
    'unix_time',
    'category_personal_care',
    'category_shopping_pos',
    'is_fraud',
]
data_train = train_df_final[train_selected_columns]


In [None]:
#Select Test Data
# Model features followed by the target column `is_fraud`
test_selected_columns = [
    'amt',
    'category_shopping_net',
    'category_grocery_pos',
    'category_home',
    'category_misc_net',
    'category_kids_pets',
    'category_health_fitness',
    'gender',
    'age',
    'month_number',
    'category_food_dining',
    'unix_time',
    'category_personal_care',
    'category_shopping_pos',
    'is_fraud',
]
data_test = test_df[test_selected_columns]


In [None]:
# Prepare X_train y_train
# Features: the fourteen model inputs, in the order used for training
train_feature_columns = [
    'amt',
    'category_shopping_net',
    'category_grocery_pos',
    'category_home',
    'category_misc_net',
    'category_kids_pets',
    'category_health_fitness',
    'gender',
    'age',
    'month_number',
    'category_food_dining',
    'unix_time',
    'category_personal_care',
    'category_shopping_pos',
]
X_train = data_train[train_feature_columns]
# Target: the fraud label
y_train = data_train['is_fraud']



In [None]:
# Prepare X_test y_test
# Features: the fourteen model inputs, in the order used for training
test_feature_columns = [
    'amt',
    'category_shopping_net',
    'category_grocery_pos',
    'category_home',
    'category_misc_net',
    'category_kids_pets',
    'category_health_fitness',
    'gender',
    'age',
    'month_number',
    'category_food_dining',
    'unix_time',
    'category_personal_care',
    'category_shopping_pos',
]
X_test = data_test[test_feature_columns]
# Target: the fraud label
y_test = data_test['is_fraud']


# Scale Data

In [None]:
from sklearn import preprocessing

In [None]:
# Scale X_train
# Learn each feature's min/max from the training data, then rescale it
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
newValue = scaler.transform(X_train)
# Wrap the scaled array back into a frame with the original column names
X_train = pd.DataFrame(newValue, columns=X_train.columns)
X_train.head()

In [None]:
# Scale X_test
# Reuse the scaler that was fitted on X_train: the test features must be
# transformed with the SAME per-feature min/max the models are trained on.
# Fitting a second scaler on X_test maps the two sets onto different scales
# and lets test-set statistics leak into the preprocessing.
newValue = scaler.transform(X_test)
X_test = pd.DataFrame(newValue, columns=X_test.columns)
X_test.head()

# Model


## Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the training
# features (x samples) and their fraud labels (y classes)
clf = SVC(kernel='linear').fit(X_train, y_train)
clf

In [None]:
#Predict
# Class labels predicted by the fitted classifier `clf` for the test features
y_pred = clf.predict(X_test)


In [None]:
# Confusion matrix, drawn with every cell as its share of all test samples
cf = confusion_matrix(y_test, y_pred)
cf_share = cf / cf.sum()
plt.figure(figsize=(10, 8))
sns.heatmap(cf_share, annot=True, fmt='.2%', cmap='Blues')

In [None]:

# Per-class precision / recall / F1 of the current predictions
svc_report = classification_report(y_test, y_pred)
print("Classification report")
print(svc_report)


## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Create the Decision Tree classifier object.
# random_state is fixed so the fitted tree (and the metrics derived from it)
# are reproducible across runs; 42 matches the seed already used for
# resampling and for LogisticRegression in this notebook.
clf = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree classifier
clf = clf.fit(X_train,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [None]:

# Confusion matrix, drawn with every cell as its share of all test samples
cf = confusion_matrix(y_test, y_pred)
cf_fraction = cf / cf.sum()
plt.figure(figsize=(10, 8))
sns.heatmap(cf_fraction, annot=True, fmt='.2%', cmap='Blues')

In [None]:

# Per-class precision / recall / F1 of the current predictions
tree_report = classification_report(y_test, y_pred)
print("Classification report")
print(tree_report)


## LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Logistic regression with a fixed seed, fitted on the training data
model = LogisticRegression(random_state=42).fit(X_train, y_train)
model


In [None]:
#Predict
# Class labels predicted by the fitted `model` for the test features
y_pred = model.predict(X_test)


# Confusion matrix

In [None]:
# Confusion matrix
# Computed from the true test labels (first argument) and the current predictions (second argument)
cf=confusion_matrix(y_test,y_pred)


In [None]:
# Show the (rows, columns) shape of the test feature frame
X_test.shape

In [None]:

# Draw the confusion matrix with every cell as its share of all test samples
cf_ratio = cf / cf.sum()
plt.figure(figsize=(10, 8))
sns.heatmap(cf_ratio, annot=True, fmt='.2%', cmap='Blues')

# Classification Report

In [None]:

# Per-class precision / recall / F1 of the current predictions
logistic_report = classification_report(y_test, y_pred)
print("Classification report")
print(logistic_report)


# ROC_Curve

In [None]:
from sklearn import metrics

In [None]:
# Predicted probability of class 1 (second column of predict_proba) per test row
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ROC points and the area under the curve
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
roc_auc = metrics.auc(fpr, tpr)

lw = 2
roc_label = "ROC curve (area = %0.2f)" % roc_auc

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=lw, label=roc_label)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()

In [None]:
#Dataset
#https://www.kaggle.com/datasets/kartik2112/fraud-detection

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier

# Create the XGBoost classifier with its default hyper-parameters
clf = XGBClassifier()

# Train the XGBoost classifier on the training data
clf = clf.fit(X_train,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [None]:

# Confusion matrix, drawn with every cell as its share of all test samples
cf = confusion_matrix(y_test, y_pred)
cf_proportion = cf / cf.sum()
plt.figure(figsize=(10, 8))
sns.heatmap(cf_proportion, annot=True, fmt='.2%', cmap='Blues')

In [None]:

# Per-class precision / recall / F1 of the current predictions
xgb_report = classification_report(y_test, y_pred)
print("Classification report")
print(xgb_report)
