In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset taken from kaggle

In [None]:
# Read the credit-card transactions CSV from the Kaggle input directory
# and preview its first 20 rows.
CSV_PATH = '../input/creditcardfraud/creditcard.csv'
data = pd.read_csv(CSV_PATH)
data.head(n=20)

Imbalanced Data Set

In [None]:
# Summarize the loaded frame: column dtypes, non-null counts and memory usage.
data.info()

**Exploratory Data Analysis**

In [None]:
# True if at least one cell anywhere in the frame is missing.
data.isnull().values.any()

#no null values

# Checking transaction distribution w/countplot

In [None]:
import seaborn as sns

In [None]:
from pylab import rcParams
# Default figure size (width, height) applied to the matplotlib figures below.
rcParams['figure.figsize']=8,5

In [None]:

import seaborn as sns
# Bar chart of the number of rows for each value of the 'Class' column.
sns.countplot(data=data, x='Class')

#clearly dataset is imbalanced

In [None]:
# Count rows per label in the 'Class' column and report both totals
# (label 0 is printed as "Valid", label 1 as "Fraud").
classes = data['Class'].value_counts()
valid_count = classes[0]
fraud_count = classes[1]
print("No of Valid cases in the dataset: ", valid_count)
print("No of Fraud cases in the dataset: ", fraud_count)

#data.shape[0] returns no of rows.
#data.shape[1] returns no of columns.

In [None]:
# Share of each class as a percentage of all rows in the dataset.
total_rows = data.shape[0]
y = 100 * (classes[0] / total_rows)
z = 100 * (classes[1] / total_rows)

print("Percentage of valid cases: ", y)
print("Percentage of fraud cases: ", z)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-class row counts held in `classes`.
ax = classes.plot(kind="bar")
ax.set_xlabel("Class")
ax.set_ylabel("Number of observations")  # label typo fixed (was "observartions")
ax.set_title("Counts of different classes")
plt.show()

In [None]:
# Comparison between fraud and non-fraud cases in the (V11, V12) plane.
# One scatter layer per class label; select rows and columns in a single
# .loc call instead of chained indexing.
for label, colour in ((0, 'b'), (1, 'r')):
    subset = data.loc[data['Class'] == label, ['V11', 'V12']]
    plt.scatter(subset['V11'], subset['V12'],
                label='Class #{}'.format(label),
                alpha=0.5, linewidth=0.15, c=colour)
plt.xlabel('V11')
plt.ylabel('V12')
plt.title('V11 vs V12 by class')
# The labels passed to scatter() are only displayed once a legend is drawn.
plt.legend()
plt.show()

In [None]:
# Separate the label column 'Class' from the remaining (feature) columns
# and show the shape of each part.
Y = data['Class']
X = data.drop(columns=['Class'])
print(X.shape)
print(Y.shape)

In [None]:
# Creating fraud & normal datasets:
# rows with Class == 1 go to `fraud`, rows with Class == 0 go to `normal`.
class_labels = data['Class']
fraud = data[class_labels == 1]
normal = data[class_labels == 0]

In [None]:
# (rows, columns) of the fraud subset followed by the normal subset.
print(fraud.shape,normal.shape)

# Analysing the difference in transaction amount between the two classes

In [None]:
# Summary statistics of the 'Amount' column for the fraud subset.
fraud.Amount.describe()

In [None]:
# Summary statistics of the 'Amount' column for the normal subset.
normal.Amount.describe()

# Plotting Amount v/s Transactions for both the transactions type

In [None]:
# Histogram (50 bins, no KDE curve) of 'Amount' for the normal subset,
# drawn with a log-scaled count axis.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; sns.histplot is
# the documented replacement if the environment's seaborn version allows it.
plot_normal = sns.distplot(normal['Amount'], bins=50, kde=False)
plot_normal.set_yscale('log')
plot_normal.set(xlabel='Amount ($)', ylabel='Number of Normal Transactions')

In [None]:
# Histogram (50 bins, no KDE curve) of 'Amount' for the fraud subset.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; sns.histplot is
# the documented replacement if the environment's seaborn version allows it.
fraud_axis_labels = {
    'xlabel': 'Amount ($)',
    'ylabel': 'Number of Fraud Transactions',
}
plot_fraud = sns.distplot(fraud['Amount'], bins=50, kde=False)
plot_fraud.set(**fraud_axis_labels)

#Clearly, for fraud transactions the transaction amount is small compared to normal ones

#Checking: do fraudulent transactions occur more often during a certain time frame?

# Plotting Amount v/s Time for both transaction types

In [None]:

# Scatter of 'Amount' against 'Time' for the normal subset.
sns.scatterplot(data=normal,x='Time',y='Amount')

In [None]:

# Scatter of 'Amount' against 'Time' for the fraud subset.
sns.scatterplot(data=fraud, x='Time', y='Amount')

#There seems to be no relation between fraud transactions and any particular time period

# Creating Features and Labels

In [None]:
# Feature matrix X = every column except the label; target y = the 'Class' column.
# The unused random-state object and the rows-by-columns array of uniform random
# floats that used to be built here were never read again, so they are dropped.
target = "Class"
columns = [c for c in data.columns.tolist() if c != target]
X = data[columns]
y = data[target]
print(X.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split of the full (imbalanced) data.
# - random_state pins the shuffle so a re-run reproduces the same split.
# - stratify=y keeps the class ratio of y in both parts, which matters
#   because the positive class is a tiny fraction of the rows.
# NOTE(review): train_size=0.3 trains on 30% of the rows and tests on 70%;
# confirm this was intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=42, stratify=y
)

In [None]:
# Shapes of the four pieces of the split, one per line:
# train features, train labels, test features, test labels.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

# Logistic Regression On Imbalanced Dataset

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with scikit-learn's default hyper-parameters.
model_lr1=LogisticRegression()

In [None]:
# Fit the baseline model on the current training split.
model_lr1.fit(X_train,y_train)

In [None]:
# Predicted class probabilities of the baseline model for every row of X_test.
lr1_probas = model_lr1.predict_proba(X_test)

In [None]:
# Mean accuracy of the baseline model on the test split.
model_lr1.score(X_test, y_test)

In [None]:
#Making predictions for test set
# (hard class labels from the baseline model, one per row of X_test)
predictions = model_lr1.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Per-class precision / recall / F1 of the baseline predictions on the test split.
lr1_report = classification_report(y_test, predictions)
print(lr1_report)

We achieved a high accuracy of nearly 99.99%, but the F1 score for predicting fraudulent cases is not good (0.69). So we first have to balance the dataset and then apply logistic regression.

# Handling imbalanced dataset using undersampling (for log_reg)

In [None]:
from imblearn.under_sampling import NearMiss

# Under-sample the data with NearMiss (default settings) to balance the classes.
nm = NearMiss()
# fit_resample is the supported method name; the old fit_sample alias was
# deprecated in imbalanced-learn 0.4 and removed in 0.8, where calling it
# raises AttributeError.
X_res, y_res = nm.fit_resample(X, y)

In [None]:
# Shapes of the under-sampled features and labels.
X_res.shape,y_res.shape


# Model Prediction

# Applying Logistic Regression After Undersampling Dataset

In [None]:
import sklearn


In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split of the under-sampled data.
# random_state makes the split reproducible across re-runs; stratify keeps
# the class ratio of y_res in both the training and the test part.
# NOTE(review): train_size=0.3 trains on 30% of the rows and tests on 70%;
# confirm this was intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, train_size=0.3, random_state=42, stratify=y_res
)

In [None]:
# Keep separate references to the under-sampled test split: X_test / y_test
# are reassigned by a later split further down in the notebook.
X_res_test, y_res_test = X_test, y_test

In [None]:
# Shapes of the under-sampled split, one per line:
# train features, train labels, test features, test labels.
split_parts = (X_train, y_train, X_test, y_test)
print(*(part.shape for part in split_parts), sep='\n')

In [None]:

# Second logistic regression (default hyper-parameters) for the under-sampled data.
model_lr=LogisticRegression()

In [None]:
# Fit only on the training portion of the under-sampled data. Fitting on the
# full X_res / y_res would include the rows held out in X_test / y_test and
# inflate every metric computed on that test split.
model_lr.fit(X_train,y_train)

In [None]:
# Mean accuracy of the under-sampled model on the current test split.
model_lr.score(X_test, y_test)

In [None]:
# Predicted class probabilities for the current test split (used later for the calibration plot).
lr_probas = model_lr.predict_proba(X_test)

In [None]:
# Hard class predictions for the current test split.
y_pred=model_lr.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

As the confusion matrix shows, 147669 + 313 transactions are correctly predicted.

In [None]:
# Re-display the shapes of the current split (train X, train y, test X, test y).
for split_part in [X_train, y_train, X_test, y_test]:
    print(split_part.shape)

In [None]:
#Making predictions for test set
predictions = model_lr.predict(X_test)
# Preview only the first 500 predicted labels instead of the whole array.
predictions[:500]

In [None]:
# Shapes of the current split once more, one per line.
current_shapes = [arr.shape for arr in (X_train, y_train, X_test, y_test)]
for shape in current_shapes:
    print(shape)

In [None]:
# Hard class predictions of the under-sampled model, used by the report below.
predictions = model_lr.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Per-class precision / recall / F1 of the under-sampled model's predictions.
lr_report = classification_report(y_test, predictions)
print(lr_report)

Here we achieved a good accuracy of 95.93% as well as a good F1 score of 0.96 for predicting fraudulent cases.

# Random Forest

In [None]:
# Hold-out split of the full (imbalanced) data for the random forest.
# random_state makes the split reproducible across re-runs; stratify keeps
# the class ratio of y in both the training and the test part.
# NOTE(review): train_size=0.3 trains on 30% of the rows and tests on 70%;
# confirm this was intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 50 trees. The forest is built with randomness (bootstrap
# samples, feature sub-sampling), so pin random_state to make the fitted model
# and every metric derived from it reproducible across re-runs.
model_rf = RandomForestClassifier(n_estimators=50, random_state=42)

In [None]:
# Fit the random forest on the current training split.
model_rf.fit(X_train,y_train)

In [None]:
# Mean accuracy of the random forest on the test split.
model_rf.score(X_test,y_test)

In [None]:
# Predicted class probabilities of the random forest for every row of X_test.
rf_probas = model_rf.predict_proba(X_test)

In [None]:
# Hard class predictions of the random forest for the test split.
y_pred=model_rf.predict(X_test)

In [None]:
# Confusion matrix of true labels (rows) against random-forest predictions (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

> 198987 + 263 are correctly predicted as we can see in confusion matrix

In [None]:
#Making predictions for test set
predictions = model_rf.predict(X_test)
# Preview only the first 1000 predicted labels instead of the whole array.
predictions[:1000]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Per-class precision / recall / F1 of the random-forest predictions.
rf_report = classification_report(y_test, predictions)
print(rf_report)

Using the Random Forest algorithm, we achieved a good accuracy of 99.94% as well as a good F1 score of 0.82 for predicting fraudulent cases.

In [None]:
# Recompute both models' class probabilities on the current X_test so the two
# arrays line up with the same y_test in the calibration plot.
# NOTE(review): X_test here comes from the most recent train/test split, not
# the split model_lr1 was fitted against. Unless both splits are made with the
# same seed and arguments, model_lr1 may have seen some of these rows during
# training — confirm before comparing the two curves.
lr1_probas = model_lr1.predict_proba(X_test)

rf_probas = model_rf.predict_proba(X_test)

In [None]:
# Pair each classifier's display name with its predicted probabilities, then
# split the pairs into the two parallel lists used by the calibration plot.
calibration_inputs = [
    ('Logistic Regression W/o Undersampling', lr1_probas),
    ('Random Forest', rf_probas),
]
clf_names = [name for name, _probas in calibration_inputs]
probas_list = [probas for _name, probas in calibration_inputs]

In [None]:
import scikitplot as skplt

# Calibration curves (15 bins) of the classifiers listed in clf_names,
# evaluated against y_test. The trailing ';' suppresses the returned object's repr.
calibration_kwargs = dict(n_bins=15, figsize=(12, 6))
skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names, **calibration_kwargs);

In [None]:
# Shapes of the saved under-sampled test labels and features, one per line.
print(y_res_test.shape, X_res_test.shape, sep='\n')

In [None]:
# Calibration curve (15 bins) of the logistic regression trained on the
# under-sampled data, evaluated against the saved labels in y_res_test.
clf_names = ['Logistic Regression with undersampling']
probas_list = [lr_probas]
skplt.metrics.plot_calibration_curve(
    y_res_test, probas_list, clf_names, n_bins=15, figsize=(12, 6)
);

# Observations :

Random Forest is 99.94% accurate, compared with 94.4% accuracy for Logistic Regression.
So overall, the Random Forest method performed much better in determining the fraud cases.

# Project

Rahul Kasaudhan (1809113082)


Gyanesh Sharma


Sankalp


Shivansh