In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt###
                               ### Visualisation tools
import seaborn as sns          ###

from sklearn.linear_model import LinearRegression,LogisticRegression,SGDRegressor , Ridge,Lasso
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
from sklearn.metrics import classification_report,roc_curve,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two CSV files of the fraud-detection dataset.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fraud-detection/fraudTrain.csv',
        '/kaggle/input/fraud-detection/fraudTest.csv',
    )
)

In [None]:
# Stack the train and test frames into one frame with a fresh 0..n-1 index,
# then discard the 'Unnamed: 0' column.
df = (
    pd.concat([train, test], ignore_index=True)
      .drop(columns='Unnamed: 0')
)

In [None]:
# Preview the first rows of the combined frame (head() shows 5 rows by default)
df.head()

In [None]:
# Shape of the combined data as a (rows, columns) tuple
df.shape

In [None]:
# Print the dtype and non-null count of every column, plus other basic info about the frame
df.info()

In [None]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# every other column as numerical. The label encoder is created here for later use.
le = LabelEncoder()
df_num = df.select_dtypes(exclude=['object'])
df_cat = df.select_dtypes(include=['object'])

In [None]:
# Percentage of missing values per column, stored as a one-column frame.
missing_pct = df.isnull().sum() / len(df) * 100
nan_df = missing_pct.to_frame(name='% of missing values')

In [None]:
# Display the per-column percentage of missing values
nan_df

In [None]:
# There are no null values in the data

In [None]:
# Integer-encode the 'gender' column and store the codes in a new 'gender_le' column.
gender_codes = le.fit(df['gender']).transform(df['gender'])
df['gender_le'] = gender_codes


In [None]:
# Keep only the non-object (numerical) columns, and only the rows up to and
# including index label 149999, to avoid heavy CPU usage in the modelling steps.
df2 = df.select_dtypes(exclude=['object']).loc[:149999]


In [None]:
# Preview the first rows of the numerical subset (head() shows 5 rows by default)
df2.head()

In [None]:
# 'cc_num' is not useful for the analysis, so it is removed.
df2 = df2.drop(columns=['cc_num'])
# Target vector (is_fraud) and feature matrix (every remaining column).
y = df2['is_fraud']
x = df2.drop(columns='is_fraud')

In [None]:
# Hold out part of the rows for testing (train_size=0.7), with a fixed seed
# so the split is reproducible.
split_parts = train_test_split(x, y, train_size=0.7, random_state=10)
xtrain, xtest, ytrain, ytest = split_parts

In [None]:
# Remove the card-number column from the numerical frame.
df_num = df_num.drop(columns=['cc_num'])

In [None]:
# Remove the target column so df_num holds feature columns only.
df_num = df_num.drop(columns='is_fraud')

In [None]:
# Scaling the data
# The scaler is fitted on the training split only. The test split is then
# transformed with those same training statistics: re-fitting on the test
# split would scale it with its own mean/std, putting train and test features
# on different scales and leaking held-out information into preprocessing.
ss = StandardScaler()
xtrain[df_num.columns] = ss.fit_transform(xtrain[df_num.columns])
xtest[df_num.columns] = ss.transform(xtest[df_num.columns])


In [None]:
# Logistic Regression model with default hyper-parameters, fitted on the training split.
# fit() returns the estimator itself, so model_v1 and lr refer to the same fitted object.
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
model_v1 = lr

In [None]:
# Predict on the test split and draw the confusion matrix as an annotated heatmap.
ypred = model_v1.predict(xtest)
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
sns.heatmap(data=cm, annot=True)
plt.show()

In [None]:
# Classification report for the current predictions in ypred.
print(classification_report(y_true=ytest, y_pred=ypred))

In [None]:
# K-nearest-neighbours classifier with default settings.
# fit() returns the estimator itself, so model_v2 and knn are the same fitted object.
knn = KNeighborsClassifier()
knn.fit(xtrain, ytrain)
model_v2 = knn
ypred1 = model_v2.predict(xtest)

In [None]:
# Confusion matrix of the KNN predictions, drawn as an annotated heatmap.
cm_knn = confusion_matrix(y_true=ytest, y_pred=ypred1)
sns.heatmap(data=cm_knn, annot=True)

In [None]:
# Classification report for the KNN model. The KNN predictions are stored in
# ypred1; ypred still holds the logistic-regression predictions at this point,
# so using it here would simply repeat the logistic-regression report.
print(classification_report(ytest, ypred1))

In [None]:
# Gaussian Naive Bayes classifier, fitted on the training split.
# fit() returns the estimator itself, so model_v3 and nb are the same fitted object.
nb = GaussianNB()
nb.fit(xtrain, ytrain)
model_v3 = nb


In [None]:
# Naive Bayes predictions on the test split and their confusion-matrix heatmap.
ypred = model_v3.predict(xtest)
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
sns.heatmap(data=cm, annot=True)

In [None]:
# Classification report for the current predictions in ypred.
print(classification_report(y_true=ytest, y_pred=ypred))

In [None]:
#### ROC curve
## Logistic Regression Model
# Column 1 of predict_proba is the predicted probability of class 1 (is_fraud == 1).
prob = model_v1.predict_proba(xtest)[:,1]
fpr,tpr,threshold = roc_curve(ytest,prob)
# Area under the curve, so the models can be compared by a number and not only by eye.
auc_lr = roc_auc_score(ytest, prob)
#Plotting Roc Curve
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc_lr:.3f})')
# Chance-level diagonal from (0, 0) to (1, 1), given as explicit x and y
# coordinates (a 2x2 nested list is interpreted as two column series instead).
plt.plot([0, 1], [0, 1], color='red', linestyle='-', label='Chance')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
## KNN Classifier Model
# Column 1 of predict_proba is the predicted probability of class 1 (is_fraud == 1).
prob1 = model_v2.predict_proba(xtest)[:,1]
fpr,tpr,threshold = roc_curve(ytest,prob1)
# Area under the curve, so the models can be compared by a number and not only by eye.
auc_knn = roc_auc_score(ytest, prob1)
#Plotting Roc Curve
plt.plot(fpr, tpr, label=f'KNN (AUC = {auc_knn:.3f})')
# Chance-level diagonal from (0, 0) to (1, 1), given as explicit x and y
# coordinates (a 2x2 nested list is interpreted as two column series instead).
plt.plot([0, 1], [0, 1], color='red', linestyle='-', label='Chance')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
## Naive Bayes Model
# Column 1 of predict_proba is the predicted probability of class 1 (is_fraud == 1).
prob2 = model_v3.predict_proba(xtest)[:,1]
fpr,tpr,threshold = roc_curve(ytest,prob2)
# Area under the curve, so the models can be compared by a number and not only by eye.
auc_nb = roc_auc_score(ytest, prob2)
#Plotting Roc Curve
plt.plot(fpr, tpr, label=f'Naive Bayes (AUC = {auc_nb:.3f})')
# Chance-level diagonal from (0, 0) to (1, 1), given as explicit x and y
# coordinates. The line style must be a valid matplotlib style string;
# '-' (solid) matches the other ROC plots in this notebook.
plt.plot([0, 1], [0, 1], color='red', linestyle='-', label='Chance')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Grid search for KNN: odd neighbour counts from 1 to 29 crossed with four
# distance metrics, scored by weighted F1 with cv=5.
params = {
    'n_neighbors': np.arange(1, 30, 2),
    'metric': ['minkowski', 'euclidean', 'chebyshev', 'manhattan'],
}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, params, scoring='f1_weighted', cv=5)
# fit() returns the search object itself, so grd and grid are the same fitted object.
grid.fit(xtrain, ytrain)
grd = grid

In [None]:
# Tabulate the grid-search cross-validation results, one row per parameter combination.
grd_df = pd.DataFrame.from_dict(grd.cv_results_)


In [None]:
# Parameter combination(s) ranked first by the grid search.
grd_df.loc[grd_df['rank_test_score'] == 1]

In [None]:
# Parameter combination(s) ranked fifth by the grid search.
grd_df.loc[grd_df['rank_test_score'] == 5]

**Manhattan distance with 1 neighbor is the best-ranked parameter combination. However, the combinations ranked 1 through 4 all use a low number of neighbors, so we choose the rank-5 combination instead.**

In [None]:

# Refit KNN with the chosen setting: 5 neighbours and Manhattan distance.
# fit() returns the estimator itself, so model_v4 and knn are the same fitted object.
knn = KNeighborsClassifier(metric='manhattan', n_neighbors=5)
knn.fit(xtrain, ytrain)
model_v4 = knn

In [None]:
# Class-1 probabilities and class predictions from the tuned KNN model.
prob = model_v4.predict_proba(xtest)[:, 1]
ypred = model_v4.predict(xtest)


In [None]:
# Confusion matrix of the current predictions in ypred, drawn as an annotated heatmap.
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
sns.heatmap(data=cm, annot=True)

In [None]:
# Classification report for the current predictions in ypred.
print(classification_report(y_true=ytest, y_pred=ypred))

In [None]:
# Cross-validate the KNN estimator on the training split.
# The shuffled, seeded 5-fold splitter defined here is passed to
# cross_val_score so that the shuffle and random_state settings actually take
# effect (passing cv=5 would ignore this splitter entirely).
k = KFold(n_splits=5, shuffle=True, random_state=10)
scores = cross_val_score(knn, xtrain, ytrain, scoring='f1_weighted', cv=k)

print(scores)
# "Bias error": shortfall of the mean fold score from a perfect score, in percent.
print('Bias error:', (1 - np.mean(scores)) * 100)
# "Variance error": standard deviation of the fold scores relative to their mean, in percent.
print("Variance error:", (np.std(scores) / np.mean(scores)) * 100)

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) for the scores in prob.
fpr, tpr, threshold = roc_curve(y_true=ytest, y_score=prob)


In [None]:
#Plotting Roc Curve
plt.plot(fpr, tpr)
# Chance-level diagonal from (0, 0) to (1, 1), given as explicit x and y
# coordinates (a 2x2 nested list is interpreted as two column series instead).
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.show()

# Conclusion
1. The KNN Classifier gave the best ROC curve, followed by Logistic Regression and then Naive Bayes
    1. In the grid search performed here, the best parameter combination is Manhattan distance with 1 neighbor
    2. But 1 neighbor is too low to use, as it can lead to overfitting
    3. So I chose the 5th-ranked combination, which has 5 neighbors
2. When it comes to scores, I consider the weighted-average F1 score (f1_weighted) the most informative; by that measure, all 3 models scored 0.99 in the classification report
3. Since the KNN Classifier had the edge in the ROC curve, I conclude that it is the most reliable of the three models for making predictions

# Prediction

In [None]:
# Final class predictions on the test split from the tuned KNN model.
ypred = model_v4.predict(X=xtest)

In [None]:
# Pair every test row's original index label with its predicted fraud flag.
prediction = pd.DataFrame({
    'Serial No.': xtest.index,
    'is_fraud_pred': ypred,
})

In [None]:
# Rows that the model predicted as fraud.
prediction.loc[prediction['is_fraud_pred'] == 1]

In [None]:
# Attach the true labels to the test features, aligned on the index.
real = xtest.join(ytest)

In [None]:
# Test rows whose true label is fraud.
real.loc[real['is_fraud'] == 1]