In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import the Data**

In [None]:
# Read the banksim1 transactions CSV from the Kaggle input directory.
csv_path = r'/kaggle/input/banksim1/bs140513_032310.csv'
bank = pd.read_csv(csv_path)

In [None]:
bank.head() # preview the first five rows (head() defaults to n=5)

In [None]:
bank.shape # (number of rows, number of columns)

In [None]:
bank.info()  # per-column non-null counts and dtypes, plus memory usage

In [None]:
bank.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
bank.isnull().sum() # number of missing values in each column

In [None]:
# Pairwise correlation between the numeric columns.
# The frame still contains string (object) columns at this point (they are
# only encoded further down), and pandas >= 2.0 raises on non-numeric data
# unless numeric_only=True is passed explicitly.
bank.corr(numeric_only=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt  # import visualization module 

In [None]:
%matplotlib inline

**Visualization of fraud transactions**

In [None]:
# Tabulate how often each value of the 'fraud' column occurs, then draw the
# same distribution as a bar chart.
fraud_counts = bank['fraud'].value_counts()
print(fraud_counts)
sns.countplot(data=bank, x='fraud')

In [None]:
# Sum of 'amount' and 'fraud' within each transaction category.
by_category = bank.groupby('category')[['amount', 'fraud']]
by_category.sum()

In [None]:
# Mean of 'amount' and 'fraud' within each transaction category.
by_category = bank.groupby('category')[['amount', 'fraud']]
by_category.mean()

In [None]:
# Mean of 'fraud' per 'age' group expressed as a percentage, listed with the
# 'age' values in descending order.
fraud_pct_by_age = bank.groupby('age')['fraud'].mean().mul(100)
(
    fraud_pct_by_age
    .reset_index()
    .sort_values(by='age', ascending=False)
    .rename(columns={'fraud': 'fraud_percent'})
)

In [None]:
# Show the first five rows again before inspecting the zip-code columns.
bank.head()

In [None]:
bank['zipcodeOri'].nunique() # number of distinct values in 'zipcodeOri'

In [None]:
# number of distinct values in 'zipMerchant'
bank['zipMerchant'].nunique()

In [None]:
# Remove the 'zipcodeOri' and 'zipMerchant' columns from the table.
# NOTE(review): presumably they are dropped because the nunique() checks
# showed they carry no information -- confirm against those outputs.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: the previous in-place drop raised KeyError on a
# second execution because the columns were already gone.
bank = bank.drop(columns=['zipcodeOri', 'zipMerchant'], errors='ignore')

In [None]:
# (rows, columns) after removing the two zip-code columns
bank.shape

In [None]:
# first three rows of the reduced table
bank.head(3)

**Transformation**

In [None]:
# Replace every object (string) column by the integer codes of its pandas
# categorical representation, then preview the encoded table.
col_categorical = bank.select_dtypes(include=['object']).columns
for col in col_categorical:
    bank[col] = bank[col].astype('category').cat.codes
bank.head(5)

In [None]:
# Separate the feature matrix X from the target y.
# Columns are selected by name rather than by position: the earlier
# `iloc[:, 0:7]` / `iloc[:, -1]` slices silently picked the wrong columns
# whenever the column layout differed (for example if the zip-code columns
# had not been dropped yet).
# NOTE(review): this matches the old positional slice when 'fraud' is the
# last of eight columns -- confirm against bank.shape above.
X = bank.drop(columns=['fraud'])
y = bank['fraud']

**Oversampling**

In [None]:
# handle class imbalance: oversample the minority class with SMOTE
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE so that the classes in y are balanced.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples, and therefore every downstream metric, are reproducible.
# NOTE(review): resampling happens BEFORE the train/test split, so synthetic
# points derived from test rows can end up in the training set and the test
# set itself contains synthetic rows. Consider splitting first and resampling
# the training portion only.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
# Kept as a one-column DataFrame because later cells were written against it.
y_res = pd.DataFrame(y_res)

In [None]:
# class counts of the target before oversampling
y.value_counts()

In [None]:
# class counts of the target after oversampling
y_res.value_counts()

**Train Test split**

In [None]:
# train_test split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the resampled data for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.33,
    random_state=42,
)

In [None]:
# display the training features
X_train

In [None]:
# display the training target
y_train

**Standardization**

In [None]:
# Create the StandardScaler used to standardize the feature columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only; the test features are
# transformed later with these same fitted statistics.
scaler.fit(X_train)

In [None]:
# apply the fitted scaler to the training features
X_train_scaled = scaler.transform(X_train)

In [None]:
# apply the same fitted scaler to the test features (no re-fitting)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Wrap the scaled training values in a DataFrame carrying X_train's column names.
# The new frame gets a fresh default index, not X_train's index.
X_train_scaled = pd.DataFrame(X_train_scaled,columns=X_train.columns)

In [None]:
# display the scaled training features
X_train_scaled

In [None]:
# Wrap the scaled test values in a DataFrame carrying X_test's column names.
X_test_scaled = pd.DataFrame(X_test_scaled , columns=X_test.columns)

In [None]:
# display the scaled test features
X_test_scaled

In [None]:
# display the training target (it is not scaled)
y_train

In [None]:
# display the test target
y_test

**Logistic Regression**

In [None]:
# Logistic regression classifier created with scikit-learn's default settings
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Train the logistic regression on the scaled training features.
# y_train is a one-column DataFrame, which makes scikit-learn emit a
# DataConversionWarning ("a column-vector y was passed when a 1d array was
# expected"); np.ravel flattens it to the 1-D label array the estimator expects.
lr.fit(X_train_scaled, np.ravel(y_train))

In [None]:
# predicted class labels of the logistic regression for the scaled test features
y_predict = lr.predict(X_test_scaled)

In [None]:
# display the predicted labels
y_predict

In [None]:
# Sanity check: the test features and the predictions should have the same
# number of rows.
print(X_test.shape, y_predict.shape, sep='\n')

In [None]:
# Evaluation metrics 
from sklearn.metrics import confusion_matrix,classification_report

**Classification report for Logistic Regression**

In [None]:
# Per-class precision, recall and F1 of the logistic regression predictions.
lr_report = classification_report(y_test, y_predict)
print("Classification Report for logistic regresssion : \n", lr_report)

In [None]:
# Confusion matrix of the logistic regression (rows: true class, columns: predicted class).
lr_confusion = confusion_matrix(y_test, y_predict)
print(lr_confusion)

**KNeighborsClassifier**

In [None]:
# K-nearest neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-NN classifier voting over the 5 nearest neighbours; p=1 selects the
# Manhattan (L1) distance under the default Minkowski metric.
knn = KNeighborsClassifier(n_neighbors=5,p=1)

In [None]:
# Fit the k-NN classifier on the scaled training features.
# y_train is a one-column DataFrame; flattening it with np.ravel gives the
# 1-D label array scikit-learn expects and avoids its DataConversionWarning.
knn.fit(X_train_scaled, np.ravel(y_train))

In [None]:
# k-NN predictions for the scaled test features.
# NOTE: this rebinds y_predict, so the logistic regression predictions stored
# under the same name are no longer available after this cell.
y_predict = knn.predict(X_test_scaled)

In [None]:
# Evaluate the k-NN predictions: per-class metrics first, then the confusion matrix.
knn_report = classification_report(y_test, y_predict)
knn_confusion = confusion_matrix(y_test, y_predict)
print("Classification Report for K-Nearest Neighbours: \n", knn_report)
print("Confusion Matrix of K-Nearest Neigbours: \n", knn_confusion)