In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix


import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any deprecation or convergence warnings the libraries
# used below may emit — consider narrowing the filter.
warnings.simplefilter('ignore')

In [2]:
# Load the transactions CSV into a DataFrame. The relative path presumably follows the
# Kaggle ../input layout — TODO confirm when running elsewhere.
data = pd.read_csv('../input/fraud-dataaa/Fraud.csv')

In [3]:
# Preview the first rows to check how the columns were parsed.
data.head()

In [4]:
# Column dtypes, non-null counts and memory usage before any dtype changes.
data.info()

In [5]:
# Cast the three integer columns to int32 so the frame takes less memory.
for int_col in ('step', 'isFraud', 'isFlaggedFraud'):
    data[int_col] = data[int_col].astype('int32')

In [6]:
# Re-check dtypes and memory usage after the int32 casts.
data.info()

In [7]:
# Summary statistics of the numeric columns.
data.describe()

In [8]:
data.isnull().sum()  # number of missing values in each column

In [9]:
# Drop the origin/destination account-name columns: they are large and, in the author's
# judgement, not relevant to the model.
data = data.drop(columns=['nameOrig', 'nameDest'])

In [10]:
# Number of transactions per type. The column is passed explicitly as `x=`: in current
# seaborn releases the first positional parameter of countplot is `data`, so a
# positional Series is not interpreted as the x variable.
sns.countplot(x=data['type'])
plt.xlabel("types of transactions")


# Most transactions are PAYMENT and CASH_OUT, while DEBIT has comparatively few.
print('values')
data['type'].value_counts()

In [11]:
# Class balance of the target on a logarithmic y-axis, so the rare fraud class is still
# visible next to the majority class. The class column itself is plotted; count-plotting
# np.log(value_counts()) would only draw one bar of height 1 per distinct log value.
sns.countplot(x=data['isFraud'])
plt.yscale('log')


print('no. of value counts')
print(data['isFraud'].value_counts())   # only a very small share of transactions is fraud
print('        log plot')


In [12]:
'''sns.countplot(data['isFraud'])
plt.xlabel("comparison of frauds")
print('value counts')
data['isFraud'].value_counts() '''

# Disabled alternative (kept as a string literal, so it is not executed): the same
# class-balance plot with plain counts instead of the log version.

In [13]:
# Distribution of the isFlaggedFraud column. Passed explicitly as `x=` because the first
# positional parameter of countplot is `data` in current seaborn releases.
sns.countplot(x=data['isFlaggedFraud'])
plt.xlabel("flagged frauds")

print('value counts')
data['isFlaggedFraud'].value_counts()

# Only 0.000251% of the rows are flagged (author's figure), which is negligible for a
# dataset this large, so this column is dropped as well.

In [14]:
# Remove the almost-constant isFlaggedFraud column.
data = data.drop(columns=['isFlaggedFraud'])

In [15]:
# visualizing with other columns


In [16]:
# Fraud / non-fraud counts for every transaction type, shown as a one-column table.
out = data.groupby('type')['isFraud'].value_counts()
froud = out.to_frame(name='counts')
froud

# The counts show that fraudulent transactions occur only with the CASH_OUT and
# TRANSFER types.

In [17]:
# Transaction counts per type, split by the fraud label. Columns are named by keyword
# together with `data=`: a positional Series is not interpreted as the x variable in
# current seaborn releases.
sns.countplot(x='type', hue='isFraud', data=data)

In [18]:
#data['amount'] = (data['oldbalanceOrg'] - data['newbalanceOrig'])

In [19]:
# Use the time step as the row index; `step` is no longer a regular column afterwards.
data = data.set_index('step')

In [20]:
#data

In [21]:
# Line plot of the `amount` column in row order; with `step` as the index this is meant
# to show how the amounts are distributed over the steps.
plt.plot(data['amount'])

In [22]:
# Fraud / non-fraud counts for every distinct remaining origin balance.
value = data.groupby('newbalanceOrig')['isFraud'].value_counts()

In [23]:
# Turn the grouped counts into a one-column table for display.
value = pd.DataFrame(value).set_axis(['counts'], axis='columns')
value

# Author's observation: of the 8213 fraud transactions, 8053 leave the origin account
# empty. A TRANSFER or CASH_OUT that withdraws the whole balance can therefore be
# considered a possible fraud.

In [24]:
# The destination balance columns are removed: as mentioned, they do not contain
# information for every row, and the remaining columns already describe the fraud
# transactions well.
data = data.drop(columns=['oldbalanceDest', 'newbalanceDest'])

In [25]:
# One-hot encode the transaction type: one indicator column per distinct `type` value.
dummy = pd.get_dummies(data['type'])
#dummy

In [26]:
# Append the indicator columns to the right of the existing columns.
newdata = pd.concat((data, dummy), axis='columns')

In [27]:
# Check the combined frame: original columns followed by the type indicators.
newdata.head()

In [28]:
#newdata.groupby(['TRANSFER'])['isFraud'].value_counts()

In [29]:
# Separate the dependent variable (isFraud) from the independent variables.

y = newdata['isFraud']
x = newdata.drop(columns=['isFraud'])

In [30]:
# Store the transaction-type indicator columns as float32.
for indicator in ('PAYMENT', 'DEBIT', 'CASH_OUT', 'CASH_IN', 'TRANSFER'):
    x[indicator] = x[indicator].astype('float32')

In [31]:
# The raw `type` column is removed from the features now that it is one-hot encoded.
x = x.drop(columns=['type'])

In [32]:
# Scale every feature into the range [0, 1]. MinMaxScaler is already imported in the
# first cell, so it is not re-imported here.
# By default fit_transform returns an array, so `x` is no longer a DataFrame afterwards.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so
# the test rows influence the min/max used for scaling — consider fitting on the
# training split only.
minmax = MinMaxScaler(feature_range=(0, 1))
x = minmax.fit_transform(x)

In [33]:
# Hold out 40% of the rows for testing. The split is stratified on `y` so the rare fraud
# class keeps the same proportion in the training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.40, random_state=42, stratify=y
)

In [34]:
# (rows, features) of the training partition.
x_train.shape

In [35]:
# Logistic regression with the library's default settings, fitted on the training split.
model = LogisticRegression()
model.fit(X=x_train, y=y_train)

In [36]:
# Predict the held-out rows, then report the accuracy of those predictions (the same
# quantity a classifier's score() returns).
predict = model.predict(x_test)
print('model score ', accuracy_score(y_test, predict))

In [37]:
# 4-fold cross-validation over the full scaled feature matrix; report the mean fold score.
cross_val = cross_val_score(estimator=model, X=x, y=y, cv=4)
print('cross val score ', cross_val.mean())

In [38]:
# Per-class precision, recall and F1 on the test split.
print(metrics.classification_report(y_true=y_test, y_pred=predict))

In [39]:
'''from sklearn.model_selection import GridSearchCV

c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

logreg_cv = GridSearchCV(model, param_grid, cv = 4)
logreg_cv.fit(x, y) '''

# Disabled grid search over C (string literal, not executed). The author skipped
# hyperparameter tuning because accuracy is already 99%-100%.
# NOTE(review): with so few fraud rows, accuracy alone can be high even for a model that
# misses most frauds — check the per-class recall in the classification report.

In [40]:
# Side-by-side table of true labels and predictions for the test rows, indexed like
# y_test. Only the two intended columns are built: wrapping y_test in a DataFrame first
# would add a third column (named after the Series) that merely duplicates `actual`.
final_data = pd.DataFrame(
    {'actual': y_test.to_numpy(), 'predict': predict},
    index=y_test.index,
)
final_data.head()

In [41]:
# Disabled helper (string literal, not executed) for viewing features next to the
# predicted value.
# NOTE(review): it pairs rows of the full matrix `x` with `predict`, which was computed
# from `x_test`, so the rows would not correspond — use `x_test[i]` if this is re-enabled.

'''l=[]
for i in range(100):
    z=(x[i], predict[i])
    l.append(z)
pd.DataFrame(l) '''

In [42]:
#model.predict([x_test[0]])

In [43]:
# Class counts in the test split, for comparison with the confusion matrix.
y_test.value_counts()

In [44]:
#confusion matrix

In [45]:
# Counts of test rows for each (true label, predicted label) combination.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predict)
conf_matrix

In [46]:
# Annotated heatmap of the confusion matrix; the empty fmt prints each count unformatted.
sns.heatmap(data=conf_matrix, annot=True, fmt='', cmap='Blues')

In [47]:
# Logistic fit of the fraud label against the transaction amount, without a confidence band.
# NOTE(review): the logistic fit runs over every row and may be slow on the full dataset.
sns.regplot(x='amount', y='isFraud', data=data, logistic=True, ci=None)

In [49]:
# Keep only the feature columns so their order can be matched against the coefficient
# indices printed below. errors='ignore' keeps the cell re-runnable: on a second run the
# two columns are already gone and a plain drop would raise KeyError.
newdata = newdata.drop(columns=['type', 'isFraud'], errors='ignore')
newdata.head(1)

In [50]:
# Fitted coefficients arranged as a single column, one row per feature.
importance = model.coef_.reshape(-1, 1)

In [51]:
# Print each coefficient next to its positional feature index. The coefficients are
# flattened first so every `v` is a plain scalar: formatting a 1-element array with %f
# relies on an implicit array-to-scalar conversion that NumPy has deprecated.
for i, v in enumerate(np.ravel(importance)):
    print('Feature: %0d, Score: %.5f' % (i, v))

# Author's reading of these scores: oldbalanceOrg is the most important feature.