In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the raw CSV into a DataFrame. The relative path lives in one named
# constant so it is easy to adjust when the directory layout differs.
DATA_PATH = r"../input/creditcardfraud/creditcard.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check that the columns loaded as expected
df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame: column names, dtypes and non-null counts
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

In [None]:
# Count the missing values in every column (isna is the canonical name of isnull)
df.isna().sum()


In [None]:
# Number of rows carrying each Class label
df.Class.value_counts()

In [None]:
# Bar chart of the number of rows per Class label.
# The column is passed by name through `x=`: handing the Series over positionally
# while also passing `data=` is deprecated in seaborn 0.11 and raises a TypeError
# ("multiple values for argument 'data'") from seaborn 0.12 onwards.
sns.countplot(x="Class", data=df)
plt.title("Class Distrubution",fontsize=14)
plt.show()

In [None]:
# Separate the data into fraud and non-fraud cases.
# Class == 1 marks a fraudulent transaction and Class == 0 a legitimate one.
# The two filters were previously swapped, which made every later "fraud"
# summary describe the legitimate transactions and vice versa.
fraud = df[df.Class == 1]   # fraudulent transactions only
normal = df[df.Class == 0]  # legitimate (non-fraud) transactions only

#### Only 0.17% of all transactions are fraudulent, so the data is highly unbalanced. Let's first apply our models without balancing it, and if we don't get a good accuracy we can find a way to balance this dataset. In other words, we implement the models on the raw data first and balance the data only if needed.

In [None]:
# Summary statistics of the Amount column for the `fraud` subset
print("Amount details of the fraudulent transaction")
fraud["Amount"].describe()

In [None]:
# Summary statistics of the Amount column for the `normal` subset
print("details of valid transaction")
normal["Amount"].describe()

As we can clearly see from this, the average transaction amount is higher for the fraudulent transactions. This makes the problem crucial to deal with.

In [None]:
# Draw one histogram per column on a single 20x20-inch figure
HIST_FIGSIZE = (20, 20)
df.hist(figsize=HIST_FIGSIZE)
plt.show()

In [None]:
# Scatter of Time against Amount, with points coloured by Class
sns.scatterplot(data=df, x="Amount", y="Time", hue="Class")

In [None]:
# Pairwise correlation of all columns, rendered as an annotated heatmap
corrmat = df.corr()
f, ax = plt.subplots(figsize=(50, 30))  # large 50x30-inch canvas
heatmap_options = dict(vmax=.8, square=True, cbar=True, annot=True)
# Draw on the axes created above (they are also the current axes)
sns.heatmap(corrmat, ax=ax, **heatmap_options)

In the heatmap we can clearly see that most of the features do not correlate with other features, but there are some features that have either a positive or a negative correlation with each other. For example, V2 and V5 are highly negatively correlated with the feature called Amount.

In [None]:
# Split the frame into the target vector (Class) and the feature matrix
# (every remaining column)
y = df["Class"]
X = df.drop(columns=["Class"])

In [None]:
# Use scikit-learn to split the data into training and testing sets and to score the models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,f1_score

In [None]:
# Split the data into training (67%) and testing (33%) sets.
# `stratify=y` keeps the proportion of each Class label identical in both
# splits; with a heavily imbalanced target a purely random split can leave
# the test set with an unrepresentative share of the minority class.
# `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# Number of rows in the training features
X_train.shape[0]

In [None]:
# Number of labels in the test target
y_test.shape[0]

In [None]:
#Let us run Logistic regression and evaluate the performance metrics

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters, trained on the
# un-resampled training split and scored by accuracy on the test split.
log = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
pred = log.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=pred))

In [None]:
# Confusion matrix of the logistic-regression predictions
# (rows: true class, columns: predicted class)
print(confusion_matrix(y_true=y_test, y_pred=pred))

In [None]:
# F1 score of the logistic-regression predictions on the test split
print(f1_score(y_true=y_test, y_pred=pred))

In [None]:
#Let us run RandomForestClassifier and evaluate the performance metrics

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the un-resampled training split.
# A random forest is stochastic (bootstrap samples and feature subsets), so
# the seed is fixed to make the reported metrics reproducible across runs.
# NOTE(review): the variable name `random` is kept for compatibility with the
# rest of the notebook; it would shadow the standard-library `random` module
# if that were ever imported.
random = RandomForestClassifier(random_state=42)
random.fit(X_train, y_train)
pred1 = random.predict(X_test)
print(accuracy_score(y_test, pred1))

In [None]:
# Confusion matrix of the random-forest predictions
# (rows: true class, columns: predicted class)
print(confusion_matrix(y_true=y_test, y_pred=pred1))

In [None]:
# F1 score of the random-forest predictions on the test split
print(f1_score(y_true=y_test, y_pred=pred1))

In [None]:
# (rows, columns) of the `fraud` subset
fraud.shape

In [None]:
# (rows, columns) of the `normal` subset
normal.shape

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
# Undersample the majority class with NearMiss.
# Two fixes compared with the earlier version:
#   * `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
#     supported method name.
#   * only the TRAINING split is resampled. Resampling the full X, y puts
#     test rows into the training data, so any score later computed on
#     X_test would be inflated by leakage.
nm = NearMiss()
x_us, y_us = nm.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the undersampled data and evaluated on the
# held-out test split.
log = LogisticRegression().fit(x_us, y_us)  # fit() returns the estimator itself
pred5 = log.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=pred5))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the undersampled data and evaluated on the held-out
# test split. The seed is fixed because the forest is stochastic and the
# confusion matrix would otherwise change from run to run.
random = RandomForestClassifier(random_state=42)
random.fit(x_us, y_us)
pred6 = random.predict(X_test)
print(confusion_matrix(y_test, pred6))

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Seed the sampler so the randomly duplicated minority rows are the same on
# every run.
# NOTE(review): the name `os` is kept because a later cell calls it; be aware
# that it would shadow the standard-library `os` module if that is ever
# imported in this notebook.
os = RandomOverSampler(random_state=42)

In [None]:
# Oversample the minority class.
#   * `fit_resample` replaces `fit_sample`, which was removed from
#     imbalanced-learn.
#   * only the TRAINING split is resampled: oversampling the full X, y copies
#     test rows into the training data, which leaks the answers for X_test
#     and inflates every metric computed on it.
x_os, y_os = os.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the oversampled data and evaluated on the held-out
# test split. The seed is fixed so the confusion matrix is reproducible.
random = RandomForestClassifier(random_state=42)
random.fit(x_os, y_os)
pred8 = random.predict(X_test)
print(confusion_matrix(y_test, pred8))

In [None]:
# F1 score of the `pred8` predictions, expressed as a percentage
print(100 * f1_score(y_true=y_test, y_pred=pred8))