# CREDIT CARD FRAUD DETECTION

#### Problem Statement

In [31]:
import pandas as pd
import numpy as np

In [32]:
df = pd.read_csv("creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [33]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [68]:
Normal =df[df.Class == 0] # Non Fraud
Normal.shape

(284315, 31)

In [69]:
Fraud =df[df.Class == 1]
Fraud.shape

(492, 31)

In [70]:
Normal.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [71]:
Fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

## Sampling Technique

#### legit_sample = legit.sample(n=492) is used to randomly select 492 rows from the legit DataFrame, which contains non-fraudulent transactions.

#### Context:
#### When dealing with class imbalance (where there are significantly more non-fraudulent transactions than fraudulent ones), sampling techniques are used to balance the dataset. Here, we're sampling non-fraudulent transactions to match the number of fraudulent transactions.

#### Example:
#### Suppose we have 10,000 non-fraudulent transactions and only 492 fraudulent transactions. Training a model on this imbalanced dataset may lead to biased predictions, favoring non-fraudulent transactions

In [72]:
# legit: DataFrame containing non-fraudulent transactions
# Sample 492 random rows from legit

Normal_sample = Normal.sample(n=492)

In [73]:
new_data = pd.concat([Normal_sample,Fraud], axis= 0)
new_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
271483,164594.0,1.926164,-0.001949,-1.915123,0.404778,0.337386,-1.013697,0.248887,-0.246579,0.336277,...,0.255566,0.753094,-0.12468,-0.404528,0.246383,-0.107261,-0.015831,-0.032259,60.37,0
91963,63708.0,-1.459767,-0.664257,1.316152,0.850461,-0.942438,0.159266,-0.694183,0.860396,-1.356843,...,-0.040599,-0.08603,0.36572,0.131276,-0.531571,-0.269827,0.322438,0.038009,135.58,0
202335,134280.0,-1.788082,0.674332,-0.76665,-1.543573,1.74865,3.834361,-0.899494,2.011524,-0.126112,...,0.267804,0.529794,-0.179325,0.742938,0.057672,0.420072,0.248429,0.044292,50.0,0
109706,71501.0,1.071826,-0.672807,0.77715,-0.030407,-0.710537,0.811098,-0.808759,0.396222,1.051854,...,-0.173149,-0.377527,-0.028421,-0.591411,0.090578,0.992967,-0.034881,0.002042,61.16,0
159590,112696.0,2.128765,-0.033321,-1.489893,0.155319,0.463302,-0.512098,0.195884,-0.282758,0.502935,...,-0.344345,-0.809322,0.209151,-1.043522,-0.148999,0.263716,-0.064133,-0.069671,1.29,0


In [74]:
X = new_data.drop(columns='Class', axis=1)
Y = new_data['Class']

In [75]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [86]:
from sklearn.model_selection import train_test_split
x_train ,x_test , y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [87]:
x_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
100623,67571.0,-0.758469,-0.045410,-0.168438,-1.313275,-1.901763,0.739433,3.071892,-0.483422,0.618203,...,-0.032500,0.042619,0.397224,0.072229,-0.242276,0.560916,-0.540955,0.150606,-0.117140,549.06
111700,72331.0,-5.467393,4.520035,-0.956856,-2.375816,-1.511242,-1.500989,0.122696,0.446829,4.373871,...,2.595985,-0.827981,-0.738975,0.306334,0.307665,0.779885,0.746283,1.653435,0.538122,0.77
28618,35081.0,-5.519577,-6.991976,2.697300,-1.394760,5.913509,-4.919245,-4.284688,0.601913,2.312010,...,1.522020,0.677240,0.224681,0.840202,0.059830,0.808195,-0.221072,-0.356809,-0.037947,0.47
192820,129863.0,-0.640591,0.792379,-0.550904,0.440691,0.780644,-0.677800,0.139408,0.441543,-0.130314,...,-0.017457,0.138846,0.341791,-0.185434,0.590150,-0.408645,0.712096,0.044082,0.086625,1.00
261737,160162.0,1.936934,-0.312623,-1.424800,-0.002445,0.747349,0.901657,-0.214479,0.312875,0.419486,...,-0.331915,-0.206495,-0.420270,0.342236,-1.650044,-0.481387,0.312361,-0.029951,-0.088805,1.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24728,33355.0,0.118377,-1.941478,0.228865,0.656357,-1.243353,0.177648,0.182244,0.054827,0.626448,...,0.895189,0.320898,-0.096119,-0.354466,0.186211,-0.137521,0.998363,-0.127213,0.095892,535.00
12436,21804.0,1.046817,-0.145300,1.200105,1.039810,-0.875115,-0.125523,-0.529054,-0.012157,1.684114,...,0.019733,0.017125,0.418177,-0.136465,0.571564,0.423850,0.432941,-0.029142,0.015409,59.90
223618,143456.0,-2.006582,3.676577,-5.463811,7.232058,-1.627859,-0.996755,-4.299833,2.268867,-3.651067,...,0.474414,0.713907,-0.063868,0.167947,-0.449864,0.023702,0.536905,0.485864,-0.042393,1.00
43061,41353.0,-15.020981,8.075240,-16.298091,5.664820,-11.918153,-4.246957,-14.716668,9.435084,-6.795398,...,-0.995787,2.525115,-0.832074,-0.186117,0.429781,0.697103,0.056031,-1.310888,-0.707403,34.12


In [88]:
y_test

247995    1
3459      0
258262    0
238247    0
15451     1
         ..
108258    1
12108     1
99506     1
74496     1
95836     0
Name: Class, Length: 197, dtype: int64

In [89]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [90]:
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [91]:
model.predict(x_test)

array([1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0],
      dtype=int64)

In [99]:
import math
accuracy_tarining = round(model.score(x_train,y_train)* 100,2)
print("Accuracy of Model on Training : ",accuracy_tarining,"%")

Accuracy of Model on Training :  94.79 %


In [100]:
accuracy = round(model.score(x_test,y_test)* 100,2)

In [101]:
print("Accuracy of Model on Testing : ",accuracy,"%")

Accuracy of Model on Testing :  92.89 %
