In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the transactions CSV (path is relative to the working directory)
# into a DataFrame, then preview its first five rows.
credit_card_data = pd.read_csv(filepath_or_buffer='creditcard.csv')
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Peek at a single randomly chosen row (unseeded, so the row differs per run).
credit_card_data.sample(n=1)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
562,421,-1.397776,0.004573,2.528964,0.648279,-0.196625,0.16319,-0.278296,0.366045,0.435493,...,-0.087222,0.150044,0.322306,0.091753,0.171955,0.509403,0.266858,0.094296,12.41,0


In [4]:
# Dataset information: print a structural summary of the DataFrame
# (index range, column names, non-null counts and dtypes).
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    945 non-null    int64  
 1   V1      945 non-null    float64
 2   V2      945 non-null    float64
 3   V3      945 non-null    float64
 4   V4      945 non-null    float64
 5   V5      945 non-null    float64
 6   V6      945 non-null    float64
 7   V7      945 non-null    float64
 8   V8      945 non-null    float64
 9   V9      945 non-null    float64
 10  V10     945 non-null    float64
 11  V11     945 non-null    float64
 12  V12     945 non-null    float64
 13  V13     945 non-null    float64
 14  V14     945 non-null    float64
 15  V15     945 non-null    float64
 16  V16     945 non-null    float64
 17  V17     945 non-null    float64
 18  V18     945 non-null    float64
 19  V19     945 non-null    float64
 20  V20     945 non-null    float64
 21  V21     945 non-null    float64
 22  V2

In [5]:
# Count the missing entries in every column.
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
# How many rows carry each Class label (0 = legit, 1 = fraudulent).
credit_card_data.Class.value_counts()

Class
0    943
1      2
Name: count, dtype: int64

In [7]:
# Partition the rows by label: Class 0 -> legit, Class 1 -> fraud.
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [8]:
# Show the label column of the fraud subset (also reveals its row indices).
fraud.Class

541    1
623    1
Name: Class, dtype: int64

In [9]:
# Summary statistics of the transaction Amount for legit rows.
legit['Amount'].describe()

count     943.000000
mean       65.594698
std       189.025173
min         0.000000
25%         5.680000
50%        17.240000
75%        54.995000
max      3828.040000
Name: Amount, dtype: float64

In [10]:
# Summary statistics of the transaction Amount for fraud rows.
fraud['Amount'].describe()

count      2.000000
mean     264.500000
std      374.059487
min        0.000000
25%      132.250000
50%      264.500000
75%      396.750000
max      529.000000
Name: Amount, dtype: float64

In [11]:
# Per-class mean of every feature column, to compare legit vs. fraud rows.
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,350.108165,-0.16971,0.232219,0.862932,0.250289,-0.026254,0.151634,0.104362,-0.054465,-0.010784,...,0.053844,0.000112,-0.118644,-0.038099,-0.006108,0.111039,0.028232,0.012743,-0.017341,65.594698
1,439.0,-2.677884,-0.602658,-0.260694,3.143275,0.418809,-1.245684,-1.105907,0.661932,-1.520521,...,1.114625,0.589464,0.200214,0.455377,0.013198,0.162159,0.016239,0.004186,-0.053756,264.5


In [12]:
# Under-sample the legit class. The fixed seed (same value the train/test split
# uses) makes the draw repeatable, so the split, the fit and the reported
# accuracies reproduce on Restart & Run All.
# NOTE(review): 492 is hard-coded, while the recorded run shows only 2 fraud
# rows in the loaded file, so the sample stays heavily imbalanced. Confirm
# whether the count should track len(fraud); the stratified 20% split further
# down needs enough rows per class to work with a smaller sample.
legit_sample = legit.sample(n=492, random_state=2)

In [13]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concat;
# the original index labels are kept).
new_df = pd.concat([legit_sample, fraud])

In [14]:
# Display the combined frame (pandas truncates the rendering of long frames).
new_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
476,352,-1.046362,0.720386,1.603413,0.608371,0.460941,-1.423406,0.266079,-0.141048,-0.516793,...,-0.231027,-0.529575,-0.106293,0.704604,-0.349903,0.146041,0.289965,0.159572,2.67,0
836,636,-0.220314,1.371503,0.039843,0.607252,0.524369,-0.520815,1.274831,-0.355195,-0.321178,...,0.045557,0.688990,-0.041787,0.022218,-0.597178,-0.431218,0.514280,0.100229,37.99,0
132,83,-1.864990,0.910874,1.724863,-1.748371,0.578943,-0.832531,1.901440,-1.913986,2.112375,...,-0.318597,0.073323,-0.061693,0.547204,-0.466798,0.408030,-2.377933,-1.255549,7.69,0
938,711,1.341534,-0.066799,-0.118841,-0.455025,-0.030501,-0.172391,-0.182207,0.011360,0.138976,...,-0.141317,-0.437092,-0.103720,-0.777226,0.352175,1.016759,-0.091739,-0.019582,0.77,0
54,37,1.295668,0.341483,0.081505,0.566746,-0.110459,-0.766325,0.073155,-0.168304,0.071837,...,-0.323607,-0.929781,0.063809,-0.193565,0.287574,0.127881,-0.023731,0.025200,0.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,218,-0.391725,1.106355,0.787765,0.510525,0.832968,0.151659,0.588155,0.312914,-0.772829,...,0.122192,0.365018,-0.213950,-0.421854,0.045355,-0.323299,0.080659,0.037889,1.00,0
347,257,-0.599318,0.887525,1.579214,-0.113728,0.304991,-0.182829,0.503722,0.241375,-0.722596,...,-0.137631,-0.524586,-0.057729,-0.058980,-0.320342,0.089180,0.058979,0.098448,1.98,0
804,610,1.125910,-0.287398,0.779265,0.826384,-0.807977,-0.188230,-0.407289,0.010140,0.833075,...,0.068431,0.313584,-0.194061,0.110387,0.522330,0.526339,-0.005287,0.024486,59.90,0
541,406,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1


In [15]:
# Class balance of the combined sample.
new_df.Class.value_counts()

Class
0    492
1      2
Name: count, dtype: int64

In [16]:
# Per-class feature means of the combined sample.
new_df.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,344.857724,-0.098317,0.258558,0.836088,0.25747,-0.033001,0.126735,0.139807,-0.03556,-0.017896,...,0.042841,-0.014432,-0.125412,-0.039335,-0.004064,0.128124,0.029416,0.014515,-0.004644,62.480305
1,439.0,-2.677884,-0.602658,-0.260694,3.143275,0.418809,-1.245684,-1.105907,0.661932,-1.520521,...,1.114625,0.589464,0.200214,0.455377,0.013198,0.162159,0.016239,0.004186,-0.053756,264.5


In [17]:
# Features: every column except the label. Target: the Class label column.
X = new_df.drop(columns=['Class'])
Y = new_df.Class

In [18]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions aligned between the two parts; the seed fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [19]:
# Raise the solver's iteration cap above the library default: the recorded fit
# stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the optimiser had
# not converged, so the reported accuracies came from an unfinished fit.
# If the warning persists, standardise the features (as the warning suggests)
# or raise the cap further.
model = LogisticRegression(max_iter=1000)

In [20]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

# accuracy on training data
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
# NOTE(review): with the class counts recorded for the sample (492 vs 2),
# accuracy alone says little about fraud detection - consider a per-class
# metric as well.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# accuracy on test data
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  1.0
