In [None]:
# importing dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek


In [None]:
credit_card_data = pd.read_csv('/content/creditcard.csv.zip')

In [None]:
# Feature Engineering
credit_card_data['Hour'] = (credit_card_data['Time'] // 3600) % 24
credit_card_data['Day'] = (credit_card_data['Time'] // (3600*24)) % 7


In [None]:
print(credit_card_data['Hour'])

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
284802    23.0
284803    23.0
284804    23.0
284805    23.0
284806    23.0
Name: Hour, Length: 284807, dtype: float64


In [None]:
# Scaling
scaler = StandardScaler()
credit_card_data['Amount'] = scaler.fit_transform(credit_card_data['Amount'].values.reshape(-1, 1))
credit_card_data['Time'] = scaler.fit_transform(credit_card_data['Time'].values.reshape(-1, 1))

In [None]:
print(credit_card_data['Time'])

0        -1.996583
1        -1.996583
2        -1.996562
3        -1.996562
4        -1.996541
            ...   
284802    1.641931
284803    1.641952
284804    1.641974
284805    1.641974
284806    1.642058
Name: Time, Length: 284807, dtype: float64


In [None]:
print(credit_card_data['Amount'])

0         0.244964
1        -0.342475
2         1.160686
3         0.140534
4        -0.073403
            ...   
284802   -0.350151
284803   -0.254117
284804   -0.081839
284805   -0.313249
284806    0.514355
Name: Amount, Length: 284807, dtype: float64


In [None]:
#print first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Hour,Day
0,-1.996583,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0,0.0,0.0
1,-1.996583,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0,0.0,0.0
2,-1.996562,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0,0.0,0.0
3,-1.996562,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0,0.0,0.0
4,-1.996541,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0,0.0,0.0


In [None]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Hour,Day
284802,1.641931,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,-0.350151,0,23.0,1.0
284803,1.641952,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,-0.254117,0,23.0,1.0
284804,1.641974,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,-0.081839,0,23.0,1.0
284805,1.641974,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,-0.313249,0,23.0,1.0
284806,1.642058,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,0.514355,0,23.0,1.0


In [None]:
#dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 33 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [None]:
#checking number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [None]:
#distribution of legit transaction and fraudlent transactions
#0 represents normal trans and 1 represents fraud
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [None]:
#this dataset is highly unbalanced
#separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [None]:
print(legit.shape)
print(fraud.shape)

(284315, 33)
(492, 33)


In [None]:
#statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,284315.0
mean,-0.000234
std,0.999942
min,-0.353229
25%,-0.33064
50%,-0.265271
75%,-0.045177
max,102.362243


In [None]:
 fraud.Amount.describe()

Unnamed: 0,Amount
count,492.0
mean,0.135382
std,1.026242
min,-0.353229
25%,-0.349231
50%,-0.316247
75%,0.070128
max,8.146182


In [None]:
#compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Hour,Day
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000513,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,-0.000234,14.050623,0.491743
1,-0.296223,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,0.135382,11.646341,0.428862


In [None]:
#build a sample dataset containing similar distribution of normal trans and fraudlent trans
#number of fraudlent trans-->492

legit_sample = legit.sample(n=492)

concatenating 2 dataset

In [None]:
new_dataset = pd.concat([legit_sample , fraud] , axis=0)

In [None]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Hour,Day
29216,-1.251827,-0.956571,0.313027,1.23148,-1.76439,-0.763919,-1.033279,0.938177,-0.207896,-1.292504,...,0.088172,0.308212,0.428233,-0.697735,0.326646,0.104017,0.205703,0,9.0,0.0
137916,-0.261579,-3.8663,3.739617,0.532225,2.792778,-1.972938,0.347191,-2.367084,-1.255298,-1.594995,...,0.270681,0.699977,-0.178704,-0.034443,-1.784129,-0.11975,-0.301294,0,22.0,0.0
66323,-0.901697,1.39304,-0.649627,-1.528482,-1.544587,1.625539,3.185696,-1.004029,0.793241,-0.764238,...,-0.070389,0.93132,0.627318,-0.288752,0.019561,0.035935,-0.199303,0,14.0,0.0
6314,-1.839112,-3.693891,-3.337292,2.484324,3.585027,4.846531,-3.220114,-2.193143,0.023881,0.145079,...,0.528167,0.309723,0.552896,0.103563,-0.31588,-0.062272,-0.319885,0,2.0,0.0
139440,-0.245469,-0.379476,0.43903,1.864327,0.08655,0.006005,0.127828,0.569715,-0.058184,-0.060265,...,-0.306828,0.076272,0.046333,0.72581,-0.115203,-0.139725,-0.241963,0,23.0,0.0


In [None]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Hour,Day
279863,1.565196,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,1.206024,1,22.0,1.0
280143,1.569513,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,-0.350191,1,23.0,1.0
280149,1.569597,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,-0.041818,1,23.0,1.0
281144,1.582548,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,0.626302,1,23.0,1.0
281674,1.590592,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,-0.183191,1,23.0,1.0


In [None]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [None]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Hour,Day
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.072977,-0.083617,0.063172,0.095246,-0.010991,0.045534,-0.020142,-0.059666,0.070026,-0.068681,...,0.015544,0.006994,-0.011954,-0.017898,0.017636,-0.013007,-0.000134,-0.083594,13.902439,0.457317
1,-0.296223,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,0.135382,11.646341,0.428862


Splitting the data into Features and Targets

In [None]:
X = new_dataset.drop(columns = 'Class' , axis=1)
Y = new_dataset['Class']

In [None]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
29216  -1.251827 -0.956571  0.313027  1.231480 -1.764390 -0.763919 -1.033279   
137916 -0.261579 -3.866300  3.739617  0.532225  2.792778 -1.972938  0.347191   
66323  -0.901697  1.393040 -0.649627 -1.528482 -1.544587  1.625539  3.185696   
6314   -1.839112 -3.693891 -3.337292  2.484324  3.585027  4.846531 -3.220114   
139440 -0.245469 -0.379476  0.439030  1.864327  0.086550  0.006005  0.127828   
...          ...       ...       ...       ...       ...       ...       ...   
279863  1.565196 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  1.569513  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  1.569597 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  1.582548 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  1.590592  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [None]:
print(Y)

29216     0
137916    0
66323     0
6314      0
139440    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


Split the data into Training data and testing data

In [None]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [None]:
print(X.shape , X_train.shape , X_test.shape)

(984, 32) (787, 32) (197, 32)


Model Training

Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
#training the logistic regression model with training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [None]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction , Y_train)

In [None]:
print('Accuracy on training data: ' , training_data_accuracy)

Accuracy on training data:  0.9415501905972046


In [None]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction , Y_test)

In [None]:
print('Accuracy score on test data :' , test_data_accuracy)

Accuracy score on test data : 0.9289340101522843


Anomaly Detection

In [None]:
from sklearn.ensemble import IsolationForest
import pandas as pd

In [None]:
# Load the dataset
df = pd.read_csv('/content/creditcard.csv.zip')


In [None]:
X = df.drop('Class', axis=1)
y = df['Class']

In [48]:
# Isolation Forest
contamination = len(y[y==1])/len(y)
iso_model = IsolationForest(contamination=contamination, random_state=42)
iso_model.fit(X)

In [50]:
y_pred_iso = iso_model.predict(X)
df['Anomaly'] = np.where(y_pred_iso==-1, 1, 0)

In [51]:
import os

# Create the 'data' directory
os.makedirs('data2', exist_ok=True)

df.to_csv('data2/anomaly_results.csv', index=False)

print("Anomaly detection completed and saved in 'data/anomaly_results.csv'!")


Anomaly detection completed and saved in 'data/anomaly_results.csv'!


Evaluate the Model

In [53]:
cm = confusion_matrix(y,df['Anomaly'])
print("\nConfusion Matrix:\n" ,  cm)
print("\nClassification Report:\n", classification_report(y,df['Anomaly']))


Confusion Matrix:
 [[283949    366]
 [   366    126]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    284315
           1       0.26      0.26      0.26       492

    accuracy                           1.00    284807
   macro avg       0.63      0.63      0.63    284807
weighted avg       1.00      1.00      1.00    284807

