***CODSOFT DATASCIENCE INTERNSHIP***<BR>
**TASK5:CREDIT CARD FRAUD DETECTION** <BR>
Build a machine learning model to identify fraudulent credit card
transactions.
Preprocess and normalize the transaction data, handle class
imbalance issues, and split the dataset into training and testing sets.
Train a classification algorithm, such as logistic regression or random
forests, to classify transactions as fraudulent or genuine.
Evaluate the model's performance using metrics like precision, recall,
and F1-score, and consider techniques like oversampling or
undersampling for improving results.

In [1]:
# import necessary modules 
import pandas  as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Load the credit-card transactions; the CSV is read from the working directory.
data = pd.read_csv('creditcard.csv')

# Summarise columns, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Discard the Time column; it is not used as a predictor in this notebook.
data = data.drop(columns=['Time'])

# Class distribution: the output shows 492 fraudulent (Class == 1) rows.
data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

This is an imbalanced dataset: class 0 (genuine) has 284,315 rows, while class 1 (fraud) has only 492.

In [5]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()


V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
# Number of rows that exactly repeat an earlier row (the first occurrence is
# not counted). Time was dropped above, so rows are compared on the remaining
# columns only.
data.duplicated(keep='first').sum()

9144

In [7]:
# Keep the first occurrence of each repeated row. Reassigning instead of
# mutating with inplace=True keeps the step explicit and chainable.
data = data.drop_duplicates()

In [8]:
# (rows, columns) remaining after the duplicate rows were removed.
data.shape

(275663, 30)

In [9]:
# Class distribution after de-duplication.
data.Class.value_counts()

0    275190
1       473
Name: Class, dtype: int64

In [10]:
# Features = every column except the last; target = the last column (Class).
X, Y = data.values[:, :-1], data.values[:, -1]

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# that follows, so test-set statistics influence the scaling. Consider
# fitting on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [12]:
from sklearn.model_selection import train_test_split
  
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)
  

In [13]:
# Fit a default-configured logistic regression on the training split
# (fit() returns the estimator itself, so `lr` is the fitted model).
lr = LogisticRegression().fit(X_train, Y_train)

# Hard class predictions for the held-out rows.
Y_pred = lr.predict(X_test)

In [14]:
# print classification report
from sklearn.metrics import accuracy_score
# Confusion matrix, then per-class precision / recall / F1, then overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test, Y_pred))

[[82549    11]
 [   55    84]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     82560
         1.0       0.88      0.60      0.72       139

    accuracy                           1.00     82699
   macro avg       0.94      0.80      0.86     82699
weighted avg       1.00      1.00      1.00     82699

0.9992019250535072


# SMOTE

In [15]:
# Start again from the unscaled frame: features = all but the last column,
# target = the last column (Class).
X, Y = data.values[:, :-1], data.values[:, -1]

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# NOTE(review): fitted on all rows before the split that follows; consider
# fitting the scaler on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [17]:
from sklearn.model_selection import train_test_split
  
# Same 70:30 split and seed as the baseline run, so both are scored on the
# same held-out rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)
  

In [18]:
pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [20]:
from imblearn.over_sampling import SMOTE

# Class balance of the training split before any synthetic rows are added.
# np.count_nonzero counts inside NumPy; the builtin sum() would iterate over
# every element in Python.
print("Before OverSampling, counts of label '1': ", np.count_nonzero(Y_train == 1))
print("Before OverSampling, counts of label '0': ", np.count_nonzero(Y_train == 0))

# Over-sample the minority class with SMOTE (5 nearest neighbours).
# Only the training split is resampled; X_test / Y_test stay untouched.
sm = SMOTE(random_state=10, k_neighbors=5)
X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)

print('After OverSampling, the shape of train_X: ', X_train_res.shape)
print('After OverSampling, the shape of train_y: ', Y_train_res.shape)

print("After OverSampling, counts of label '1': ", np.count_nonzero(Y_train_res == 1))
print("After OverSampling, counts of label '0': ", np.count_nonzero(Y_train_res == 0))

Before OverSampling, counts of label '1':  334
Before OverSampling, counts of label '0':  192630
After OverSampling, the shape of train_X:  (385260, 29)
After OverSampling, the shape of train_y:  (385260,)
After OverSampling, counts of label '1':  192630
After OverSampling, counts of label '0':  192630


In [21]:
# Fit on the SMOTE-balanced training data.
lr = LogisticRegression().fit(X_train_res, Y_train_res)

# Predict on the original, un-resampled test split.
Y_pred = lr.predict(X_test)

In [22]:
# Confusion matrix, then per-class precision / recall / F1, then overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test, Y_pred))

[[80455  2105]
 [   15   124]]
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99     82560
         1.0       0.06      0.89      0.10       139

    accuracy                           0.97     82699
   macro avg       0.53      0.93      0.55     82699
weighted avg       1.00      0.97      0.99     82699

0.9743648653550829


# Over sampling

In [23]:
# Class distribution of the de-duplicated frame, before random over-sampling.
data['Class'].value_counts()

0    275190
1       473
Name: Class, dtype: int64

In [24]:
from sklearn.utils import resample
# Split the de-duplicated frame by label.
df_majority = data.loc[data.Class == 0]
df_minority = data.loc[data.Class == 1]

# Random over-sampling: draw 100,000 fraud rows with replacement. The majority
# class keeps its original size, so the classes end up closer together but
# not fully balanced.
# NOTE(review): this runs before the train/test split further down, so copies
# of the same fraud row can land in both splits and inflate the reported
# scores. Consider resampling the training split only.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=100000,
    random_state=10,  # reproducible draw
)

# Stack the untouched majority rows on top of the over-sampled minority rows.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# New class counts.
df_upsampled.Class.value_counts()

0    275190
1    100000
Name: Class, dtype: int64

In [25]:
# Features = every column except the last; target = the last column (Class).
X, Y = df_upsampled.values[:, :-1], df_upsampled.values[:, -1]



In [26]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of the over-sampled data.
# NOTE(review): fitted on all rows before the split that follows; consider
# fitting the scaler on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [27]:
from sklearn.model_selection import train_test_split

# 70:30 train/test split of the over-sampled data with a fixed seed.
# NOTE(review): X already contains repeated fraud rows at this point, so the
# test split is not independent of the training split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)


In [28]:
# Fit a default-configured logistic regression on the over-sampled training split.
lr = LogisticRegression().fit(X_train, Y_train)

# Hard class predictions for the held-out rows.
Y_pred = lr.predict(X_test)

In [29]:
# Confusion matrix followed by per-class precision / recall / F1.
for metric in (confusion_matrix, classification_report):
    print(metric(Y_test, Y_pred))

[[81974   683]
 [ 3184 26716]]
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98     82657
         1.0       0.98      0.89      0.93     29900

    accuracy                           0.97    112557
   macro avg       0.97      0.94      0.95    112557
weighted avg       0.97      0.97      0.97    112557



# Under sampling

In [30]:
from sklearn.utils import resample
# Split the de-duplicated frame by label.
df_majority = data.loc[data.Class == 0]
df_minority = data.loc[data.Class == 1]

# Random under-sampling: shrink the majority (genuine) class to 1,000 rows.
# The minority class keeps all of its rows, so the result is closer to
# balanced but not 1:1.
# NOTE(review): replace=True samples with replacement, so a genuine row may be
# drawn more than once. Confirm this is intended for down-sampling.
df_majority_downsampled = resample(
    df_majority,
    replace=True,
    n_samples=1000,
    random_state=10,  # reproducible draw
)

# Despite its name, df_upsampled now holds the under-sampled data set; the
# name is kept because the following cells read it.
df_upsampled = pd.concat([df_minority, df_majority_downsampled])

# New class counts.
df_upsampled.Class.value_counts()

0    1000
1     473
Name: Class, dtype: int64

In [31]:
# Features = every column except the last; target = the last column (Class).
X, Y = df_upsampled.values[:, :-1], df_upsampled.values[:, -1]

In [32]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of the under-sampled data.
# NOTE(review): fitted on all rows before the split that follows; consider
# fitting the scaler on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [33]:
from sklearn.model_selection import train_test_split

# 70:30 train/test split of the under-sampled data with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)


In [34]:
# Fit a default-configured logistic regression on the under-sampled training split.
lr = LogisticRegression().fit(X_train, Y_train)

# Hard class predictions for the held-out rows.
Y_pred = lr.predict(X_test)

In [35]:
# Confusion matrix, then per-class precision / recall / F1, then overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test, Y_pred))

[[295   4]
 [ 16 127]]
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97       299
         1.0       0.97      0.89      0.93       143

    accuracy                           0.95       442
   macro avg       0.96      0.94      0.95       442
weighted avg       0.96      0.95      0.95       442

0.9547511312217195


## conclusion:
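We explored and preprocessed the data and trained a baseline logistic regression, which turned out to be biased towards the majority (genuine) class: it recalled only 60% of the fraud cases. We then rebalanced the data with SMOTE, random over-sampling and random under-sampling. The under-sampled model reached an accuracy of 95.47%, and fraud recall rose to 0.89 in all three rebalanced runs, i.e. far fewer type II errors (missed frauds) than the baseline. Note that the over- and under-sampled runs are scored on resampled test sets, so their precision is not directly comparable with the baseline and SMOTE runs.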