In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [13]:
train_df = pd.read_csv("/content/fraudTrain.csv" , on_bad_lines='skip')
test_df = pd.read_csv("/content/fraudTest.csv" , on_bad_lines='skip')

In [14]:
train_df.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [15]:
train_df.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,38933.0,38933.0,38932.0,38932.0,38932.0,38932.0,38932.0,38932.0,38932.0,38932.0,38932.0
mean,19466.0,4.140962e+17,72.120214,48694.978629,38.542191,-90.236719,89343.77,1326370000.0,38.542701,-90.240719,0.00994
std,11239.133352,1.305233e+18,153.93553,27003.486815,5.092785,13.953966,298787.0,566971.2,5.123869,13.970049,0.099206
min,0.0,471656200.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.040141,-166.629875,0.0
25%,9733.0,180046200000000.0,9.7,25526.0,34.6902,-96.8094,743.0,1325897000.0,34.793142,-96.928177,0.0
50%,19466.0,3518759000000000.0,47.97,48088.0,39.3465,-87.4569,2471.0,1326401000.0,39.34959,-87.362955,0.0
75%,29199.0,4635331000000000.0,83.71,72011.0,41.8467,-80.1284,21125.0,1326867000.0,41.927813,-80.153656,0.0
max,38932.0,4.992346e+18,11872.21,99783.0,65.6899,-67.9503,2906700.0,1327330000.0,66.659242,-66.967742,1.0


In [16]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38933 entries, 0 to 38932
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             38933 non-null  int64  
 1   trans_date_trans_time  38933 non-null  object 
 2   cc_num                 38933 non-null  int64  
 3   merchant               38932 non-null  object 
 4   category               38932 non-null  object 
 5   amt                    38932 non-null  float64
 6   first                  38932 non-null  object 
 7   last                   38932 non-null  object 
 8   gender                 38932 non-null  object 
 9   street                 38932 non-null  object 
 10  city                   38932 non-null  object 
 11  state                  38932 non-null  object 
 12  zip                    38932 non-null  float64
 13  lat                    38932 non-null  float64
 14  long                   38932 non-null  float64
 15  ci

In [17]:
#Checking the null value
train_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
trans_date_trans_time,0
cc_num,0
merchant,1
category,1
amt,1
first,1
last,1
gender,1
street,1


### Handling the null values in data set

In [18]:
# Handle null values (example: fill with mean for numeric, mode for categorical)
for col in train_df.columns:
  if col != "is_fraud":
    if train_df[col].dtype == 'object':
        train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    elif train_df[col].dtype in ['int64', 'float64']:
        train_df[col] = train_df[col].fillna(train_df[col].mean())
  else:
    train_df[col] = train_df[col].fillna(0)
# Verify that null values have been handled
print("\nNull values after handling:")
print(train_df.isnull().sum())



Null values after handling:
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [19]:
train_df["is_fraud"].value_counts()

Unnamed: 0_level_0,count
is_fraud,Unnamed: 1_level_1
0.0,38546
1.0,387


The Data set is inbalance

0 - No Fraud
1 - Fraud

In [20]:
train_legid_df = train_df[train_df.is_fraud == 0]
train_fraud_df = train_df[train_df.is_fraud == 1]

In [21]:
print(train_legid_df.shape)
print(train_fraud_df.shape)

(38546, 23)
(387, 23)


In [22]:
train_legid_df.amt.describe()

Unnamed: 0,amt
count,38546.0
mean,67.542486
std,142.35025
min,1.0
25%,9.63
50%,47.61
75%,82.6675
max,11872.21


In [24]:
train_fraud_df.amt.describe()

Unnamed: 0,amt
count,387.0
mean,528.071447
std,394.891985
min,4.5
25%,227.255
50%,367.29
75%,904.93
max,1334.07


AS we have very less no of fraud data point so to make a good model we have to take same number of fraud and legid data point in the data set

In [26]:
legid_sample = train_legid_df.sample(n=492)

In [27]:
new_train_df = pd.concat([legid_sample, train_fraud_df], axis=0)

In [31]:
x_train = new_train_df[['amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']]
y_train = new_train_df['is_fraud']

**Logistic Regression Model**

In [32]:
model1 = LogisticRegression()
model1.fit(x_train, y_train)

Now preprocess the testing data

In [33]:
test_df.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371817000.0,33.986391,-81.200714,0.0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371817000.0,39.450498,-109.960431,0.0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371817000.0,40.49581,-74.196111,0.0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371817000.0,28.812398,-80.883061,0.0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371817000.0,44.959148,-85.884734,0.0


In [34]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42783 entries, 0 to 42782
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             42783 non-null  int64  
 1   trans_date_trans_time  42783 non-null  object 
 2   cc_num                 42783 non-null  int64  
 3   merchant               42783 non-null  object 
 4   category               42783 non-null  object 
 5   amt                    42783 non-null  float64
 6   first                  42783 non-null  object 
 7   last                   42783 non-null  object 
 8   gender                 42783 non-null  object 
 9   street                 42783 non-null  object 
 10  city                   42783 non-null  object 
 11  state                  42783 non-null  object 
 12  zip                    42783 non-null  int64  
 13  lat                    42783 non-null  float64
 14  long                   42783 non-null  float64
 15  ci

In [35]:
test_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
trans_date_trans_time,0
cc_num,0
merchant,0
category,0
amt,0
first,0
last,0
gender,0
street,0


In [36]:
# Handle null values (example: fill with mean for numeric, mode for categorical)
for col in test_df.columns:
  if col != "is_fraud":
    if test_df[col].dtype == 'object':
        test_df[col] = test_df[col].fillna(test_df[col].mode()[0])
    elif test_df[col].dtype in ['int64', 'float64']:
        test_df[col] = test_df[col].fillna(test_df[col].mean())
  else:
    test_df[col] = test_df[col].fillna(0)
# Verify that null values have been handled
print("\nNull values after handling:")
print(test_df.isnull().sum())



Null values after handling:
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [37]:
train_df["is_fraud"].value_counts()

Unnamed: 0_level_0,count
is_fraud,Unnamed: 1_level_1
0.0,38546
1.0,387


In [38]:
x_test = test_df[['amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']]
y_test = test_df['is_fraud']

In [43]:
predicated_val = model1.predict(x_test)
test_data_accuracy = accuracy_score(predicated_val, y_test)

In [44]:
print(f"The accuracy of our model is {test_data_accuracy}")

The accuracy of our model is 0.960615197625225
