In [51]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): google.colab is only importable on Colab; this cell will not run elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from imblearn.over_sampling import SMOTE

In [53]:
# Locations of the two input CSV files on the mounted Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/anyas-gojo-revival-bondman/trainData.csv'
TEST_CSV_PATH = '/content/drive/MyDrive/anyas-gojo-revival-bondman/testData.csv'

# Load both files into DataFrames; every later cell transforms these two frames.
train_data = pd.read_csv(TRAIN_CSV_PATH)
test_data = pd.read_csv(TEST_CSV_PATH)

In [54]:
# Remove pandas' row/column display limits so wide frames are rendered in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# First look at the raw training data.
train_data.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,1,2019-12-15 10:35:10,3597926034019603,fraud_Ruecker Group,misc_net,7.19,Derrick,Flores,M,83690 Nicholas Ports Apt. 846,Oakland,TN,38060,35.2229,-89.5518,9496,Furniture conservator/restorer,1993-03-23,2642fec2e19c9e18e61fe9a88f6bc79a,1355567710,35.189455,-90.121666,0
1,2,2019-04-10 21:49:53,4873783502705038,fraud_Lynch Ltd,shopping_pos,6.32,Elizabeth,Maxwell,F,194 Goodman Fall Apt. 569,Burlington,WA,98233,48.4786,-122.3345,14871,Public house manager,1974-03-10,61cec15bdbd3db31b12f886e8e181380,1334094593,49.004316,-122.745016,0
2,3,2020-02-16 10:56:18,213161869125933,fraud_Bauch-Raynor,grocery_pos,147.59,Monica,Lane,F,3270 Scott Islands,East Andover,ME,4226,44.6084,-70.6993,190,Animal nutritionist,1970-04-17,c8fcf575540e37ce0822cd040148ff57,1361012178,44.455449,-70.809148,0
3,4,2020-04-30 16:14:16,374656033243756,"fraud_Effertz, Welch and Schowalter",entertainment,32.4,David,Lewis,M,1499 Michael Rue,Arlington,VA,22213,38.8954,-77.1633,207410,Mudlogger,1984-07-03,9e9cde0e0963ac746a2cc9938e171c9a,1367338456,39.476941,-77.613438,0
4,5,2019-12-05 21:07:59,6011999606625827,fraud_Wilkinson LLC,personal_care,29.79,Ronald,Carson,M,870 Rocha Drive,Harrington Park,NJ,7640,40.9918,-73.98,4664,"Radiographer, diagnostic",1965-06-30,0742473d7f6261ce366b43ece063faf3,1354741679,40.863808,-74.674018,0


In [55]:
# Columns that are not used as model features; the same list is removed from both frames.
drops = [
    'id', 'cc_num', 'first', 'last', 'street', 'city',
    'zip', 'city_pop', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
]
train_data = train_data.drop(columns=drops)
test_data = test_data.drop(columns=drops)


In [56]:
# Preview the training frame to confirm which columns remain after the drop.
train_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,state,lat,long,job,dob,is_fraud
0,2019-12-15 10:35:10,fraud_Ruecker Group,misc_net,7.19,M,TN,35.2229,-89.5518,Furniture conservator/restorer,1993-03-23,0
1,2019-04-10 21:49:53,fraud_Lynch Ltd,shopping_pos,6.32,F,WA,48.4786,-122.3345,Public house manager,1974-03-10,0
2,2020-02-16 10:56:18,fraud_Bauch-Raynor,grocery_pos,147.59,F,ME,44.6084,-70.6993,Animal nutritionist,1970-04-17,0
3,2020-04-30 16:14:16,"fraud_Effertz, Welch and Schowalter",entertainment,32.4,M,VA,38.8954,-77.1633,Mudlogger,1984-07-03,0
4,2019-12-05 21:07:59,fraud_Wilkinson LLC,personal_care,29.79,M,NJ,40.9918,-73.98,"Radiographer, diagnostic",1965-06-30,0


In [57]:
# Encode gender as a binary indicator: 1 where gender is 'M', 0 for every other value.
train_data['male'] = (train_data['gender'] == 'M').astype(int)
test_data['male'] = (test_data['gender'] == 'M').astype(int)
test_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,state,lat,long,job,dob,male
0,2020-06-21 12:14:25,fraud_Kirlin and Sons,personal_care,2.86,M,SC,33.9659,-80.9355,Mechanical engineer,1968-03-19,1
1,2020-06-21 12:14:33,fraud_Sporer-Keebler,personal_care,29.84,F,UT,40.3207,-110.436,"Sales professional, IT",1990-01-17,0
2,2020-06-21 12:14:53,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,F,NY,40.6729,-73.5365,"Librarian, public",1970-10-21,0
3,2020-06-21 12:15:15,fraud_Haley Group,misc_pos,60.05,M,FL,28.5697,-80.8191,Set designer,1987-07-25,1
4,2020-06-21 12:15:17,fraud_Johnston-Casper,travel,3.19,M,MI,44.2529,-85.017,Furniture designer,1955-07-06,1


In [58]:
# The raw gender column is redundant now that the 'male' indicator exists.
drops = ['gender']
train_data = train_data.drop(columns=drops)
test_data = test_data.drop(columns=drops)


In [59]:
# Only the hour of the transaction is kept as a feature (stored in 'time').
# The raw column holds strings, so it is parsed to datetime first so that
# the .dt accessor can be used.
for frame in (train_data, test_data):
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    frame['time'] = frame['trans_date_trans_time'].dt.hour

# A notebook cell only renders its last expression, so the training preview
# has to be printed explicitly; otherwise it is silently discarded.
print(train_data.head())
test_data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,state,lat,long,job,dob,male,time
0,2020-06-21 12:14:25,fraud_Kirlin and Sons,personal_care,2.86,SC,33.9659,-80.9355,Mechanical engineer,1968-03-19,1,12
1,2020-06-21 12:14:33,fraud_Sporer-Keebler,personal_care,29.84,UT,40.3207,-110.436,"Sales professional, IT",1990-01-17,0,12
2,2020-06-21 12:14:53,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,NY,40.6729,-73.5365,"Librarian, public",1970-10-21,0,12
3,2020-06-21 12:15:15,fraud_Haley Group,misc_pos,60.05,FL,28.5697,-80.8191,Set designer,1987-07-25,1,12
4,2020-06-21 12:15:17,fraud_Johnston-Casper,travel,3.19,MI,44.2529,-85.017,Furniture designer,1955-07-06,1,12


In [60]:
# The full timestamp is no longer needed once the hour has been extracted into 'time'.
drops = ['trans_date_trans_time']
train_data = train_data.drop(columns=drops)
test_data = test_data.drop(columns=drops)


In [61]:
# Preview the test frame to confirm the timestamp column is gone and 'time' is present.
test_data.head()

Unnamed: 0,merchant,category,amt,state,lat,long,job,dob,male,time
0,fraud_Kirlin and Sons,personal_care,2.86,SC,33.9659,-80.9355,Mechanical engineer,1968-03-19,1,12
1,fraud_Sporer-Keebler,personal_care,29.84,UT,40.3207,-110.436,"Sales professional, IT",1990-01-17,0,12
2,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,NY,40.6729,-73.5365,"Librarian, public",1970-10-21,0,12
3,fraud_Haley Group,misc_pos,60.05,FL,28.5697,-80.8191,Set designer,1987-07-25,1,12
4,fraud_Johnston-Casper,travel,3.19,MI,44.2529,-85.017,Furniture designer,1955-07-06,1,12


In [62]:
# Age is measured against a fixed reference date (November 13, 2023) so that
# re-running the notebook always produces the same feature values.
reference_date = pd.to_datetime('2023-11-13')

# Mean length of a Gregorian year in days, used to turn elapsed days into years.
DAYS_PER_YEAR = 365.2425


def add_age_column(frame, as_of):
    """Parse the 'dob' column of ``frame`` and add an 'age' column in whole years.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a 'dob' column formatted as YYYY-MM-DD. Modified in place:
        'dob' is converted to datetime and 'age' is added.
    as_of : pandas.Timestamp
        Date against which the age is computed.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenient reassignment.
    """
    frame['dob'] = pd.to_datetime(frame['dob'], format='%Y-%m-%d')
    elapsed_days = (as_of - frame['dob']).dt.days
    # Casting a timedelta Series to a year resolution ('<m8[Y]') is rejected by
    # pandas >= 2.0, which only supports s/ms/us/ns. Flooring the day count
    # divided by the mean year length yields the same whole-year float values.
    frame['age'] = np.floor(elapsed_days / DAYS_PER_YEAR)
    return frame


train_data = add_age_column(train_data, reference_date)
test_data = add_age_column(test_data, reference_date)

In [63]:
# Preview the training frame to check the newly added 'age' column.
train_data.head()

Unnamed: 0,merchant,category,amt,state,lat,long,job,dob,is_fraud,male,time,age
0,fraud_Ruecker Group,misc_net,7.19,TN,35.2229,-89.5518,Furniture conservator/restorer,1993-03-23,0,1,10,30.0
1,fraud_Lynch Ltd,shopping_pos,6.32,WA,48.4786,-122.3345,Public house manager,1974-03-10,0,0,21,49.0
2,fraud_Bauch-Raynor,grocery_pos,147.59,ME,44.6084,-70.6993,Animal nutritionist,1970-04-17,0,0,10,53.0
3,"fraud_Effertz, Welch and Schowalter",entertainment,32.4,VA,38.8954,-77.1633,Mudlogger,1984-07-03,0,1,16,39.0
4,fraud_Wilkinson LLC,personal_care,29.79,NJ,40.9918,-73.98,"Radiographer, diagnostic",1965-06-30,0,1,21,58.0


In [64]:
# The date of birth is superseded by the derived 'age' column.
drops = ['dob']
train_data = train_data.drop(columns=drops)
test_data = test_data.drop(columns=drops)


In [65]:
# Preview the training frame to confirm 'dob' has been removed.
train_data.head()


Unnamed: 0,merchant,category,amt,state,lat,long,job,is_fraud,male,time,age
0,fraud_Ruecker Group,misc_net,7.19,TN,35.2229,-89.5518,Furniture conservator/restorer,0,1,10,30.0
1,fraud_Lynch Ltd,shopping_pos,6.32,WA,48.4786,-122.3345,Public house manager,0,0,21,49.0
2,fraud_Bauch-Raynor,grocery_pos,147.59,ME,44.6084,-70.6993,Animal nutritionist,0,0,10,53.0
3,"fraud_Effertz, Welch and Schowalter",entertainment,32.4,VA,38.8954,-77.1633,Mudlogger,0,1,16,39.0
4,fraud_Wilkinson LLC,personal_care,29.79,NJ,40.9918,-73.98,"Radiographer, diagnostic",0,1,21,58.0


In [66]:
# Number of distinct merchants in each frame (missing values are not counted).
print(train_data['merchant'].nunique())
print(test_data['merchant'].nunique())

def isEqual(data1, data2):
  """Report whether two Series contain exactly the same set of distinct values.

  Prints the number of distinct values found in each Series, then returns
  the string "yes" when the sorted distinct values are identical and "no"
  otherwise.
  """
  uniques_a, uniques_b = data1.unique(), data2.unique()
  print('length of data 1', len(uniques_a))
  print('length of data 2', len(uniques_b))
  # Sorting first makes the comparison independent of order of appearance.
  same_values = sorted(uniques_a) == sorted(uniques_b)
  return "yes" if same_values else "no"

# Check, for every categorical column, whether train and test share the same distinct values.
for categorical_column in ('merchant', 'state', 'category', 'job'):
    print(isEqual(train_data[categorical_column], test_data[categorical_column]))


693
693
length of data 1 693
length of data 2 693
yes
length of data 1 51
length of data 2 50
no
length of data 1 14
length of data 2 14
yes
length of data 1 494
length of data 2 478
no


In [67]:
# removing fraud_ from all the values of merchant column
train_data['merchant'] = train_data['merchant'].str.replace('fraud_', '')
test_data['merchant'] = test_data['merchant'].str.replace('fraud_', '')
test_data.head()

Unnamed: 0,merchant,category,amt,state,lat,long,job,male,time,age
0,Kirlin and Sons,personal_care,2.86,SC,33.9659,-80.9355,Mechanical engineer,1,12,55.0
1,Sporer-Keebler,personal_care,29.84,UT,40.3207,-110.436,"Sales professional, IT",0,12,33.0
2,"Swaniawski, Nitzsche and Welch",health_fitness,41.28,NY,40.6729,-73.5365,"Librarian, public",0,12,53.0
3,Haley Group,misc_pos,60.05,FL,28.5697,-80.8191,Set designer,1,12,36.0
4,Johnston-Casper,travel,3.19,MI,44.2529,-85.017,Furniture designer,1,12,68.0


In [68]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_and_drop(df, column_name, max_features=50):
    """Replace a text column with bag-of-words token-count columns.

    A new CountVectorizer is fitted on ``df[column_name]`` on every call, so
    frames processed in separate calls may end up with different token columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    column_name : str
        Name of the text column to vectorize and then remove.
    max_features : int, default 50
        Passed to CountVectorizer to cap the number of token columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with one count column per token appended and the
        original text column removed.
    """
    print(f'Vectorizing {column_name} column')
    vectorizer = CountVectorizer(max_features=max_features)

    # Learn the vocabulary and count tokens in a single pass.
    token_counts = vectorizer.fit_transform(df[column_name])

    # Build the count frame on df's own index. pd.concat(axis=1) aligns rows by
    # index label, so a fresh 0..n-1 index would misalign the counts (and add
    # NaN-filled rows) whenever df does not carry a default RangeIndex.
    counts_df = pd.DataFrame(
        token_counts.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )

    # Append the token columns, then remove the source text column.
    df = pd.concat([df, counts_df], axis=1)
    df = df.drop(column_name, axis=1)

    return df

def vectorize_text_columns(df, text_columns, max_features=50):
    """Apply ``vectorize_and_drop`` to each listed column in turn.

    Each column in ``text_columns`` is replaced by its token-count columns,
    with ``max_features`` forwarded unchanged; the fully transformed frame
    is returned.
    """
    result = df
    for text_column in text_columns:
        result = vectorize_and_drop(result, text_column, max_features)
    return result

# Text columns that are replaced by token-count columns.
text_columns_to_vectorize = ['merchant', 'state', 'category', 'job']

# Vectorize each frame. The two calls are independent of each other.
train_data = vectorize_text_columns(train_data, text_columns_to_vectorize)
test_data = vectorize_text_columns(test_data, text_columns_to_vectorize)

# Show each preview directly under its own label. Printed output is emitted
# before the cell's final expression, so the test preview is printed first and
# the train preview is rendered last, right after its label.
print("\nTest Data Vectorized:")
print(test_data.head())

print("Train Data Vectorized:")
train_data.head()


Vectorizing merchant column
Vectorizing state column
Vectorizing category column
Vectorizing job column
Vectorizing merchant column
Vectorizing state column
Vectorizing category column
Vectorizing job column
Train Data Vectorized:

Test Data Vectorized:
     amt      lat      long  male  time   age  and  bahringer  bauch  \
0   2.86  33.9659  -80.9355     1    12  55.0    1          0      0   
1  29.84  40.3207 -110.4360     0    12  33.0    0          0      0   
2  41.28  40.6729  -73.5365     0    12  53.0    1          0      0   
3  60.05  28.5697  -80.8191     1    12  36.0    0          0      0   
4   3.19  44.2529  -85.0170     1    12  68.0    0          0      0   

   baumbach  bins  boehm  conroy  cruickshank  daugherty  erdman  friesen  \
0         0     0      0       0            0          0       0        0   
1         0     0      0       0            0          0       0        0   
2         0     0      0       0            0          0       0        0   
3    

Unnamed: 0,amt,lat,long,is_fraud,male,time,age,and,bahringer,bauch,baumbach,bins,boehm,conroy,cruickshank,daugherty,erdman,friesen,fritsch,goyette,greenholt,group,gutmann,harris,hauck,hills,hudson,huel,inc,jakubowski,kertzmann,kilback,koepp,koss,kovacek,kutch,lesch,llc,ltd,lynch,morar,plc,pouros,rau,reichert,reilly,romaguera,schaefer,schneider,schoen,schuppe,sons,streich,weimann,welch,witting,yost,ak,al,ar,az,ca,co,ct,dc,fl,ga,hi,ia,id,il,in,ks,ky,la,ma,md,me,mi,mn,mo,ms,mt,nc,nd,ne,nh,nj,nm,nv,ny,oh,ok,or,pa,ri,sc,sd,tn,tx,ut,va,vt,wa,wi,wv,wy,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,administrator,advertising,adviser,and.1,assistant,audiological,biomedical,chartered,chief,community,consultant,designer,development,editor,education,educational,engineer,executive,film,financial,geologist,geomatics,health,inspector,insurance,it,land,librarian,manager,medical,nurse,officer,production,psychologist,psychotherapist,public,radio,research,researcher,sales,scientist,surveyor,systems,tax,teacher,technologist,television,therapist,video,worker
0,7.19,35.2229,-89.5518,0,1,10,30.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,6.32,48.4786,-122.3345,0,0,21,49.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,147.59,44.6084,-70.6993,0,0,10,53.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,32.4,38.8954,-77.1633,0,1,16,39.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,29.79,40.9918,-73.98,0,1,21,58.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [69]:
# Compare the (rows, columns) shape of both frames after vectorization.
print(train_data.shape, test_data.shape, sep='\n')


(523174, 171)
(555719, 170)


In [70]:

# Columns present in train_data but not in test_data
missing_columns_train = set(train_data.columns) - set(test_data.columns)

# Columns present in test_data but not in train_data
missing_columns_test = set(test_data.columns) - set(train_data.columns)

print("Columns present in train_data but not in test_data:", missing_columns_train)
print("Columns present in test_data but not in train_data:", missing_columns_test)

# Drop every column that exists in only one of the two frames so both end up
# with the same feature columns. The drop lists are derived from the computed
# differences instead of being typed in by hand, so the cell stays correct for
# other data and does not fail when re-run. The label 'is_fraud' exists only in
# train_data and must be kept there.
drops = sorted(missing_columns_train - {'is_fraud'})
train_data = train_data.drop(drops, axis = 1)
drops = sorted(missing_columns_test)
test_data = test_data.drop(drops, axis = 1)

Columns present in train_data but not in test_data: {'fritsch', 'is_fraud', 'lesch'}
Columns present in test_data but not in train_data: {'thiel', 'kihn'}


Because there are so many columns and so many decisions involved, we will use a random forest model.

In [74]:

# Separate the label from the feature columns of the training frame.
y_train = train_data['is_fraud']
X_train = train_data.drop(columns=['is_fraud'])

# The test frame is used as the feature matrix as-is.
X_test = test_data

# Fit a 130-tree random forest on the complete training set; the fixed seed
# keeps the fitted model reproducible.
random_forest_model = RandomForestClassifier(n_estimators=130, random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is kept as the probability of the positive class.
y_pred_proba = random_forest_model.predict_proba(X_test)[:, 1]

# A row is labelled 1 when its probability reaches the threshold.
threshold = 0.4
y_pred_binary = (y_pred_proba >= threshold).astype(int)

# Assemble the submission table with sequential ids 1..n.
# NOTE(review): assumes test rows are still in file order and that the original
# ids start at 1 - the 'id' column was dropped earlier, so verify before submitting.
n_predictions = len(y_pred_binary)
predictions_df = pd.DataFrame({
    'id': range(1, n_predictions + 1),
    'is_fraud': y_pred_binary,
})

# Write the predictions to a CSV file without the DataFrame index.
predictions_df.to_csv('predictions.csv', index=False)


In [75]:
# Inspect the first ten predicted positive-class probabilities.
y_pred_proba[:10]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00769231, 0.        , 0.        , 0.        , 0.        ])

In [76]:
# Compare the label distribution in the training data with the predicted labels.
# Only the last expression of a cell is rendered, so the training counts are
# printed explicitly; otherwise they are never shown.
print(train_data.is_fraud.value_counts())
predictions_df.is_fraud.value_counts()

0    553652
1      2067
Name: is_fraud, dtype: int64