In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Read fraudTrain.csv (path relative to the notebook's working directory) into df1.
df1 = pd.read_csv("fraudTrain.csv")
# Preview the first rows.
# NOTE(review): the preview includes first/last/street columns -- confirm the
# data is synthetic or redact the rendered output before sharing the notebook.
df1.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df1.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [4]:
# Column names, non-null counts and dtypes -- a quick check for missing values.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [5]:
def preprocess_data(df):
    """Turn raw transaction records into a numeric, model-ready feature frame.

    Steps, in order:
      1. Parse ``trans_date_trans_time`` and derive ``transaction_hour`` and
         ``transaction_day_of_week`` from it.
      2. One-hot encode ``category`` and ``gender``.
      3. Drop identifier / free-text columns that are not used as features.
      4. Standardize the continuous columns with a freshly fitted
         ``StandardScaler``.

    The input frame is left untouched: all derived columns are written to a
    new frame, so the function can be called again on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions. Must contain every column referenced below
        (the timestamp, the two categorical columns, the dropped columns and
        the six numeric columns).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the scaled numeric columns, the two time features,
        the one-hot indicator columns and any column not explicitly dropped
        (e.g. the ``is_fraud`` label when present).

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.

    Notes
    -----
    NOTE(review): the scaler is fitted on every row passed in. If the result
    is split into train/test afterwards, the test rows have contributed to the
    scaling statistics (mild data leakage). Consider fitting on the training
    rows only.
    """
    # Parse once into a local Series instead of overwriting the caller's column.
    timestamps = pd.to_datetime(df['trans_date_trans_time'])

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    features = df.assign(
        transaction_hour=timestamps.dt.hour,
        transaction_day_of_week=timestamps.dt.dayofweek,
    )

    # Convert categorical variables to dummy/indicator variables
    features = pd.get_dummies(features, columns=['category', 'gender'])

    # Drop identifiers, free text and the raw timestamp fields
    features = features.drop(columns=['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant',
                                      'first', 'last', 'street', 'city', 'state', 'zip', 'job', 'dob',
                                      'trans_num', 'unix_time'])

    # Scale numerical features to zero mean / unit variance
    scaler = StandardScaler()
    numerical_cols = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long']
    features[numerical_cols] = scaler.fit_transform(features[numerical_cols])

    return features

In [6]:
# Replace the raw frame with its preprocessed version.
# NOTE(review): this rebinds df1, so the cell is not idempotent -- running it a
# second time passes a frame without 'trans_date_trans_time' to
# preprocess_data and raises KeyError. Re-run the load cell first.
df1 = preprocess_data(df1)
df1.head()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud,transaction_hour,transaction_day_of_week,category_entertainment,...,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M
0,-0.407826,-0.48442,0.65762,-0.282589,-0.494354,0.593864,0,0,1,False,...,False,False,True,False,False,False,False,False,True,False
1,0.230039,2.03912,-2.03387,-0.29367,2.078699,-2.030341,0,0,1,False,...,False,False,False,False,False,False,False,False,True,False
2,0.934149,0.717754,-1.601537,-0.280406,0.902849,-1.592323,0,0,1,True,...,False,False,False,False,False,False,False,False,False,True
3,-0.158132,1.515617,-1.590766,-0.287742,1.662886,-1.621848,0,0,1,False,...,False,False,False,False,False,False,False,False,False,True
4,-0.177094,-0.023035,0.782279,-0.293835,0.026941,0.841909,0,0,1,False,...,False,False,False,True,False,False,False,False,False,True


In [7]:
# Split features and target variable
# Separate the label column from the predictor columns
target_col = 'is_fraud'
y = df1[target_col]
X = df1.drop(columns=[target_col])

In [8]:
# 80/20 hold-out split; stratifying on y keeps the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator is seeded (same value as the train/test split) so that a
# Restart & Run All reproduces the same fitted models and metrics.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [10]:
# Define a parameter grid for each model for hyperparameter tuning
# Hyperparameter search space per model; keys must match those of `models`
param_grids = {
    'Logistic Regression': dict(
        C=[0.1, 1, 10],
    ),
    'Decision Tree': dict(
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
    ),
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
    ),
}

In [None]:
# Tune every candidate model with a 5-fold grid search (optimizing F1), then
# score the winning configuration on the held-out test set.
results = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-score': [], 'ROC-AUC': []}

for name, model in models.items():
    # Use grid search for hyperparameter tuning
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='f1')
    grid_search.fit(X_train, y_train)

    # With the default refit=True, GridSearchCV has already retrained the best
    # configuration on all of X_train, so no second fit is needed (a manual
    # refit would only repeat the work and, for unseeded randomized models,
    # produce a different estimator than the one that was selected).
    best_model = grid_search.best_estimator_

    # Hard class predictions for the threshold-based metrics
    y_pred = best_model.predict(X_test)

    # ROC-AUC is a ranking metric and needs continuous scores, not 0/1 labels:
    # use the positive-class probability, or the decision function for
    # estimators that do not expose predict_proba.
    if hasattr(best_model, 'predict_proba'):
        y_score = best_model.predict_proba(X_test)[:, 1]
    else:
        y_score = best_model.decision_function(X_test)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    # Store results
    results['Model'].append(name)
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1-score'].append(f1)
    results['ROC-AUC'].append(roc_auc)

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame(data=results)
print(results_df)

In [None]:
# Locate the row with the highest F1-score and report that model's name.
# Note: from here on `best_model` holds the name (a string), not an estimator.
top_row = results_df['F1-score'].idxmax()
best_model = results_df.loc[top_row, 'Model']
print(f"\nBest model based on F1-score: {best_model}")