In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

In [3]:
#Load dataset
df = pd.read_csv("F:/fraudTrain.csv")
print(df.head())


   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [4]:
# Basic info
print(df['is_fraud'].value_counts()) 

is_fraud
0    1289169
1       7506
Name: count, dtype: int64


In [5]:
# Drop unnecessary columns (like transaction_id, name, etc.)
df = df.drop(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'merchant', 'job', 'gender'], axis=1)


In [7]:
print(df.nunique().sort_values(ascending=False))


trans_num     1296675
merch_long    1275745
unix_time     1274823
merch_lat     1247805
amt             52928
long              969
lat               968
city_pop          879
category           14
is_fraud            2
dtype: int64


In [8]:
# Drop high-cardinality and irrelevant columns
df = df.drop(['trans_num', 'merch_long', 'unix_time', 'merch_lat'], axis=1)
# One-hot encode 'category' only
df = pd.get_dummies(df, columns=['category'], drop_first=True)


In [9]:
# Feature and label
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)


In [12]:
# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9969691711492856
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.79      0.65      0.72      1520

    accuracy                           1.00    259335
   macro avg       0.90      0.83      0.86    259335
weighted avg       1.00      1.00      1.00    259335

Confusion Matrix:
 [[257559    256]
 [   530    990]]


In [13]:
print(y.value_counts(normalize=True))


is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64


In [14]:
# Save model
with open("fraud_model.pkl", "wb") as f:
    pickle.dump(model, f)