In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the train and test datasets
train_data = pd.read_csv('fraudTrain.csv')
test_data = pd.read_csv('fraudTest.csv')

In [3]:
# Drop irrelevant features (e.g., transaction number, names, addresses)
train_data = train_data.drop(['trans_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'job', 'dob'], axis=1)
test_data = test_data.drop(['trans_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'job', 'dob'], axis=1)

In [4]:
# Handle missing values (if any)
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [5]:
# Ensure consistency of categories between train and test data
for column in ['category', 'merchant', 'gender']:
    train_categories = set(train_data[column].unique())
    test_data[column] = test_data[column].apply(lambda x: x if x in train_categories else 'unknown')

In [6]:
# Encode categorical variables using pandas' factorize method
for column in ['category', 'merchant', 'gender']:
    train_data[column] = pd.factorize(train_data[column])[0]
    test_data[column] = pd.factorize(test_data[column])[0]

In [7]:
# Convert datetime column to numerical representation
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time']).astype(int) / 10**9
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time']).astype(int) / 10**9


In [8]:
# Separate features and target variable for training data
X_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']

In [9]:
# Separate features and target variable for test data
X_test = test_data.drop('is_fraud', axis=1)
y_test = test_data['is_fraud']

In [10]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_classifier.fit(X_train_scaled, y_train)

In [12]:
# Make predictions on the test data
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9948602866657015
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    123833
         1.0       0.01      0.00      0.01       493

    accuracy                           0.99    124326
   macro avg       0.50      0.50      0.50    124326
weighted avg       0.99      0.99      0.99    124326



In [13]:
# Make predictions on the test data
predictions = rf_classifier.predict(X_test_scaled)

# Create a DataFrame to store predictions and actual labels
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Write predictions to a CSV file
results.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv
