In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [2]:

# Load the train and test datasets
train_df = pd.read_csv('fraudTrain.csv')
test_df = pd.read_csv('fraudTest.csv')

# Print the column titles of the train dataset
# print("Column titles of the train dataset:")
# print(train_df.columns)


In [3]:
# Print the column titles of the train dataset
print("Column titles of the train dataset:")
print(train_df.columns)

Column titles of the train dataset:
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [4]:
# Check for missing values in train and test datasets
print("Missing values in train dataset:\n", train_df.isnull().sum())
print("\nMissing values in test dataset:\n", test_df.isnull().sum())



Missing values in train dataset:
 Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

Missing values in test dataset:
 Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
s

In [5]:
# Convert date columns to datetime
train_df['trans_date_trans_time'] = pd.to_datetime(train_df['trans_date_trans_time'])
test_df['trans_date_trans_time'] = pd.to_datetime(test_df['trans_date_trans_time'])

# Extract features from datetime columns
train_df['trans_hour'] = train_df['trans_date_trans_time'].dt.hour
test_df['trans_hour'] = test_df['trans_date_trans_time'].dt.hour

# Optional: Extract additional temporal features if they don't exist already
if 'trans_day_of_week' not in train_df.columns:
    train_df['trans_day_of_week'] = train_df['trans_date_trans_time'].dt.dayofweek
if 'trans_day_of_week' not in test_df.columns:
    test_df['trans_day_of_week'] = test_df['trans_date_trans_time'].dt.dayofweek

if 'trans_day' not in train_df.columns:
    train_df['trans_day'] = train_df['trans_date_trans_time'].dt.day
if 'trans_day' not in test_df.columns:
    test_df['trans_day'] = test_df['trans_date_trans_time'].dt.day

if 'trans_month' not in train_df.columns:
    train_df['trans_month'] = train_df['trans_date_trans_time'].dt.month
if 'trans_month' not in test_df.columns:
    test_df['trans_month'] = test_df['trans_date_trans_time'].dt.month

if 'trans_year' not in train_df.columns:
    train_df['trans_year'] = train_df['trans_date_trans_time'].dt.year
if 'trans_year' not in test_df.columns:
    test_df['trans_year'] = test_df['trans_date_trans_time'].dt.year

# Print the trans_hour of both datasets
print("Train dataset - trans_hour:")
print(train_df['trans_hour'])

print("\nTest dataset - trans_hour:")
print(test_df['trans_hour'])

# Drop irrelevant columns or personally identifiable information (PII)
columns_to_drop = ['Unnamed: 0', 'first', 'last', 'street', 'dob', 'trans_date_trans_time', 'cc_num', 'unix_time']
existing_columns_to_drop = [col for col in columns_to_drop if col in train_df.columns]

train_df = train_df.drop(columns=existing_columns_to_drop)
test_df = test_df.drop(columns=existing_columns_to_drop)

# Define features and target variable
X_train = train_df.drop(columns=['is_fraud'])
y_train = train_df['is_fraud']

X_test = test_df.drop(columns=['is_fraud'])
y_test = test_df['is_fraud']

# Preprocessing for numerical and categorical data
numeric_features = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'trans_hour', 'trans_day_of_week', 'trans_day', 'trans_month', 'trans_year']
categorical_features = ['merchant', 'category', 'city', 'state', 'zip', 'job', 'gender']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Define the preprocessing pipeline
preprocessor_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply the preprocessing pipeline to the train and test data
X_train_preprocessed = preprocessor_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessor_pipeline.transform(X_test)

print("Preprocessed Train Data:")
print(X_train_preprocessed)

print("Preprocessed Test Data:")
print(X_test_preprocessed)

Train dataset - trans_hour:
0           0
1           0
2           0
3           0
4           0
           ..
1296670    12
1296671    12
1296672    12
1296673    12
1296674    12
Name: trans_hour, Length: 1296675, dtype: int32

Test dataset - trans_hour:
0         12
1         12
2         12
3         12
4         12
          ..
555714    23
555715    23
555716    23
555717    23
555718    23
Name: trans_hour, Length: 555719, dtype: int32
Preprocessed Train Data:
  (0, 0)	-0.4078260743571717
  (0, 1)	-0.48441988949552045
  (0, 2)	0.6576196137901902
  (0, 3)	-0.28258875975169373
  (0, 4)	-0.49435433250750654
  (0, 5)	0.5938638566063196
  (0, 6)	-1.8781451290492637
  (0, 7)	-0.9419748658130739
  (0, 8)	-1.6522577406526238
  (0, 9)	-1.504563517805003
  (0, 10)	-0.634064798227529
  (0, 525)	1.0
  (0, 712)	1.0
  (0, 1244)	1.0
  (0, 1639)	1.0
  (0, 1928)	1.0
  (0, 3003)	1.0
  (0, 3127)	1.0
  (1, 0)	0.23003923433445328
  (1, 1)	2.0391199748640023
  (1, 2)	-2.0338701150480003
  (1, 3)	-0.

In [11]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_preprocessed, y_train)

In [12]:

# Predicting fraud on test data
log_reg_preds = log_reg.predict(X_test_preprocessed)

In [13]:
# Print predictions
print("Logistic Regression Predictions:", log_reg_preds)

Logistic Regression Predictions: [0 0 0 ... 0 0 0]


In [14]:
# Evaluate the models
print("Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, log_reg_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, log_reg_preds))
print("Classification Report:\n", classification_report(y_test, log_reg_preds))

Logistic Regression Performance:
Accuracy: 0.9957586478058156
Confusion Matrix:
 [[553357    217]
 [  2140      5]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.02      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.51      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [44]:
# Print predictions
print("Logistic Regression Predictions:", log_reg_preds)

Logistic Regression Predictions: [0 0 0 ... 0 0 0]
