In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Load the training dataset from the Kaggle input directory
dataset_path = '/kaggle/input/fraud-detection/fraudTrain.csv'
fraud_data = pd.read_csv(dataset_path)

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
fraud_data.info()

# Summary statistics for numerical columns
print(fraud_data.describe())

# Number of missing values per column
print(fraud_data.isnull().sum())

In [None]:
import seaborn as sns


# Explore categorical variables
categorical_columns = ['merchant', 'category', 'gender', 'city']

# Upper bound on the number of bars drawn per chart. Drawing one bar per
# distinct value makes the x axis unreadable (and the plot slow) for columns
# with many distinct values -- presumably 'merchant' and 'city' here; confirm
# with fraud_data[column].nunique().
MAX_CATEGORIES_SHOWN = 20

for column in categorical_columns:
    # value_counts() is sorted by descending frequency, so the leading entries
    # are the most common values of the column.
    value_counts = fraud_data[column].value_counts()
    truncated = len(value_counts) > MAX_CATEGORIES_SHOWN

    if truncated:
        # Keep only rows belonging to the most frequent values and fix the bar
        # order so the chart reads from most to least common.
        shown_values = list(value_counts.index[:MAX_CATEGORIES_SHOWN])
        plot_data = fraud_data[fraud_data[column].isin(shown_values)]
        title = (
            f'Distribution of {column} '
            f'(top {MAX_CATEGORIES_SHOWN} of {len(value_counts)})'
        )
    else:
        # Few enough values: plot the column unchanged.
        shown_values = None
        plot_data = fraud_data
        title = f'Distribution of {column}'

    plt.figure(figsize=(8, 4))
    sns.countplot(data=plot_data, x=column, order=shown_values, palette='Set3')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

# Parse the transaction timestamp and the date-of-birth columns into datetimes
for datetime_column in ('trans_date_trans_time', 'dob'):
    fraud_data[datetime_column] = pd.to_datetime(fraud_data[datetime_column])

# Time-based analysis: histogram of the hour component of each transaction
# timestamp (requires 'trans_date_trans_time' to already be a datetime column).
fig, ax = plt.subplots(figsize=(12, 5))
transaction_hours = fraud_data['trans_date_trans_time'].dt.hour
transaction_hours.plot(kind='hist', bins=24, rwidth=0.9, color='skyblue', ax=ax)
ax.set_title('Hourly Transaction Distribution')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Frequency')
plt.show()

# Bar chart of how many rows carry each value of the 'is_fraud' label
fig, ax = plt.subplots(figsize=(6, 4))
class_counts = fraud_data['is_fraud'].value_counts()
class_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
ax.set_title('Class Distribution (0: Non-Fraud, 1: Fraud)')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
# Keep the class labels horizontal instead of the default rotated bar labels
ax.tick_params(axis='x', labelrotation=0)
plt.show()

# Geospatial view: one point per transaction at the merchant's coordinates,
# coloured by the 'is_fraud' label.
fig, ax = plt.subplots(figsize=(10, 8))
merchant_points = ax.scatter(
    fraud_data['merch_long'],
    fraud_data['merch_lat'],
    c=fraud_data['is_fraud'],
    cmap='coolwarm',
    alpha=0.5,
)
ax.set_title('Geospatial Distribution of Transactions (Fraud vs. Non-Fraud)')
ax.set_xlabel('Merchant Longitude')
ax.set_ylabel('Merchant Latitude')
fig.colorbar(merchant_points, ax=ax, label='0: Non-Fraud, 1: Fraud')
plt.show()

In [None]:
# Feature 1: fractional part of the transaction amount (amt modulo 1)
fraud_data['amt_decimal'] = fraud_data['amt'].mod(1)

# Store the timestamp column as text so its date portion can be split off
train_timestamp_text = fraud_data['trans_date_trans_time'].astype(str)
fraud_data['trans_date_trans_time'] = train_timestamp_text

# Feature 2: cardholder age at the time of the transaction.
# The date is everything before the first space of the timestamp text.
train_date_text = train_timestamp_text.str.split(' ').str.get(0)
fraud_data['transaction_date'] = pd.to_datetime(train_date_text)
train_birth_dates = pd.to_datetime(fraud_data['dob'])
train_age_in_days = (fraud_data['transaction_date'] - train_birth_dates).dt.days
# NOTE: floor division by 365 ignores leap days, so this is an approximate
# age in whole years rather than an exact calendar age.
fraud_data['cardholder_age'] = train_age_in_days.floordiv(365)

# Feature 3: transaction amount relative to the population of the city
fraud_data['amt_to_city_pop_ratio'] = fraud_data['amt'].div(fraud_data['city_pop'])

# Preview the three engineered columns
train_feature_columns = ['amt_decimal', 'cardholder_age', 'amt_to_city_pop_ratio']
print(fraud_data[train_feature_columns].head())

In [None]:
# Columns removed before modelling. The same list is reused for the test frame.
columns_to_drop = [
    # Row index / identifier-like columns
    'Unnamed: 0',
    'cc_num',
    # Timestamp text and the helper date derived from it during feature
    # engineering (cardholder_age has already been computed from them)
    'trans_date_trans_time',
    'transaction_date',
    # Cardholder name and address fields
    'first',
    'last',
    'street',
    'city',
    'state',
    'zip',
    # Date of birth: only needed to derive cardholder_age
    'dob',
    # Per-transaction identifier
    'trans_num',
]

# Remove the listed columns from the training frame
fraud_data = fraud_data.drop(columns_to_drop, axis='columns')

# Preview the remaining columns
print(fraud_data.head())


In [None]:
# Read the held-out dataset and discard any rows that contain missing values
testdataset_path = '/kaggle/input/fraud-detection/fraudTest.csv'
testing_data = pd.read_csv(testdataset_path).dropna()

In [None]:
# Feature 1: fractional part of the transaction amount (amt modulo 1)
testing_data['amt_decimal'] = testing_data['amt'].mod(1)

# Store the timestamp column as text so its date portion can be split off
test_timestamp_text = testing_data['trans_date_trans_time'].astype(str)
testing_data['trans_date_trans_time'] = test_timestamp_text

# Feature 2: cardholder age at the time of the transaction.
# The date is everything before the first space of the timestamp text.
test_date_text = test_timestamp_text.str.split(' ').str.get(0)
testing_data['transaction_date'] = pd.to_datetime(test_date_text)
test_birth_dates = pd.to_datetime(testing_data['dob'])
test_age_in_days = (testing_data['transaction_date'] - test_birth_dates).dt.days
# NOTE: floor division by 365 ignores leap days, so this is an approximate
# age in whole years rather than an exact calendar age.
testing_data['cardholder_age'] = test_age_in_days.floordiv(365)

# Feature 3: transaction amount relative to the population of the city
testing_data['amt_to_city_pop_ratio'] = testing_data['amt'].div(testing_data['city_pop'])

# Preview the three engineered columns
test_feature_columns = ['amt_decimal', 'cardholder_age', 'amt_to_city_pop_ratio']
print(testing_data[test_feature_columns].head())

In [None]:
# Remove from the test frame the same columns that were removed from the training frame
testing_data = testing_data.drop(columns_to_drop, axis='columns')

In [None]:
# Handle missing values (if any)
fraud_data = fraud_data.dropna()

# Encode categorical variables using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Code given to test-set values that never occur in the training data.
UNSEEN_CATEGORY_CODE = -1

categorical_columns = ['gender', 'merchant', 'category', 'job']
for col in categorical_columns:
    # Learn the vocabulary from the training data only.
    fraud_data[col] = label_encoder.fit_transform(fraud_data[col])

    # Re-use the training vocabulary for the test data. Fitting a second
    # encoder on the test frame would number categories by the test set's own
    # sorted vocabulary, so the same string could receive different integers
    # in train and test and the model would see scrambled features.
    code_by_label = {
        label: code for code, label in enumerate(label_encoder.classes_)
    }
    testing_data[col] = (
        testing_data[col]
        .map(code_by_label)            # values absent from training become NaN
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype(int)
    )

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the training frame into the label vector (y) and the predictors (X)
y_train = fraud_data['is_fraud']
X_train = fraud_data.drop('is_fraud', axis='columns')

In [None]:
# Separate the held-out frame into the label vector (y_val) and the predictors (X_val)
y_val = testing_data['is_fraud']
X_val = testing_data.drop('is_fraud', axis='columns')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression preceded by feature standardisation.
# The raw feature columns are on very different scales, which hampers solver
# convergence and lets large-magnitude columns dominate the regularised fit.
# Putting the scaler inside the pipeline means it is fitted on the training
# data only and is applied automatically whenever `model.predict` or
# `model.predict_proba` is called on new data.
# The fitted LogisticRegression itself is available as `model[-1]`.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),  # extra headroom over the default iteration cap
)

# Train the model on the training data
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hard class predictions on the validation data (used by the threshold-based metrics)
y_pred = model.predict(X_val)

# Continuous scores for the ranking-based metric: second column of
# predict_proba, i.e. the probability of model.classes_[1]
# (assumed to be the fraud label 1 -- confirm via model.classes_).
y_score = model.predict_proba(X_val)[:, 1]

# Evaluate the model's performance on the validation data.
# zero_division=0 keeps the metric at 0 without emitting a warning when the
# model predicts no positive samples at all.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, zero_division=0)
recall = recall_score(y_val, y_pred, zero_division=0)
f1 = f1_score(y_val, y_pred, zero_division=0)
# ROC AUC measures ranking quality, so it must be computed from scores;
# feeding it 0/1 predictions evaluates a single threshold only.
roc_auc = roc_auc_score(y_val, y_score)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC Score: {roc_auc}")