In [3]:


# Import dependencies
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [10]:
#Save the processed data to an SQL database:
import sqlite3

# Create a connection to the database
conn = sqlite3.connect("fraud_detection.db")

# Save the processed data to the database
X_train_df = pd.DataFrame(X_train, columns=X.columns)
y_train_df = pd.DataFrame(y_train, columns=["isFraud"])
X_test_df = pd.DataFrame(X_test, columns=X.columns)
y_test_df = pd.DataFrame(y_test, columns=["isFraud"])

X_train_df.to_sql("X_train", conn, if_exists="replace", index=False)
y_train_df.to_sql("y_train", conn, if_exists="replace", index=False)
X_test_df.to_sql("X_test", conn, if_exists="replace", index=False)
y_test_df.to_sql("y_test", conn, if_exists="replace", index=False)

# Close the connection
conn.close()

In [11]:
# Train and evaluate the RandomForest and XGBoost models:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Train the RandomForest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [12]:
# Evaluate the models
y_pred_rf = rf_model.predict(X_test)

In [13]:
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_pred_rf))

Random Forest:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113866
           1       0.94      0.47      0.63      4242

    accuracy                           0.98    118108
   macro avg       0.96      0.74      0.81    118108
weighted avg       0.98      0.98      0.98    118108

Accuracy: 0.9800352220002032
ROC AUC: 0.7362497890665184


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Define the parameter grid for RandomForest
param_grid_rf = {
    'n_estimators': [100, 200],  # Reduced from 3 to 2 options
    'max_depth': [None, 10],  # Reduced from 3 to 2 options
    'min_samples_split': [2, 5],  # Reduced from 3 to 2 options
    'min_samples_leaf': [1, 2]  # Reduced from 3 to 2 options
}

In [16]:
# Create the RandomForest model
rf_model = RandomForestClassifier(random_state=42)

In [17]:
# Instantiate GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, scoring='f1', n_jobs=-1, cv=3, verbose=2)

In [None]:
#Fit the GridSearchCV
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [None]:
# Print the best parameters and the corresponding F1 score
print("Random Forest - Best parameters:", grid_search_rf.best_params_)
print("Random Forest - Best F1 score:", grid_search_rf.best_score_)

In [None]:
# Load cleaned data
clean_train_data = pd.read_csv("Resources/clean_train_data.csv")
clean_test_data = pd.read_csv("Resources/clean_test_data.csv")

In [None]:
X_train = clean_train_data.drop("isFraud", axis=1)
y_train = clean_train_data["isFraud"]
X_test =clean_test_data


In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode_data(data):
    for column in data.columns:
        if data[column].dtype == "object":
            le = LabelEncoder()
            le.fit(data[column].astype(str))
            data[column] = le.transform(data[column].astype(str))
    return data

X_train = label_encode_data(X_train)
X_test = label_encode_data(X_test)

In [None]:
import sqlite3

conn = sqlite3.connect("fraud_detection.db")

X_train.to_sql("X_train", conn, if_exists="replace", index=False)
y_train.to_sql("y_train", conn, if_exists="replace", index=False)
X_test.to_sql("X_test", conn, if_exists="replace", index=False)

conn.close()

In [None]:
#Split your training data into training and validation sets:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, ran

In [None]:
# Train RandomForest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Make predictions
y_train_pred_rf = rf.predict(X_train)
y_val_pred_rf = rf.predict(X_val)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train RandomForest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


In [None]:
y_pred_rf = rf.predict(X_test)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Train XGBoost
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)

In [None]:
# Make predictions

y_pred_xgb = xgb.predict(X_test)

In [None]:
from sqlalchemy import create_engine
# Create an SQLite database
engine = create_engine('postgresql://postgres:3720@localhost:5432/Fraud-detection')
# Connect to the SQLite database
conn = sqlite3.connect("fraud_detection.db")

In [None]:
# Load the data from the SQL database
X_train = pd.read_sql('SELECT * FROM "X_train"', engine)
X_val = pd.read_sql('SELECT * FROM "X_val"', engine)
y_train = pd.read_sql('SELECT * FROM "y_train"', engine)
y_val = pd.read_sql('SELECT * FROM "y_val"', engine)
test_data = pd.read_sql('SELECT * FROM "test_data"', engine)

In [None]:
# Choose a model
model = RandomForestClassifier(random_state=42)

In [None]:
# Train the model
model.fit(X_train, y_train.values.ravel())


In [None]:
#  Validate the model
y_val_pred = model.predict(X_val)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

In [None]:
# Tune the model
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20]
}

In [None]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train.values.ravel())

best_model = grid_search.best_estimator_


In [None]:
# Test the model
test_features = test_data.drop(columns=['TransactionID'])
test_pred = best_model.predict(test_features)


In [None]:
# Save the predictions to a CSV file
test_predictions = pd.DataFrame({'TransactionID': test_data['TransactionID'], 'isFraud': test_pred})
test_predictions.to_csv("Resources/test_predictions.csv", index=False)

In [None]:
 # Import necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import sqlite3

In [None]:
from sqlalchemy import create_engine
# Create an SQLite database
engine = create_engine('postgresql://postgres:3720@localhost:5432/Fraud-detection')
# Connect to the SQLite database
conn = sqlite3.connect("fraud_detection.db")

In [None]:
# Load the cleaned data from the SQLite database
clean_train_data = pd.read_sql('SELECT * FROM clean_train_data', engine)
clean_test_data = pd.read_sql('SELECT * FROM clean_test_data', engine)

In [None]:
# Replace 'Category1', 'Category2', and 'Category3' with the actual categorical column names
categorical_columns = ['Category1', 'Category2', 'Category3']

le = LabelEncoder()

for column in categorical_columns:
    clean_train_data[column] = le.fit_transform(clean_train_data[column])
    clean_test_data[column] = le.transform(clean_test_data[column])

In [None]:
# Separate the target variable (isFraud) from the features
X_train = clean_train_data.drop("isFraud", axis=1)
y_train = clean_train_data["isFraud"]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace 'Category1', 'Category2', and 'Category3' with the actual categorical column names
categorical_columns = ['Category1', 'Category2', 'Category3']

le = LabelEncoder()

for column in categorical_columns:
    clean_train_data[column] = le.fit_transform(clean_train_data[column])
    clean_test_data[column] = le.transform(clean_test_data[column])

In [None]:
# Apply feature scaling to normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42)


In [None]:
# Initialize the model
model = XGBClassifier(random_state=42)

In [None]:
# Train the model on the training set
model.fit(X_train_split, y_train_split)



In [None]:
# Evaluate the model on the validation set
y_val_preds = model.predict(X_val_split)
print(classification_report(y_val_split, y_val_preds))
print("ROC AUC score: ", roc_auc_score(y_val_split, y_val_preds))
print("Accuracy score: ", accuracy_score(y_val_split, y_val_preds))

In [None]:
# Load the cleaned test data from the SQLite database
clean_test_data = pd.read_sql('SELECT * FROM clean_test_data', engine)

# Apply feature scaling to the test set
X_test = clean_test_data.drop("TransactionID", axis=1)
X_test_scaled = scaler.transform(X_test)

# Train the model on the full training set
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
test_preds = model.predict_proba(X_test_scaled)[:, 1]

# Create a DataFrame with the predicted probabilities
submission = pd.DataFrame({"TransactionID": clean_test_data["TransactionID"], "isFraud": test_preds})

In [None]:
# Load cleaned data
clean_train_data = pd.read_csv("Resources/clean_train_data.csv")
clean_test_data = pd.read_csv("Resources/clean_test_data.csv")

In [None]:
# Apply label encoding to the categorical columns
le = LabelEncoder()
for col in clean_train_data.select_dtypes(include='object').columns:
    clean_train_data[col] = le.fit_transform(clean_train_data[col].astype(str))
    # Apply label encoding to the categorical columns
for col in clean_test_data.select_dtypes(include='object').columns:
    clean_test_data[col] = le.fit_transform(clean_test_data[col].astype(str))

In [None]:
# Convert feature names to strings if needed
clean_train_data.columns = clean_train_data.columns.astype(str)
# Convert feature names to strings if needed
clean_test_data.columns = clean_test_data.columns.astype(str)

In [None]:
# Separate features and target
X_train = clean_train_data.drop(columns=['isFraud'])
y_train = clean_train_data['isFraud']

In [None]:
# Load cleaned test data
clean_test_data = pd.read_csv("Resources/clean_test_data.csv")

# Apply label encoding to the categorical columns
for col in clean_test_data.select_dtypes(include='object').columns:
    clean_test_data[col] = le.fit_transform(clean_test_data[col].astype(str))

# Convert feature names to strings if needed
clean_test_data.columns = clean_test_data.columns.astype(str)

# Try to separate features and target in test data
try:
    X_test = clean_test_data.drop(columns=['isFraud'])
    y_test = clean_test_data['isFraud']
except KeyError:
    # Handle case where 'isFraud' column is missing from test data
    print("Warning: 'isFraud' column not found in test data")
    X_test = clean_test_data
    y_test = None

In [None]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Apply the scaler to both train and test data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test data if 'isFraud' column exists
if y_test is not None:
    y_pred = model.predict(X_test)

    # Evaluate the model
    print("Accuracy:", accuracy_score(y_test, y_pred))
else:
    print("Warning: no 'isFraud' column found in test data, cannot evaluate model")

In [None]:
# Combine train and test data
combined_data = pd.concat([clean_train_data, clean_test_data], axis=0)

In [None]:
# Apply label encoding to the categorical columns
le = LabelEncoder()
for col in combined_data.select_dtypes(include='object').columns:
    combined_data[col] = le.fit_transform(combined_data[col].astype(str))

In [None]:
# Convert feature names to strings if needed
combined_data.columns = combined_data.columns.astype(str)

In [None]:
# Separate features and target
X = combined_data.drop(columns=['isFraud'])
y = combined_data['isFraud']


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Check for NaN and infinity values in X_train data
print(X_train.isna().sum())
print(X_train.isin([np.nan, np.inf, -np.inf]).sum())



In [None]:
# Fill NaN and infinity values in X_train with a suitable value (if any)
X_train = X_train.fillna(0)
X_train = X_train.replace([np.inf, -np.inf], 0)



In [None]:
# Change the data type to dtype('float64')
# X_train = X_train.astype('float64')

In [None]:
# Check for NaN and infinity values in X_test data
print(X_test.isna().sum())
print(X_test.isin([np.nan, np.inf, -np.inf]).sum())

In [None]:
# Fill NaN and infinity values in X_train with a suitable value (if any)
X_test = X_test.fillna(0)
X_test= X_test.replace([np.inf, -np.inf], 0)

In [None]:
# X_test = X_test.astype('float64')

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Apply the scaler to both train and test data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Convert the NumPy array to a Pandas DataFrame
# Convert the numpy.ndarray back to a pandas DataFrame
X_train = pd.DataFrame(X_train, columns=X.columns)

X_test = pd.DataFrame(X_test, columns=X.columns)

In [None]:
# Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Connect to the SQL database



In [None]:
# Write the DataFrames to a SQL table
train_table_name = 'train_table'
test_table_name = 'test_table'
X_train.to_sql(train_table_name, engine, if_exists='replace')
X_test.to_sql(test_table_name, engine, if_exists='replace')

In [None]:
# Save target labels for train and test sets
y_train.to_frame('isFraud').to_sql('y_train_table', engine, if_exists='replace')
y_test.to_frame('isFraud').to_sql('y_test_table', engine, if_exists='replace')


## Random forest

In [None]:
# Train an ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=1, n_estimators=50).fit(X_train, y_train)
print("\nExtremely Random Trees Classifier:")
print(f'Training Score: {etc.score(X_train_scaled, y_train)}')
print(f'Testing Score: {etc.score(X_test_scaled, y_test)}')
y_pred_etc = etc.predict(X_test_scaled)
print(classification_report(y_test, y_pred_etc))

In [None]:
# Calculate accuracy
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")


In [None]:
# Separate features and target in the train and test data
X_train = clean_train_data.drop(columns=['isFraud'])
y_train = clean_train_data['isFraud']

In [None]:
X_test = clean_test_data.drop(columns=['TransactionID']) 

In [None]:
# Apply label encoding to the categorical columns
le = LabelEncoder()
for col in X_train.select_dtypes(include='object').columns:
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))


In [None]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")


In [None]:
# Write x_train and x_test  DataFrames to a SQL table
train_table_name = 'train_table'
test_table_name = 'test_table'
X_train.to_sql(train_table_name, engine, if_exists='replace')
X_test.to_sql(test_table_name, engine, if_exists='replace')

In [None]:
# Write the y_train and y_test DataFrames to a SQL table
y_train_table_name = 'y_train_table'
y_test_table_name = 'y_test_table'

In [None]:
y_train.to_sql(y_train_table_name, engine, if_exists='replace')
y_test.to_sql(y_test_table_name, engine, if_exists='replace')

In [None]:
# Connect to the SQL database
engine = create_engine('postgresql://username:password@host:port/database_name')
metadata = MetaData()
metadata.create_all(engine)


#### Step 2: Separate features and target in the train data

In [None]:
# Separate features and target in the train data
X_train = clean_train_data.drop(columns=['isFraud', 'TransactionID'])
y_train = clean_train_data['isFraud']

In [None]:
# Separate features and target in the test data
X_test = clean_test_data.drop(columns=['isFraud', 'TransactionID'])
y_test = clean_test_data['isFraud']

## 5) Label encoding 

In [None]:
def label_encode(df):
    le = LabelEncoder()
    df = df.apply(lambda col: le.fit_transform(col.astype(str)) if col.dtype == 'object' else col, axis=0)
    return df

# Apply label encoding to the train and test data
X_train = label_encode(X_train)
X_test = label_encode(X_test)

#### Step 3: Split the data into training and testing sets

In [None]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [None]:
# Print Shapes
print("Train Dataset shape: ", X.shape)
print("Test Dataset shape: ", y.shape)

In [None]:
# Create a function to apply label encoding to all categorical columns
def label_encode(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            label_encoder = LabelEncoder()
            df[column] = label_encoder.fit_transform(df[column])
    return df

# Apply label encoding to the train and test data
X_train = label_encode(X_train)
X_test = label_encode(X_test)


## 6) Standardize the data

In [None]:
# Standardize the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 7) Connect to SQL database

In [None]:
#Connect to SQL database using a Python SQL library  SQLAlchemy.



In [None]:
# Write the DataFrames to a SQL table
train_table_name = 'train_table'
test_table_name = 'test_table'
X_train.to_sql(train_table_name, engine, if_exists='replace')
X_test.to_sql(test_table_name, engine, if_exists='replace')

##### Step 1:Convert the NumPy array to a Pandas DataFrame

In [None]:
import pandas as pd

# Convert the NumPy array to a Pandas DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Write the DataFrame to a SQL table
train_table_name = 'train_table'
X_train_scaled_df.to_sql(train_table_name, engine, if_exists='replace')

##### Step 2:write test scaled data to sql

In [None]:
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Write the DataFrame to a SQL table
test_table_name = 'test_table'
X_test_scaled_df.to_sql(test_table_name, engine, if_exists='replace')