In [10]:
import pandas as pd
import re

train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10966 entries, 0 to 10965
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TransactionNumber    10966 non-null  int64  
 1   UserID               10966 non-null  int64  
 2   Age                  10966 non-null  int64  
 3   Gender               10966 non-null  object 
 4   Occupation           10966 non-null  object 
 5   EducationLevel       10966 non-null  object 
 6   MaritalStatus        10966 non-null  object 
 7   NumDependents        10966 non-null  int64  
 8   Income               10966 non-null  object 
 9   Expenditure          10966 non-null  object 
 10  GiftsTransaction     10966 non-null  object 
 11  TransactionDate      10966 non-null  object 
 12  TransactionTime      10966 non-null  object 
 13  TransactionAmount    10966 non-null  object 
 14  MerchantID           10966 non-null  object 
 15  TransactionType      10966 non-null 

In [11]:
train_data['TransactionDate']

0        2023-03-12
1        2023-03-05
2        2023-11-10
3        2023-10-07
4        2023-09-22
            ...    
10961    2023-06-04
10962    2023-05-24
10963    2023-12-26
10964    2023-07-16
10965    2023-12-16
Name: TransactionDate, Length: 10966, dtype: object

# Cleaning

In [12]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy='majority')
X_rus, y_rus = rus.fit_resample(train_data.drop(columns=['IsFraud']), train_data['IsFraud'])

train_df = pd.concat([X_rus, y_rus], axis=1)
train_df.reset_index(drop=True, inplace=True)

In [13]:
gender_mapping = {
    'fem': 'Female',
    'Female': 'Female',
    'she': 'Female',
    'woman': 'Female',
    'he': 'Male',
    'man': 'Male',
    'Male': 'Male',
    'isnotfemale': 'Male',
    'isnotmale': 'Female',
}


location_mapping = {
    'Adelaide': 'Adelaide',
    'adl': 'Adelaide',
    'Adelaide City': 'Adelaide',
    'Adl': 'Adelaide',
    'Bne': 'Brisbane',
    'BNE': 'Brisbane',
    'brisbane': 'Brisbane',
    'Brisbane': 'Brisbane',
    'canberra': 'Canberra',
    'Canberra': 'Canberra',
    'CBR': 'Canberra',
    'Cbr': 'Canberra',
    'c': 'Canberra',
    'darwin': 'Darwin',
    'Darwin': 'Darwin',
    'Drw': 'Darwin',
    'DRW': 'Darwin',
    'Hbt': 'Hobart',
    'HBT': 'Hobart',
    'hobart': 'Hobart',
    'Hobart': 'Hobart',
    'Mel': 'Melbourne',
    'melb': 'Melbourne',
    'Melb': 'Melbourne',
    'melbourne': 'Melbourne',
    'Melbourne': 'Melbourne',
    'Melburne': 'Melbourne',
    'Melburn': 'Melbourne',
    'MLB': 'Melbourne',
    'perth': 'Perth',
    'Perth': 'Perth',
    'PTH': 'Perth',
    'Pth': 'Perth',
    'Syd': 'Sydney',
    'Sydney': 'Sydney',
    'SYD': 'Sydney',
    'sydney': 'Sydney'
}


mobile_mapping = {
    'android': 'Mobile',
    'galaxys7': 'Mobile',
    'iphone 15': 'Mobile',
    'mob': 'Mobile',
    'smartphone': 'Mobile',
    'Mobile': 'Mobile',
}

In [14]:
def age_correction(age):
    if age < 0:
        age = abs(age)
    if age % 1000 == 0:
        age = age/1000

    return age


def standard_time(time_str):
    try:
        if 'AM' in time_str or 'PM' in time_str:
            return pd.to_datetime(time_str, format='%I:%M:%S %p').strftime('%H:%M:%S')
        
        return pd.to_datetime(time_str, format='%H:%M:%S').strftime('%H:%M:%S')
    
    except (ValueError, TypeError):
        try:
            parts = time_str.split('/')
            corrected_time = f'{parts[0]}:{parts[1]}:{parts[2]}'
            return pd.to_datetime(corrected_time, format='%H:%M:%S').strftime('%H:%M:%S')
        except Exception:
            return 'NOPES'


def categorize_time(hour):
    if 0 <= hour < 6:
        return 'Night'
    elif 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    else:
        return 'Evening'
    

def get_currency_and_symbol(val):
    if 'AUD' in val or 'AU' in val:
        # create a feature for aud
        return re.sub(r'[^\d.]', '', val), 'AUD' 
    elif 'AED' in val:
        value = re.sub(r'[^\d.]', '', val)
        # convert AED to AUD
        return float(value) * 0.35, 'AED'
    elif '£' in val:
        value = re.sub(r'[^\d.]', '', val)
        # convert pounds to AUD
        return float(value) * 1.8, 'GBP'
    elif 'GBP' in val:
        value = re.sub(r'[^\d.]', '', val)
        # convert pounds to AUD
        return float(value) * 1.8, 'GBP'
    else:
        return "Unidentified", "Unknown"


In [15]:
def Cleaning(df):
    
    df['Age'] = df['Age'].apply(age_correction)


    df['TransactionTime'] = df['TransactionTime'].apply(standard_time)
    df['TransactionTime'] = pd.to_datetime(df['TransactionTime'], format='%H:%M:%S').dt.hour
    df['TransactionTime'] = df['TransactionTime'].apply(categorize_time)


    df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], format='%Y-%m-%d')
    df['Date'] = df['TransactionDate'].dt.day   
    df['Month'] = df['TransactionDate'].dt.month  
    df['Weekday'] = df['TransactionDate'].dt.day_name()


    df['Income'], df['Income_Currency'] = zip(*df['Income'].apply(get_currency_and_symbol))
    df['Income'] = pd.to_numeric(df['Income'], errors='coerce')  


    df['Expenditure'], df['Expenditure_Currency'] = zip(*df['Expenditure'].apply(get_currency_and_symbol))
    df['Expenditure'] = pd.to_numeric(df['Expenditure'], errors='coerce')

    df['GiftsTransaction'], df['GiftsTransaction_Currency'] = zip(*df['GiftsTransaction'].apply(get_currency_and_symbol))
    df['GiftsTransaction'] = pd.to_numeric(df['GiftsTransaction'], errors='coerce')

    df['TransactionAmount'], df['TransactionAmount_Currency'] = zip(*df['TransactionAmount'].apply(get_currency_and_symbol))
    df['TransactionAmount'] = pd.to_numeric(df['TransactionAmount'], errors='coerce') 


    # Keep only the domain after @ for EmailDomain
    df['EmailDomain'] = df['EmailDomain'].apply(lambda x: x.split('@')[1])


    df['Gender'] = df['Gender'].map(gender_mapping).fillna('Other')
    df['TransactionLocation'] = df['TransactionLocation'].replace(location_mapping)
    df['DeviceType'] = df['DeviceType'].replace(mobile_mapping)

    return df

In [16]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

nominal_columns = ["Weekday", "MaritalStatus", "TransactionType", "TransactionLocation", "Income_Currency", "Expenditure_Currency", 
                    "GiftsTransaction_Currency", "TransactionAmount_Currency", "TransactionTime","DeviceType", "MerchantID", "EmailDomain"]

occupation_order = [
    "Student",
    "Unemployed",
   "Retired", 
    "Professional",
]

education_order = [
    "High School","Bachelor","Master","PhD"
]


def Encoding(df, drop_id = False):
    df = binary_encode(df, "Gender", "Male")
    df = ordinal_encode(df, "Occupation", occupation_order)
    df = ordinal_encode(df, "EducationLevel", education_order)
    df = binary_encode(df, "Terrorism", True)

    # One hot encoding for the rest
    df = pd.get_dummies(df, columns=nominal_columns, prefix=nominal_columns, dtype=int)

    # Keep only relevant columns after cleaning
    if drop_id:
        columns_to_drop = ["TransactionNumber", "UserID", "TransactionDate"]
    else:
        columns_to_drop = ["UserID", "TransactionDate"]
    
    df = df.drop(columns=columns_to_drop)
    
    return df
    

In [17]:
def PreProcess(df, drop_id=False):
    df_cleaned = Cleaning(df)
    df_encoded = Encoding(df_cleaned, drop_id)
    return df_encoded

train_df_cleaned = PreProcess(train_df, drop_id=True)
test_df_cleaned = PreProcess(test_data)

In [18]:
train_df_cleaned

Unnamed: 0,Age,Gender,Occupation,EducationLevel,NumDependents,Income,Expenditure,GiftsTransaction,TransactionAmount,Latitude,...,MerchantID_M005,MerchantID_M006,MerchantID_M007,MerchantID_M008,EmailDomain_disposable.com,EmailDomain_gmail.com,EmailDomain_outlook.com,EmailDomain_securemail.com,EmailDomain_tempmail.com,EmailDomain_yahoo.com
0,48.0,1,0,1,0,58506.34,30412.630,4254.498,84.4900,-35.473469,...,0,0,0,0,0,1,0,0,0,0
1,31.0,0,3,0,3,45769.93,26967.180,8353.188,194.2900,-35.473469,...,0,0,0,0,0,1,0,0,0,0
2,30.0,1,3,0,2,44592.66,21083.310,3523.770,9.7700,-35.473469,...,0,0,0,0,0,0,0,0,0,1
3,23.0,0,0,1,1,69813.83,23085.530,4322.718,99.7300,-37.020100,...,0,0,0,0,0,0,0,0,1,0
4,44.0,0,2,0,0,71354.33,38010.740,1068.678,159.2900,-19.491411,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7985,30.0,1,3,1,1,143566.54,90449.002,29834.800,99.2915,-37.020100,...,0,0,0,0,0,0,1,0,0,0
7986,35.0,1,3,2,0,29584.49,10238.380,7015.860,747.1700,-37.020100,...,1,0,0,0,0,1,0,0,0,0
7987,18.0,0,3,0,2,80403.31,63429.080,344.178,137.5000,-37.020100,...,0,0,0,0,0,0,0,0,0,1
7988,29.0,0,0,0,4,28654.66,9748.530,2122.938,68.0500,-37.020100,...,0,0,0,0,1,0,0,0,0,0


# Feature Selection

In [19]:
# from sklearn.ensemble import RandomForestClassifier

# # Initialize and train the Random Forest Classifier
# y = train_df_cleaned['IsFraud']
# X = train_df_cleaned.drop('IsFraud', axis=1)

# rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=42)
# rf_classifier.fit(X, y)

# # Get feature importances
# feature_importances = rf_classifier.feature_importances_

# # Create a DataFrame for better visualization
# feature_importances_df = pd.DataFrame({
#     'feature': X.columns,
#     'importance': feature_importances
# })

# # Sort the features by importance
# feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# # Display the sorted feature importances
# print(feature_importances_df)

In [20]:
# feature_importances_df['cumulative_importance'] = feature_importances_df['importance'].cumsum()

# # Determine the threshold for cumulative importance
# threshold = 0.99
# rf_selected_features = feature_importances_df[feature_importances_df['cumulative_importance'] <= threshold]['feature']

# # Display the selected features
# print(f"Selected features (cumulative importance <= {threshold*100}%): {rf_selected_features.tolist()}")
# print(f"Number of selected features: {len(rf_selected_features)}")

In [24]:
# drop na from train_df_cleaned
train_df_cleaned = train_df_cleaned.dropna()

In [28]:
# Assume train_df_cleaned has features like transaction amount, time, etc. 
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2, random_state=0) 
kmeans.fit(train_df_cleaned)
train_df_cleaned['cluster'] = kmeans.labels_

# Analyze the proportion of fraud in each cluster
for i in range(2):
    cluster_data = train_df_cleaned[train_df_cleaned['cluster'] == i]
    fraud_proportion = cluster_data['IsFraud'].mean()
    print(f"Cluster {i}: Fraud Proportion = {fraud_proportion}")

# ... (Use cluster as a feature in a supervised learning model)


Cluster 0: Fraud Proportion = 0.4998228834573149
Cluster 1: Fraud Proportion = 0.5019421665947346


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cleaned['cluster'] = kmeans.labels_


In [35]:
# Print the centroids
centroids = kmeans.cluster_centers_
# Find the attributes that are most important for the clusters
cluster_0 = centroids[0]
cluster_1 = centroids[1]

# Create a DataFrame for the centroids
centroids_df = pd.DataFrame({
    'feature': train_df_cleaned.columns,
    'cluster_0': cluster_0,
    'cluster_1': cluster_1
})

# Sort the DataFrame by cluster 0
centroids_df = centroids_df.sort_values(by='cluster_0', ascending=False)

# Display the sorted DataFrame
print(centroids_df)

                       feature     cluster_0      cluster_1
5                       Income  54914.120751  110823.415723
6                  Expenditure  28354.819904   61184.261513
7             GiftsTransaction   4481.993777    9020.219195
8            TransactionAmount    336.605685     356.139928
10                   Longitude    141.343404     141.093737
..                         ...           ...            ...
23      MaritalStatus_Divorced      0.055260       0.063876
26       MaritalStatus_Widowed      0.052426       0.054381
64  EmailDomain_securemail.com      0.050301       0.044454
61  EmailDomain_disposable.com      0.048530       0.048770
9                     Latitude    -30.529120     -29.843343

[69 rows x 3 columns]


In [49]:
# Assume train_df_cleaned has features like transaction amount, time, etc. 
from sklearn.cluster import KMeans
import pandas as pd

def cluster_analysis(df, num_clusters, target_col='IsFraud'): 
    kmeans = KMeans(n_clusters=num_clusters, random_state=0) 
    kmeans.fit(df)
    df['cluster'] = kmeans.labels_

    # Analyze the proportion of target_col in each cluster and other features
    cluster_stats = pd.DataFrame() # Initialize an empty DataFrame

    for i in range(num_clusters):
        cluster_data = df[df['cluster'] == i]
        target_proportion = cluster_data[target_col].mean()
        cluster_info = pd.DataFrame({
            'Cluster': [i],
            f'{target_col} Proportion': [target_proportion],
            'Count': [len(cluster_data)]
        })

        # Get the mean of numerical features
        numerical_features = cluster_data.select_dtypes(include=['number']).mean()
        cluster_info = pd.concat([cluster_info, pd.DataFrame(numerical_features).T], axis=1)

        # Get the mode of categorical features (handling empty DataFrames)
        categorical_features = cluster_data.select_dtypes(include=['object', 'category']).mode()
        for col in categorical_features.columns:
            if not categorical_features[col].isnull().all():
                cluster_info[col] = categorical_features[col].iloc[0]

        cluster_stats = pd.concat([cluster_stats, cluster_info], axis=0, ignore_index=True)


    return cluster_stats


# Example usage
cluster_stats = cluster_analysis(train_df_cleaned, num_clusters=2)
# Turn to pandas dataframe
cluster_stats = pd.DataFrame(cluster_stats)
# Display the cluster statistics
cluster_stats
# Save the cluster statistics to a CSV file
# cluster_stats.to_csv('cluster_stats.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cluster'] = kmeans.labels_


Unnamed: 0,Cluster,IsFraud Proportion,Count,Age,Gender,Occupation,EducationLevel,NumDependents,Income,Expenditure,...,MerchantID_M007,MerchantID_M008,EmailDomain_disposable.com,EmailDomain_gmail.com,EmailDomain_outlook.com,EmailDomain_securemail.com,EmailDomain_tempmail.com,EmailDomain_yahoo.com,IsFraud_Cluster,cluster
0,0,0.499823,5646,34.69412,0.524088,1.774176,0.874601,1.99752,54914.120751,28354.819904,...,0.126284,0.124867,0.04853,0.391605,0.209883,0.050301,0.099539,0.200142,0.499823,0.0
1,1,0.501942,2317,34.561502,0.507553,1.755287,0.860596,1.968925,110823.415723,61184.261513,...,0.125593,0.12473,0.04877,0.42555,0.197238,0.044454,0.092361,0.191627,0.501942,1.0


In [51]:
import pandas as pd

cluster_0 = cluster_stats[cluster_stats['cluster'] == 0]
cluster_1 = cluster_stats[cluster_stats['cluster'] == 1]

cluster_0_stats = cluster_0.mean()
cluster_1_stats = cluster_1.mean()

# Compare the two clusters by calculating the difference in means
diff_stats = pd.DataFrame({
    'Cluster_0_mean': cluster_0_stats,
    'Cluster_1_mean': cluster_1_stats,
    'Difference': cluster_1_stats - cluster_0_stats
})

# Sort by the absolute difference to find the most significant differences
diff_stats_sorted = diff_stats.reindex(diff_stats['Difference'].abs().sort_values(ascending=False).index)

# Print the main differences
print(diff_stats_sorted.head(10))  # Adjust the number of columns to display the top differences


                   Cluster_0_mean  Cluster_1_mean    Difference
Income               54914.120751   110823.415723  55909.294972
Expenditure          28354.819904    61184.261513  32829.441609
GiftsTransaction      4481.993777     9020.219195   4538.225418
Count                 5646.000000     2317.000000  -3329.000000
TransactionAmount      336.605685      356.139928     19.534243
UserTenure              61.212009       59.602935     -1.609074
Cluster                  0.000000        1.000000      1.000000
cluster                  0.000000        1.000000      1.000000
Latitude               -30.529120      -29.843343      0.685777
Longitude              141.343404      141.093737     -0.249667


# Modelling

In [11]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier

In [12]:
import mlflow
#mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment("InterUni Datathon")

<Experiment: artifact_location='file:///d:/Projects/InterUniDatathon/mlruns/1', creation_time=1726312537508, experiment_id='1', last_update_time=1726312537508, lifecycle_stage='active', name='InterUni Datathon', tags={}>

In [13]:
y = train_df_cleaned['IsFraud']
X = train_df_cleaned.drop('IsFraud', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
with mlflow.start_run(run_name="Random Forest Classifier"):

    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [10, 20, 30],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4, 10],
        'bootstrap': [True]
    }

    # Initialize the Random Forest Classifier
    rf = RandomForestClassifier(random_state=42)

    # Initialize the RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                                    scoring="f1", cv=10, refit=True, random_state=42, n_jobs=-1)

    # Fit the model
    random_search.fit(X_train, y_train)

    # Get the best parameters
    rf_best = random_search.best_estimator_

    # Make predictions
    y_pred = rf_best.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(rf_best, "random_forest_model")
    
    # Log metrics
    rf_f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("f1", rf_f1)
    
    params = random_search.best_params_
    # Log parameters
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('max_depth', params['max_depth'])

2024/09/14 22:34:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest Classifier at: http://127.0.0.1:5000/#/experiments/1/runs/5afe63ffbc594298a8c375a62222c72f.
2024/09/14 22:34:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [None]:
# lgbm_classifier = LGBMClassifier(random_state=42)

# # Define a parameter grid for hyperparameter tuning
# param_grid = {
#     'learning_rate': [0.2, 0.25, 0.3],
#     'max_depth': [5, 8, 10],
#     'num_leaves': [45, 50, 55],
#     'lambda_l1': [1, 1.2, 1.3],
#     'num_iterations': [2000, 5000],
#     'boosting_type': ['dart'],  # 'dart' is recommended for high accuracy
#     'verbosity': [-1] # Setting verbose to -1 suppresses all output.
# }

# # Initialize the RandomizedSearchCV
# random_search = RandomizedSearchCV(estimator=lgbm_classifier , param_distributions=param_grid, 
#                                     cv=5, scoring="f1", refit=True, random_state=42, n_jobs=-1)

# # Train the model using RandomSearchCV
# random_search.fit(X_train, y_train)

# # Get the best model and parameters
# lgbm_best = random_search.best_estimator_
# lgbm_best_params = random_search.best_params_

# # Predict on the test set
# y_pred = lgbm_best.predict(X_test)

# # Calculate metrics
# lgbm_f1 = f1_score(y_test, y_pred)
# print(f"F1-Score on test set: {lgbm_f1:.3f}")

In [32]:
with mlflow.start_run(run_name="XGBoost Classifier"):

    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 10]
    }

    # Initialize the XGBoost Regressor
    xg_classifier = XGBClassifier(random_state=42)

    # Initialize the RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=xg_classifier, param_distributions=param_grid, 
                                        cv=5, scoring="f1", refit=True, random_state=42, n_jobs=-1)

    # Fit the model
    random_search.fit(X_train, y_train)

    # Get the best parameters
    xgb_best = random_search.best_estimator_

    # Make predictions
    y_pred = xgb_best.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(xgb_best, "xgboost_model")
    
    # Log metrics
    xgb_f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("f1", xgb_f1)
    
    params = random_search.best_params_
    # Log parameters
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('learning_rate', params['learning_rate'])
    mlflow.log_param('max_depth', params['max_depth'])

2024/09/14 22:45:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost Classifier at: http://127.0.0.1:5000/#/experiments/1/runs/db6b4bd79687461ca2c167d2c4d2da1a.
2024/09/14 22:45:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


## Stacking

In [None]:
# lgb = LGBMClassifier(**lgbm_best_params, random_state=42)
# xgb = XGBClassifier(**xgb_best_params, random_state=42)

# # Define estimators with tuned parameters

# estimators = [
#     ('lgbm', lgb),
#     ('xgbr', xgb),
#     ('mlp', MLPRegressor(random_state=42))  # without tuning
# ]

# linear_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('lasso', LassoCV(alphas=np.logspace(-4, 1, 10), random_state=42))
# ])

# # Create the stacking regressor
# stack1 = StackingRegressor(
#     estimators=estimators,
#     final_estimator=linear_pipeline,
#     n_jobs=-1,
#     cv=5)

# stack1.fit(X_train, y_train)

# # Predict on the test set
# predictions = stack1.predict(X_test)

# # Calculate MAPE
# stack1_mape = mean_absolute_percentage_error(y_test, predictions)
# print(f"MAPE on test set: {stack1_mape:.3f}")

# Submission

In [None]:
df_submit = test_df_cleaned.drop(columns=['TransactionNumber'])

In [29]:
output_rf = pd.DataFrame({"TransactionNumber": test_df_cleaned['TransactionNumber'].values, "isFraud": rf_best.predict(df_submit)})
output_rf

Unnamed: 0,TransactionNumber,isFraud
0,11854,0
1,2647,0
2,5945,0
3,6798,1
4,12985,0
...,...,...
7307,2636,1
7308,14557,1
7309,296,0
7310,1864,0


In [34]:
output_xgb = pd.DataFrame({"TransactionNumber": test_df_cleaned['TransactionNumber'].values, "isFraud": xgb_best.predict(df_submit)})
output_xgb

Unnamed: 0,TransactionNumber,isFraud
0,11854,0
1,2647,0
2,5945,0
3,6798,1
4,12985,0
...,...,...
7307,2636,1
7308,14557,1
7309,296,0
7310,1864,0


In [None]:
output_rf.to_csv('submission/output_rf.csv', index=False)
output_xgb.to_csv('submission/output_xgb.csv', index=False)