In [4]:
import pandas as pd
import re

train_df = pd.read_csv('data/train.csv')

print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10966 entries, 0 to 10965
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TransactionNumber    10966 non-null  int64  
 1   UserID               10966 non-null  int64  
 2   Age                  10966 non-null  int64  
 3   Gender               10966 non-null  object 
 4   Occupation           10966 non-null  object 
 5   EducationLevel       10966 non-null  object 
 6   MaritalStatus        10966 non-null  object 
 7   NumDependents        10966 non-null  int64  
 8   Income               10966 non-null  object 
 9   Expenditure          10966 non-null  object 
 10  GiftsTransaction     10966 non-null  object 
 11  TransactionDate      10966 non-null  object 
 12  TransactionTime      10966 non-null  object 
 13  TransactionAmount    10966 non-null  object 
 14  MerchantID           10966 non-null  object 
 15  TransactionType      10966 non-null 

# Cleaning

In [6]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy='majority')
X_rus, y_rus = rus.fit_resample(train_df.drop(columns=['IsFraud']), train_df['IsFraud'])
train_df = pd.concat([X_rus, y_rus], axis=1)

In [7]:
# Age (does young adult cause more fraud?)
user_info = [
    "Age", 
    "Gender",
    "Terrorism",
    "Income",
    "MaritalStatus",
    "Occupation",
    "EducationLevel",
    "NumDependents",
    "GiftsTransaction" 
]
transaction_info = [
    "TransactionType", 
    "TransactionDate",
    "TransactionTime", 
    "TransactionAmount",
    "TransactionLocation",
    "MerchantID",
    "DeviceType"
]
account_info = [
    "UserTenure",
    "Expenditure",
    "Latitude",
    "Longitude",
    "EmailDomain"
]

In [8]:
def standard_time(time_str):
    try:
        if 'AM' in time_str or 'PM' in time_str:
            return pd.to_datetime(time_str, format='%I:%M:%S %p').strftime('%H:%M:%S')
        
        return pd.to_datetime(time_str, format='%H:%M:%S').strftime('%H:%M:%S')
    
    except (ValueError, TypeError):
        try:
            parts = time_str.split('/')
            corrected_time = f'{parts[0]}:{parts[1]}:{parts[2]}'
            return pd.to_datetime(corrected_time, format='%H:%M:%S').strftime('%H:%M:%S')
        except Exception:
            return 'NOPES'

def categorize_time(hour):
    if 0 <= hour < 6:
        return 'Night'
    elif 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    else:
        return 'Evening'

train_df['TransactionTime'] = train_df['TransactionTime'].apply(standard_time)
# apply categorize_time function to TransactionTime column
train_df['TransactionTime'] = pd.to_datetime(train_df['TransactionTime'], format='%H:%M:%S').dt.hour
train_df['TransactionTime'] = train_df['TransactionTime'].apply(categorize_time)


train_df['TransactionDate'] = pd.to_datetime(train_df['TransactionDate'], dayfirst=True)

train_df['date'] = train_df['TransactionDate'].dt.day   
train_df['Month'] = train_df['TransactionDate'].dt.month  

train_df['weekday'] = train_df['TransactionDate'].dt.day_name()

  train_df['TransactionDate'] = pd.to_datetime(train_df['TransactionDate'], dayfirst=True)


In [9]:
# Keep only the columns above
train_df = train_df[user_info + transaction_info + account_info + ['IsFraud']]

In [10]:
def get_currency_and_symbol(val):
    if 'AUD' in val or 'AU' in val:
        # create a feature for aud
        return re.sub(r'[^\d.]', '', val), 'AUD' 
    elif 'AED' in val:
        value = re.sub(r'[^\d.]', '', val)
        # convert AED to AUD
        return float(value) * 0.35, 'AED'
    elif '£' in val:
        value = re.sub(r'[^\d.]', '', val)
        # convert pounds to AUD
        return float(value) * 1.8, 'GBP'
    elif 'GBP' in val:
        value = re.sub(r'[^\d.]', '', val)
        # convert pounds to AUD
        return float(value) * 1.8, 'GBP'
    else:
        return "Unidentified", "Unknown" 


# Apply to Income
train_df['Income'], train_df['Income_Currency'] = zip(*train_df['Income'].apply(get_currency_and_symbol))
train_df['Income'] = pd.to_numeric(train_df['Income'], errors='coerce')  

# Apply to Expenditure
train_df['Expenditure'], train_df['Expenditure_Currency'] = zip(*train_df['Expenditure'].apply(get_currency_and_symbol))
train_df['Expenditure'] = pd.to_numeric(train_df['Expenditure'], errors='coerce')

# Apply to GiftsTransaction
train_df['GiftsTransaction'], train_df['GiftsTransaction_Currency'] = zip(*train_df['GiftsTransaction'].apply(get_currency_and_symbol))
train_df['GiftsTransaction'] = pd.to_numeric(train_df['GiftsTransaction'], errors='coerce')

# Apply to TransactionAmount
train_df['TransactionAmount'], train_df['TransactionAmount_Currency'] = zip(*train_df['TransactionAmount'].apply(get_currency_and_symbol))
train_df['TransactionAmount'] = pd.to_numeric(train_df['TransactionAmount'], errors='coerce') 

In [11]:
# Keep only the domain after @ for EmailDomain
train_df['EmailDomain'] = train_df['EmailDomain'].apply(lambda x: x.split('@')[1])

In [12]:
gender_mapping = {
    'fem': 'Female',
    'Female': 'Female',
    'she': 'Female',
    'woman': 'Female',
    'he': 'Male',
    'man': 'Male',
    'Male': 'Male',
    'isnotfemale': 'Male',
    'isnotmale': 'Female',
}

train_df['Gender'] =train_df['Gender'].map(gender_mapping).fillna('Other')

In [13]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df


occupation_order = [
    "Student",
    "Unemployed",
   "Retired", 
    "Professional",
]
education_order = [
    "High School","Bachelor","Master","PhD"
]

train_df = binary_encode(train_df, "Gender", "Male")
train_df = ordinal_encode(train_df, "Occupation", occupation_order)
train_df = ordinal_encode(train_df, "EducationLevel", education_order)
train_df = binary_encode(train_df, "Terrorism", True)

# One hot encoding for the rest
train_df = pd.get_dummies(train_df, columns=["MaritalStatus", "TransactionType", "TransactionLocation"])

columns_to_encode = ["Income_Currency", "Expenditure_Currency", "GiftsTransaction_Currency", "TransactionAmount_Currency", "TransactionTime","DeviceType", "MerchantID", "EmailDomain"]
train_df = pd.get_dummies(train_df, columns=columns_to_encode, prefix=columns_to_encode)



In [15]:
train_df

Unnamed: 0,Age,Gender,Terrorism,Income,Occupation,EducationLevel,NumDependents,GiftsTransaction,TransactionDate,TransactionAmount,UserTenure,Expenditure,Latitude,Longitude,IsFraud,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,TransactionType_Payment,TransactionType_Purchase,TransactionType_Transfer,TransactionType_Withdrawal,TransactionLocation_Adelaide,TransactionLocation_Adelaide City,TransactionLocation_Adl,TransactionLocation_BNE,TransactionLocation_Bne,TransactionLocation_Brisbane,TransactionLocation_CBR,TransactionLocation_Canberra,TransactionLocation_Cbr,TransactionLocation_DRW,TransactionLocation_Darwin,TransactionLocation_Drw,TransactionLocation_HBT,TransactionLocation_Hbt,TransactionLocation_Hobart,TransactionLocation_MLB,TransactionLocation_Mel,...,TransactionLocation_brisbane,TransactionLocation_canberra,TransactionLocation_darwin,TransactionLocation_hobart,TransactionLocation_melbourne,TransactionLocation_perth,TransactionLocation_sydney,Income_Currency_AUD,Expenditure_Currency_AED,Expenditure_Currency_AUD,GiftsTransaction_Currency_AUD,GiftsTransaction_Currency_GBP,TransactionAmount_Currency_AED,TransactionAmount_Currency_AUD,TransactionTime_Afternoon,TransactionTime_Evening,TransactionTime_Morning,TransactionTime_Night,DeviceType_Desktop,DeviceType_Mobile,DeviceType_Tablet,DeviceType_android,DeviceType_galaxys7,DeviceType_iphone 15,DeviceType_mob,DeviceType_smartphone,MerchantID_M001,MerchantID_M002,MerchantID_M003,MerchantID_M004,MerchantID_M005,MerchantID_M006,MerchantID_M007,MerchantID_M008,EmailDomain_disposable.com,EmailDomain_gmail.com,EmailDomain_outlook.com,EmailDomain_securemail.com,EmailDomain_tempmail.com,EmailDomain_yahoo.com
10276,40,0,1,93510.58,1,1,3,14488.300,2023-05-31,69.1145,91,38615.668,-31.840233,145.612793,0,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False,True,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
582,29,1,0,67641.30,3,0,0,1185.930,2023-03-04,1593.9700,84,39211.640,-25.042261,117.793221,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False
6229,-33,0,0,35880.19,3,2,0,9219.978,2023-05-11,73.5100,66,14230.710,-20.917574,142.702789,0,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False
618,27,0,0,69603.60,3,0,1,16855.020,2023-08-15,360.9100,69,50554.530,-30.000233,136.209152,0,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True
1996,37,0,0,59926.02,3,1,2,10835.586,2023-03-06,55.8200,35,37549.190,-31.840233,145.612793,0,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,30,1,0,143566.54,3,1,1,29834.800,2023-10-26,99.2915,51,90449.002,-37.020100,144.964600,1,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False,True,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
10956,35,1,0,29584.49,3,2,0,7015.860,2023-12-05,747.1700,114,10238.380,-37.020100,144.964600,1,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False
10962,-18,0,0,80403.31,3,0,2,344.178,2023-05-24,137.5000,91,63429.080,-37.020100,144.964600,1,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True
10964,29,0,0,28654.66,0,0,4,2122.938,2023-07-16,68.0500,62,9748.530,-37.020100,144.964600,1,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False


# Modelling

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostRegressor
# from sklearn.svm import SVR
# from sklearn.neural_network import MLPRegressor

import mlflow
# mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# df = pd.read_csv("train.csv")
# df_test = pd.read_csv("test.csv")

In [19]:
# Initialize and train the Random Forest Regressor
y = train_df['IsFraud']
X = train_df.drop(['IsFraud', , axis=1)

rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=42)
rf_classifier.fit(X, y)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame for better visualization
feature_importances_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importances
})

# Sort the features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# Display the sorted feature importances
print(feature_importances_df)

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>)

In [None]:
feature_importances_df['cumulative_importance'] = feature_importances_df['importance'].cumsum()

# Determine the threshold for cumulative importance
threshold = 0.975
rf_selected_features = feature_importances_df[feature_importances_df['cumulative_importance'] <= threshold]['feature']

# Display the selected features
print(f"Selected features (cumulative importance <= {threshold*100}%): {rf_selected_features.tolist()}")

In [None]:
mlflow.set_experiment("InterUni Datathon")

In [None]:
with mlflow.start_run(run_name="Random Forest Classifier"):

    y = df_train['isFraud']
    X = df_train[rf_selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        'n_estimators': [100, 200, 500, 1000],
        'max_depth': [10, 20, 30, 40, 50],
    }

    # Initialize the Random Forest Classifier
    rf = RandomForestClassifier(random_state=42)

    # Initialize the RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                                    scoring=f1_score, cv=5, refit=True, random_state=42, n_jobs=-1)

    # Fit the model
    random_search.fit(X_train, y_train)

    # Get the best parameters
    model = random_search.best_estimator_

    # Make predictions
    predictions = model.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(model, "random_forest_model")
    
    # Log metrics
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)
    
    params = random_search.best_params_
    # Log parameters
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('max_depth', params['max_depth'])

In [None]:
lgbm_classifier = LGBMClassifier(random_state=42)

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.2, 0.25, 0.3],
    'max_depth': [5, 8, 10],
    'num_leaves': [45, 50, 55],
    'lambda_l1': [1, 1.2, 1.3],
    'num_iterations': [2000, 5000],
    'boosting_type': ['dart'],  # 'dart' is recommended for high accuracy
    'verbosity': [-1] # Setting verbose to -1 suppresses all output.
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=lgbm_classifier , param_distributions=param_grid, 
                                    cv=5, scoring=f1_score, refit=True, random_state=42, n_jobs=-1)

# Train the model using GridSearchCV
random_search.fit(X_train, y_train)

# Get the best model and parameters
lgbm_best = random_search.best_estimator_
lgbm_best_params = random_search.best_params_

# Predict on the test set
y_pred = lgbm_best.predict(X_test)

# Calculate metrics
lgbm_f1 = f1_score(y_test, y_pred)
print(f"F1-Score on test set: {lgbm_f1:.3f}")

In [3]:
with mlflow.start_run(run_name="XGBoost Classifier"):
    y = df_train['isFraud']
    X = df_train[rf_selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        'n_estimators': [100, 200, 500, 1000],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 10]
    }

    # Initialize the XGBoost Regressor
    xg_classifier = XGBClassifier(random_state=42)

    # Initialize the RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=xg_classifier, param_distributions=param_grid, 
                                        cv=5, scoring=f1_score, refit=True, random_state=42, n_jobs=-1)

    # Fit the model
    random_search.fit(X_train, y_train)

    # Get the best parameters
    model = random_search.best_estimator_

    # Make predictions
    predictions = model.predict(X_test)
    
    # Log model
    mlflow.sklearn.log_model(model, "xgboost_model")
    
    # Log metrics
    accuracy = accuracy(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)
    
    params = random_search.best_params_
    # Log parameters
    mlflow.log_param('n_estimators', params['n_estimators'])
    mlflow.log_param('learning_rate', params['learning_rate'])
    mlflow.log_param('max_depth', params['max_depth'])

NameError: name 'mlflow' is not defined

## Stacking

In [None]:
lgb = LGBMClassifier(**lgbm_best_params, random_state=42)
xgb = XGBClassifier(**xgb_best_params, random_state=42)
# xgb_l = TransformedTargetRegressor(regressor=xgb, func=np.log1p, inverse_func=np.expm1)

In [None]:
# Define estimators with tuned parameters

estimators = [
    ('lgbm', lgb),
    ('xgbr', xgb),
    ('mlp', MLPRegressor(random_state=42))  # without tuning
]

linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', LassoCV(alphas=np.logspace(-4, 1, 10), random_state=42))
])

# Create the stacking regressor
stack1 = StackingRegressor(
    estimators=estimators,
    final_estimator=linear_pipeline,
    n_jobs=-1,
    cv=5)

stack1.fit(X_train, y_train)

# Predict on the test set
predictions = stack1.predict(X_test)

# Calculate MAPE
stack1_mape = mean_absolute_percentage_error(y_test, predictions)
print(f"MAPE on test set: {stack1_mape:.3f}")