##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
#%pip install pandas 
#%pip install matplotlib
%pip install pyarrow
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [4]:
# Can have as many cells as you want for code
import pandas as pd
import numpy as np
import warnings
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [None]:
import pyarrow.parquet as pq

# Read the Parquet file into a PyArrow Table
table = pq.read_table(filepath)

# Convert the PyArrow Table to a Pandas DataFrame if needed
df = table.to_pandas()

print(df.shape)

# Now you can work with the DataFrame or the PyArrow Table as needed
df.head()



In [None]:
df.describe()

## Feature Selection, feature engineering, handling of NA values

In [None]:
new_df = df[["ctrycode_desc", "cltdob_fix", "min_occ_date", "stat_flag", "is_housewife_retiree", "is_sg_pr", "annual_income_est",
             "hh_size_est", "is_consent_to_mail", "is_consent_to_email", "is_consent_to_call", "is_consent_to_sms", "flg_substandard",
            "flg_is_borderline_standard", "flg_is_rental_flat", "flg_has_health_claim", "flg_has_life_claim", "flg_gi_claim",
            "flg_is_proposal", "flg_is_returned_mail",'n_months_last_bought_products', 'flg_latest_being_lapse', 'flg_latest_being_cancel', 
             'tot_inforce_pols', 'tot_cancel_pols', 'f_ever_declined_la', 'f_elx', 'f_mindef_mha', 'f_retail', 'affcon_visit_days', 
             'clmcon_visit_days', "f_purchase_lh"]].copy()

#replacing NaN values
new_df.loc[new_df['tot_cancel_pols'].isna(), 'tot_cancel_pols'] = 0
new_df.loc[new_df['f_ever_declined_la'].isna(), 'f_ever_declined_la'] = 0
new_df.loc[new_df['affcon_visit_days'].isna(), 'affcon_visit_days'] = 0
new_df.loc[new_df['clmcon_visit_days'].isna(), 'clmcon_visit_days'] = 0
new_df.loc[df["f_purchase_lh"].isna(), "f_purchase_lh"] = 0

from datetime import datetime

# Get today's date as a datetime object
today_date = datetime.today()

#Get the ages of the clients
new_df["age"] = (today_date - pd.to_datetime(new_df["cltdob_fix"], errors='coerce')).astype('<m8[Y]') #errors='coerce' will replace invalid parsing or None values with NaT

#Get the number of years since the client's first interaction or policy purchase with the company
new_df["num_years_since_first_interaction"] = (today_date - pd.to_datetime(new_df["min_occ_date"], errors='coerce')).astype('<m8[Y]')

'''
count=1
for column in new_df.columns:
    if new_df[column].isna().sum()==0 and count>=21:
        print(f"na_values in {column}: {new_df[column].isna().sum()}")
        print(f"value_counts in {column}: {new_df[column].value_counts()}")
    count+=1
'''

#Replacing certain values with np.nan
new_df["ctrycode_desc"].replace("Not Applicable", np.nan, inplace=True)

#Replacing the NA values in annual_income_est and hh_size_est columns with the mode as the number of NA values is quite high and removing them will remove quite a lot of rows
new_df["annual_income_est"].replace(np.nan, new_df["annual_income_est"].mode().iloc[0], inplace=True)
new_df["hh_size_est"].replace(np.nan, new_df["hh_size_est"].mode().iloc[0], inplace=True)

#Drop the other NA values
new_df.dropna(inplace=True)

print(new_df.isna().sum().sum())

#Combine is_consent_to_mail, email, call and sms into one column
def combine_communications(mail, email, call, sms):
    results=[]
    for i in range(len(mail)):
        if mail.iloc[i]==0 and email.iloc[i]==0 and call.iloc[i]==0 and sms.iloc[i]==0:
            results.append(0)
        else:
            results.append(1)
    return results

new_df["is_consent_to_communications"] = combine_communications(new_df["is_consent_to_mail"], new_df["is_consent_to_email"], new_df["is_consent_to_call"], new_df["is_consent_to_sms"])    

#Combine flg_has_health_claim, flg_has_life_claim, flg_gi_claim
def combine_claims(health, life, gi):
    results=[]
    for i in range(len(health)):
        if health.iloc[i]==0 and life.iloc[i]==0 and gi.iloc[i]==0:
            results.append(0)
        else:
            results.append(1)
    return results

new_df["flg_has_claims"] = combine_claims(new_df["flg_has_health_claim"], new_df["flg_has_life_claim"], new_df["flg_gi_claim"])

new_df.drop(["cltdob_fix", "min_occ_date", "is_consent_to_mail", "is_consent_to_email", "is_consent_to_call", "is_consent_to_sms", "flg_has_health_claim", "flg_has_life_claim", "flg_gi_claim"], axis=1, inplace=True)

#Changing the column types 
target_dtypes = {"ctrycode_desc": 'category', "stat_flag": 'category', "is_housewife_retiree": 'category', "is_sg_pr": 'category',
                 "annual_income_est": 'category', "hh_size_est" : 'category', "flg_substandard": 'category', "flg_is_borderline_standard": 'category', 
                 "flg_is_rental_flat" : 'category', "flg_is_proposal" : 'category', "flg_is_returned_mail": 'category',
                 "n_months_last_bought_products" : int, "flg_latest_being_lapse" : 'category', "flg_latest_being_cancel":'category', 
                 "tot_inforce_pols": int, "tot_cancel_pols" : int, "f_ever_declined_la" : 'category', "f_elx": 'category',
                 "f_mindef_mha" : 'category', "f_retail" : 'category', "affcon_visit_days" : int, "clmcon_visit_days" : int,
                 "age": int, "num_years_since_first_interaction": int, "is_consent_to_communications": 'category',
                 "flg_has_claims" :'category'}

new_df = new_df.apply(lambda x: x.astype(target_dtypes.get(x.name)))
new_df = new_df.reset_index(drop=True)
new_df.head()


In [None]:
print(new_df[new_df["is_housewife_retiree"].isna()]["is_sg_pr"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["is_consent_to_mail"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["is_consent_to_email"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["is_consent_to_call"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["is_consent_to_sms"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_substandard"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_is_borderline_standard"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_is_rental_flat"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_has_health_claim"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_has_life_claim"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_gi_claim"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_is_proposal"].isna().sum())
print(new_df[new_df["is_housewife_retiree"].isna()]["flg_is_returned_mail"].isna().sum())

#Whenever the value in the column is_housewife_retiree is nan, the values in the columns is_sg_pr, is_consent_to_mail,
#is_consent_to_email, is_consent_to_call, is_consent_to_sms will be nan also. 

print(new_df[new_df["annual_income_est"].isna()]["hh_size_est"].isna().sum())
#Whenever the value in the column annual_income_est is nan, the value in the hh_size_est is nan too.

'''
Thus, the same rows will be removed from the dataset when we remove NA values. 
This ensures that the number of rows removed from the dataset is not too high
so we still retain most information.
'''

## Preprocessing and split dataset into train test


In [None]:
#preparing the target column
y = new_df["f_purchase_lh"].copy()

#One hot encoding of nominal categorical variables
from sklearn.preprocessing import OneHotEncoder
X_nominal_cat = new_df.select_dtypes(include='category')
X_nominal_cat.drop(["annual_income_est", "hh_size_est"], axis=1, inplace=True)
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
one_hot_df = pd.DataFrame(one_hot_encoder.fit_transform(X_nominal_cat), columns=one_hot_encoder.get_feature_names_out())

#Encoding of ordinal categorical variables
from sklearn.preprocessing import LabelEncoder
X_ordinal_cat = new_df[["annual_income_est", "hh_size_est"]].copy()

'''
label_encoder = LabelEncoder()
for column in X_ordinal_cat.columns:
    X_ordinal_cat[column] = label_encoder.fit_transform(X_ordinal_cat[column])
print(X_ordinal_cat)
'''

label_mapping_income = {"A.ABOVE200K": 0, "B.100K-200K": 1, "C.60K-100K": 2, "D.30K-60K": 3, "E.BELOW30K": 4}
X_ordinal_cat["annual_income_est"] = X_ordinal_cat["annual_income_est"].map(label_mapping_income)
X_ordinal_cat["hh_size_est"].replace(">4", 5, inplace=True)

#Scaling of numerical variables
from sklearn.preprocessing import MinMaxScaler
X_numerical = new_df.select_dtypes(include=int)
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(X_numerical), columns=X_numerical.columns)

#Combine the 3 dataframes together
X_preprocessed = pd.concat([one_hot_df, X_ordinal_cat, scaled_df], axis=1, ignore_index=True) #The indexes cannot be different, must reset indexes

#Split into train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

#Address imbalance in dataset
#Note: Important to do this after splitting the dataset, to prevent leaking information to the test dataset
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

 '''
 Reason why we chose oversampler over undersampler and SMOTE:
 1. Since the target column distribution is 4% 1.0 and 96% 0.0, using undersampler will result in 
 great loss of information, which will impact the model performance significantly.
 2. Since there are many categorical variables, we are unable to use SMOTE to increase the number of samples
 for the under represented class.
 '''

## Random Forest Classification (Default)

In [None]:
'''
Reasons why we chose random forest: 
It is a good choice for imbalanced data because ensemble nature helps in capturing complex relationships in the data 
and reduces the risk of overfitting to the majority class. Also, Decision trees, the building blocks of Random Forest, 
are less sensitive to outliers and noise. This can be beneficial when dealing with imbalanced datasets, 
as noisy instances or outliers in the majority class may not heavily influence the overall model.
'''

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the model
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test) 
#it's important to use the original, unmodified test data as the purpose of the test set is to 
#simulate real-world performance, and modifying it with oversampling could lead to overly optimistic performance estimates.

# Evaluate the model
from sklearn.metrics import precision_score, recall_score
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

## Random Forest Classification (with hyperparameter tuning)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42) 

'''
1. class_weight='balanced': automatically adjusts the weights based on the number of samples in each class
2. limiting the depth of the trees and increasing the minimum number of samples required to split a node can
help prevent overfitting on the majority class
3. Increasing  min_samples_leaf helps in preventing the creation of small leaf nodes that might capture noise.
4. Limiting the number of features considered for each split can add more randomness and reduce overfitting.
5. Increasing the number of trees in the forest can lead to a more stable model (The results will not change that much each time we run the code).
'''

param_grid = {'n_estimators': [80,100,120], 
             'max_depth': [10,15,20],
             'min_samples_split': [5,10,15],
             'min_samples_leaf': [3,5,8],
             'max_features': ['auto', 'sqrt', 'log2']}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring=['recall_weighted', 'precision_weighted'], refit='precision_weighted')
#multi metrics allows you to optimize multiple evaluation metrics simultaneously

# Train the model
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters
print("Best Hyperparameters:", grid_search.best_params_)
best_params = grid_search.best_params_

# Make predictions on the test set
y_pred = grid_search.best_estimator_.predict(X_test) 
# it's important to use the original, unmodified test data as the purpose of the test set is to 
# simulate real-world performance, and modifying it with oversampling could lead to overly optimistic performance estimates.

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


## Use the best parameters from hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42, max_depth = best_params['max_depth'], 
                                       max_features = best_params['max_features'],
                                       min_samples_leaf = best_params['min_samples_leaf'],
                                      min_samples_split = best_params['min_samples_split'],
                                      n_estimators = best_params['n_estimators'])

# Train the model
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test) 
#it's important to use the original, unmodified test data as the purpose of the test set is to 
#simulate real-world performance, and modifying it with oversampling could lead to overly optimistic performance estimates.

# Evaluate the model
from sklearn.metrics import precision_score, recall_score
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted') #average='weighted' accounts for label imbalance
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")

# Calculate other metrics
from sklearn.metrics import roc_auc_score, accuracy_score
auc_roc = roc_auc_score(y_test, y_pred)
print(f"auc_roc: {auc_roc:.2f}")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()


In [None]:
#roc curve
from sklearn.metrics import roc_curve, roc_auc_score

# Get probabilities for the positive class
y_probs = rf_classifier.predict_proba(X_test)[:, 1]

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# Plot ROC curve
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_probs):.2f}')
plt.plot([0, 1], [0, 1], '--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Random Forest')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Get feature importances from the trained model
feature_importances = rf_classifier.feature_importances_

# Sort features by importance
sorted_indices = np.argsort(feature_importances)[::-1]

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[sorted_indices], rotation=90)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importance')
plt.show()

In [None]:
#for plotting the tree
from sklearn.tree import plot_tree

# Plot a single decision tree from the Random Forest
plt.figure(figsize=(20, 10))
plot_tree(rf_classifier.estimators_[0], feature_names=X_train.columns, filled=True, rounded=True)
plt.show()

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [10]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    import pandas as pd
    import numpy as np
    new_df = hidden_data[["ctrycode_desc", "cltdob_fix", "min_occ_date", "stat_flag", "is_housewife_retiree", "is_sg_pr", "annual_income_est",
             "hh_size_est", "is_consent_to_mail", "is_consent_to_email", "is_consent_to_call", "is_consent_to_sms", "flg_substandard",
            "flg_is_borderline_standard", "flg_is_rental_flat", "flg_has_health_claim", "flg_has_life_claim", "flg_gi_claim",
            "flg_is_proposal", "flg_is_returned_mail",'n_months_last_bought_products', 'flg_latest_being_lapse', 'flg_latest_being_cancel', 
             'tot_inforce_pols', 'tot_cancel_pols', 'f_ever_declined_la', 'f_elx', 'f_mindef_mha', 'f_retail', 'affcon_visit_days', 
             'clmcon_visit_days', "f_purchase_lh"]].copy()

    #replacing NaN values
    new_df.loc[new_df['tot_cancel_pols'].isna(), 'tot_cancel_pols'] = 0
    new_df.loc[new_df['f_ever_declined_la'].isna(), 'f_ever_declined_la'] = 0
    new_df.loc[new_df['affcon_visit_days'].isna(), 'affcon_visit_days'] = 0
    new_df.loc[new_df['clmcon_visit_days'].isna(), 'clmcon_visit_days'] = 0
    new_df.loc[new_df["f_purchase_lh"].isna(), "f_purchase_lh"] = 0

    from datetime import datetime

    # Get today's date as a datetime object
    today_date = datetime.today()

    #Get the ages of the clients
    new_df["age"] = (today_date - pd.to_datetime(new_df["cltdob_fix"], errors='coerce')).astype('<m8[Y]') #errors='coerce' will replace invalid parsing or None values with NaT

    #Get the number of years since the client's first interaction or policy purchase with the company
    new_df["num_years_since_first_interaction"] = (today_date - pd.to_datetime(new_df["min_occ_date"], errors='coerce')).astype('<m8[Y]')

    #Replacing certain values with np.nan
    new_df["ctrycode_desc"].replace("Not Applicable", np.nan, inplace=True)

    #Replacing the NA values in annual_income_est and hh_size_est columns with the mode as the number of NA values is quite high and removing them will remove quite a lot of rows
    new_df["annual_income_est"].replace(np.nan, new_df["annual_income_est"].mode().iloc[0], inplace=True)
    new_df["hh_size_est"].replace(np.nan, new_df["hh_size_est"].mode().iloc[0], inplace=True)

    #Drop the other NA values
    new_df.dropna(inplace=True)

    #Combine is_consent_to_mail, email, call and sms into one column
    def combine_communications(mail, email, call, sms):
        results=[]
        for i in range(len(mail)):
            if mail.iloc[i]==0 and email.iloc[i]==0 and call.iloc[i]==0 and sms.iloc[i]==0:
                results.append(0)
            else:
                results.append(1)
        return results

    new_df["is_consent_to_communications"] = combine_communications(new_df["is_consent_to_mail"], new_df["is_consent_to_email"], new_df["is_consent_to_call"], new_df["is_consent_to_sms"])    

    #Combine flg_has_health_claim, flg_has_life_claim, flg_gi_claim
    def combine_claims(health, life, gi):
        results=[]
        for i in range(len(health)):
            if health.iloc[i]==0 and life.iloc[i]==0 and gi.iloc[i]==0:
                results.append(0)
            else:
                results.append(1)
        return results

    new_df["flg_has_claims"] = combine_claims(new_df["flg_has_health_claim"], new_df["flg_has_life_claim"], new_df["flg_gi_claim"])

    new_df.drop(["cltdob_fix", "min_occ_date", "is_consent_to_mail", "is_consent_to_email", "is_consent_to_call", "is_consent_to_sms", "flg_has_health_claim", "flg_has_life_claim", "flg_gi_claim"], axis=1, inplace=True)

    #Changing the column types 
    target_dtypes = {"ctrycode_desc": 'category', "stat_flag": 'category', "is_housewife_retiree": 'category', "is_sg_pr": 'category',
                     "annual_income_est": 'category', "hh_size_est" : 'category', "flg_substandard": 'category', "flg_is_borderline_standard": 'category', 
                     "flg_is_rental_flat" : 'category', "flg_is_proposal" : 'category', "flg_is_returned_mail": 'category',
                     "n_months_last_bought_products" : int, "flg_latest_being_lapse" : 'category', "flg_latest_being_cancel":'category', 
                     "tot_inforce_pols": int, "tot_cancel_pols" : int, "f_ever_declined_la" : 'category', "f_elx": 'category',
                     "f_mindef_mha" : 'category', "f_retail" : 'category', "affcon_visit_days" : int, "clmcon_visit_days" : int,
                     "age": int, "num_years_since_first_interaction": int, "is_consent_to_communications": 'category',
                     "flg_has_claims" :'category'}

    new_df = new_df.apply(lambda x: x.astype(target_dtypes.get(x.name)))
    new_df = new_df.reset_index(drop=True)
    
    #preparing the target column
    y = new_df["f_purchase_lh"].copy()

    #One hot encoding of nominal categorical variables
    from sklearn.preprocessing import OneHotEncoder
    X_nominal_cat = new_df.select_dtypes(include='category')
    X_nominal_cat.drop(["annual_income_est", "hh_size_est"], axis=1, inplace=True)
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
    one_hot_df = pd.DataFrame(one_hot_encoder.fit_transform(X_nominal_cat), columns=one_hot_encoder.get_feature_names_out())

    #Encoding of ordinal categorical variables
    from sklearn.preprocessing import LabelEncoder
    X_ordinal_cat = new_df[["annual_income_est", "hh_size_est"]].copy()
    label_encoder = LabelEncoder()
    for column in X_ordinal_cat.columns:
        X_ordinal_cat[column] = label_encoder.fit_transform(X_ordinal_cat[column])

    #Scaling of numerical variables
    from sklearn.preprocessing import MinMaxScaler
    X_numerical = new_df.select_dtypes(include=int)
    scaler = MinMaxScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(X_numerical), columns=X_numerical.columns)

    #Combine the 3 dataframes together
    X_preprocessed = pd.concat([one_hot_df, X_ordinal_cat, scaled_df], axis=1, ignore_index=True) #The indexes cannot be different, must reset indexes

    #Split into train and test datasets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

    #Address imbalance in dataset
    #Note: Important to do this after splitting the dataset, to prevent leaking information to the test dataset
    from imblearn.over_sampling import RandomOverSampler
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
    
    from sklearn.ensemble import RandomForestClassifier

    # Create a Random Forest Classifier
    rf_classifier = RandomForestClassifier(random_state=42, max_depth = 20, 
                                           max_features = 'auto',
                                           min_samples_leaf = 3,
                                          min_samples_split = 10,
                                          n_estimators = 120)

    # Train the model
    rf_classifier.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test) 
    results = list(y_pred) 
    return results

##### Cell to check testing_hidden_data function

In [11]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
#test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

  warn(


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!