##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
#%pip install pandas 
#%pip install matplotlib
#%pip install numpy
#%pip install seaborn
#%pip install scikit-learn
#%pip install imbalanced-learn
#%pip install joblib
# pd.read_parquet needs a parquet engine; pyarrow is assumed here (confirm the engine used locally)
#%pip install pyarrow
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [None]:
# Can have as many cells as you want for code
import pandas as pd

# Relative location of the training parquet file; it is read by the loading cell and
# again by the final check cell.
filepath = "./data/catB_train.parquet"
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [None]:
# Load the training data from the parquet file at `filepath`.
df = pd.read_parquet(filepath)

# Show the loaded frame as the cell output.
df

In [None]:
# Summary statistics for the described columns, extended with two extra rows:
#   "mode"      - the most frequent value of each column (first one when there are ties)
#   "mode_freq" - the percentage of rows that hold that most frequent value
numerical_stats = df.describe()

# Work on exactly the columns describe() reported so the extra rows line up with the table.
described = df[numerical_stats.columns]

# The mode has to be taken from the data itself; taking it from the describe() table
# would return the most common *summary statistic*, not the most common value.
modes = described.mode().iloc[0]
numerical_stats.loc["mode"] = modes

# Frequency of the mode = number of rows equal to it, expressed as a share of all rows.
numerical_stats.loc["mode_freq"] = described.eq(modes).sum() / len(df) * 100
numerical_stats

In [None]:
# Summary of the object-dtype (string) columns only.
df.describe(include=['object'])

In [None]:
# Print a concise summary of the frame (the figures below were noted from a previous run).
df.info()
# 17992 rows by 304 columns
# 90 numerical
# 214 categorical
# target variable is `f_purchase_lh`

In [None]:
# Numeric predictors: every numeric column except the target `f_purchase_lh`.
numerical_features = (
    df[df.columns.difference(["f_purchase_lh"])]
    .select_dtypes(include="number")
    .columns
)
# Every non-numeric column is treated as categorical.
categorical_features = df.select_dtypes(exclude="number").columns

In [None]:
# Show the names of the non-numeric columns.
print("categorical_features:", categorical_features)

# Notes on the kind of categorical variable (from manual inspection):
# n_months_last_bought is ordinal
# race_desc is nominal

In [None]:
import pandas as pd

# Allow up to 500 rows in rendered output before pandas truncates it.
# NOTE(review): presumably meant to reveal the full column list below; the repr of an
# Index appears to be governed by `display.max_seq_items` instead - confirm.
pd.set_option('display.max_rows', 500)
categorical_features

In [None]:
# Inspect the raw values of `min_occ_date`.
df["min_occ_date"]

In [None]:
# List the distinct values of `n_months_last_bought_lh_e22a6a`.
df["n_months_last_bought_lh_e22a6a"].unique()

In [None]:
# Show the names of the numeric feature columns.
print("numerical_features:", numerical_features)

In [None]:
# Number of numeric feature columns.
numerical_features.shape

In [None]:
# Inspect the target column.
df["f_purchase_lh"]
# Noted from a previous run: 7100 1s, 10892 NaN

In [None]:
from matplotlib import pyplot as plt

# plot histograms for numerical columns (all panels share one 20x20 inch figure)
df.hist(figsize=(20, 20))
plt.show()

In [None]:
# Per-column missing-value summary: absolute count and share of rows (in %).
count_missing_per_col = df.isna().sum()
percent_missing_per_col = df.isna().mean() * 100

# Keep only columns that actually have gaps, worst offenders first.
cols_with_nan = (
    pd.DataFrame({'count': count_missing_per_col, 'percentage': percent_missing_per_col})
    .loc[lambda summary: summary['count'] > 0]
    .sort_values(by='count', ascending=False)
)

print("====Columns with missing values:====")
print("Cols_with_nan shape:", cols_with_nan.shape)
# Plan: drop cols with over 90% nan, fillna the rest with median values
cols_with_nan

In [None]:
print("Rows with most NaN values:")

# Number of missing cells in each row.
nan_count_per_row = df.isna().sum(axis=1)

# Copy of the data carrying the per-row missing count and its share of all columns (in %).
df_check_nan = df.assign(
    nan_count=nan_count_per_row,
    nan_percent=(nan_count_per_row / len(df.columns)) * 100,
)

# Two-column summary (count / percent) of missing data per row.
rows_w_missing_data = df_check_nan[['nan_count', 'nan_percent']].rename(
    columns={'nan_count': 'count', 'nan_percent': 'percent'}
)

# Rows with the most missing cells come first.
missing_data_sorted = rows_w_missing_data.sort_values(by=['count'], ascending=False)
missing_data_sorted

In [None]:
# Correlation heatmap
import numpy as np
import seaborn as sns

# Pairwise correlation between the numeric columns.
corr_matrix = df.select_dtypes(include=np.number).corr()

# Blank out the diagonal (each column's correlation with itself) so it does not dominate
# the plot or the >= 1 check below. A boolean mask is used instead of writing into
# `corr_matrix.values`, which is a read-only array when pandas Copy-on-Write is active.
corr_matrix = corr_matrix.mask(np.eye(len(corr_matrix), dtype=bool))

sns.heatmap(corr_matrix, annot=False, cmap='Blues', fmt='g')

# Positions (row indices, column indices) of off-diagonal pairs that are perfectly correlated.
np.where(corr_matrix >= 1)

In [None]:
# Set the label column aside before the feature frame is pruned.
target = df["f_purchase_lh"]

# Drop columns with more than 90% nan values
cols_to_drop = cols_with_nan.loc[cols_with_nan['percentage'] > 90].index
df = df.drop(columns=cols_to_drop)

In [None]:
# remove nan from target and drop target from df
# Missing labels become class 0 (presumably "no purchase" - confirm with the data owner).
target = target.fillna(0)

# Remove the label from the feature frame explicitly. Previously it only disappeared
# later as a side effect of median-filling followed by the constant-column drop, which
# would silently stop working (and leak the label) if the target ever held two values.
# errors="ignore" keeps the cell safe to re-run.
df = df.drop(columns=["f_purchase_lh"], errors="ignore")

In [None]:
# Numeric gaps: replace with the median of the respective column.
numerical_columns = df.select_dtypes(include='number').columns
numeric_medians = df[numerical_columns].median()
df[numerical_columns] = df[numerical_columns].fillna(numeric_medians)

# Non-numeric gaps: replace with the most frequent value of the respective column
# (first row of mode() is used when several values tie).
categorical_columns = df.select_dtypes(exclude='number').columns
most_frequent_values = df[categorical_columns].mode().iloc[0]
df[categorical_columns] = df[categorical_columns].fillna(most_frequent_values)


In [None]:
# `clntnum` is not used as a feature, so remove it from the frame
df = df.drop("clntnum", axis=1)
df.head()

In [None]:
# A column holding a single distinct value (NaN counts as a value) carries no signal: drop it.
cols_to_drop = [
    column for column in df.columns
    if df[column].nunique(dropna=False) == 1
]
df = df.drop(columns=cols_to_drop)

In [None]:
# Frame dimensions after the preceding preprocessing cells.
df.shape

In [None]:
# drop columns with high correlation (unclear if we want to do this)
# corr_matrix = df.select_dtypes(include=np.number).corr()
# np.fill_diagonal(corr_matrix.values, np.nan)
# corr_matrix = corr_matrix.abs()
# corr_matrix = corr_matrix[corr_matrix > 0.9]

In [None]:
# drop outliers


In [None]:
# scale numerical columns

In [None]:
# drop other useless columns based on prior knowledge

In [None]:
# encode nominal columns

In [None]:
# encode ordinal columns

In [None]:
# imblearn SMOTE on target due to 10x imbalance

In [None]:
# Object columns that are left as strings: they are excluded from the integer-code
# conversion below and are the columns the Model pipeline one-hot encodes.
nominal_data = ['race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'min_occ_date',
                'cltdob_fix', 'cltsex_fix']

In [None]:
# cast all other categorical columns to numerical

# Object columns that are not in the nominal list get replaced by integer category codes.
categorical_columns = df.select_dtypes(include='object').columns
columns_to_convert = [name for name in categorical_columns if name not in nominal_data]

# Each column is turned into a pandas categorical and then into its integer codes.
df[columns_to_convert] = df[columns_to_convert].apply(
    lambda column: column.astype('category').cat.codes
)



In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# imblearn SMOTE
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

from sklearn.metrics import f1_score

class Model:
    """Random-forest classifier wrapped in a preprocessing pipeline.

    Nominal string columns are imputed with their most frequent value and
    one-hot encoded; numeric columns are median-imputed and standardised.
    Both branches feed a ``RandomForestClassifier``.

    Notes:
        * The numeric feature list is read from the notebook-level ``df`` at
          construction time, so ``df`` must already be preprocessed by then.
        * ``n_splits`` and ``random_state`` are stored but not used by the
          pipeline; the forest keeps its own fixed seed so results stay
          reproducible and unchanged.
    """

    def __init__(self, n_splits=3, random_state=42):
        """Build the (unfitted) preprocessing + classifier pipeline."""
        self.n_splits = n_splits
        self.random_state = random_state
        # Never appended to in this class, so the artifact index used by train() stays 0.
        self.models = []

        # String columns that are one-hot encoded.
        self.categorical_features = ['race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'min_occ_date',
                        'cltdob_fix', 'cltsex_fix']
        # Every numeric column currently present in the notebook-level frame.
        self.numerical_features = df.select_dtypes(include="number").columns

        self.numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="median")),
                   ("scaler", StandardScaler()),
                   ]
        )

        # Single definition of the categorical branch (an earlier, immediately
        # overwritten definition was removed as dead code).
        self.cat_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # Categories unseen during fit are encoded as all zeros instead of raising.
                ("encoder", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        self.preprocessor = ColumnTransformer(
            transformers=[
                ("cat", self.cat_transformer, self.categorical_features),
                ("num", self.numeric_transformer, self.numerical_features),
            ]
        )

        # The final step keeps its historical name "regressor" although it is a
        # classifier; renaming it would break nested parameter names.
        self.model = Pipeline(
            steps=[
                ("preprocessor", self.preprocessor),
                ("regressor", RandomForestClassifier(n_estimators=10, max_depth=10, random_state=2109))
            ]
        )

    def train(self, X, y, max_depth=10, n_estimators=10):
        """Fit the pipeline on ``X``/``y`` and persist it with joblib.

        Args:
            X: Feature frame containing the categorical and numeric feature columns.
            y: Array of labels; it is flattened with ``ravel()`` before fitting.
            max_depth: Maximum tree depth applied to the forest before fitting.
            n_estimators: Number of trees applied to the forest before fitting.

        Side effects:
            Writes ``random_forest_model_fold_<k>.joblib`` to the working
            directory, where ``k`` is ``len(self.models)``.
        """
        # Honour the hyper-parameters passed by the caller; the defaults match the
        # values the pipeline is constructed with, so default calls behave as before.
        self.model.set_params(
            regressor__max_depth=max_depth,
            regressor__n_estimators=n_estimators,
        )
        self.model.fit(X, y.ravel())

        joblib.dump(self.model, f'random_forest_model_fold_{len(self.models)}.joblib')

    def predict(self, test_X):
        """Return the fitted pipeline's class predictions for ``test_X``."""
        return self.model.predict(test_X)


# Fit the pipeline on the full preprocessed frame. Model.train also saves the fitted
# pipeline as 'random_forest_model_fold_0.joblib' (the index is len(model.models), which is 0).
model = Model()

X = df
# Labels as a column vector; Model.train flattens it again with ravel().
y = target.values.reshape(-1, 1)
model.train(X, y)
# NOTE(review): these predictions are made on the same rows the model was fitted on,
# so they do not measure how well the model generalises.
res = model.predict(X)
print(res)

In [None]:
# 5-fold cross-validation: fit on four folds, score on the held-out fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for fold, (train_index, test_index) in enumerate(kfold.split(df)):
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    # Use a fresh, unfitted pipeline for every fold. The underlying sklearn pipeline is
    # fitted directly rather than through Model.train so that the joblib file written by
    # the full-data fit is not overwritten by a model trained on a subset.
    fold_model = Model()
    fold_model.model.fit(X_train, y_train)

    # Model.predict takes the features only; the labels are used for scoring below.
    y_pred = fold_model.predict(X_test)

    # zero_division=0 scores a fold without predicted positives as 0 instead of warning.
    fold_f1 = f1_score(y_test, y_pred, zero_division=0)
    fold_accuracy = accuracy_score(y_test, y_pred)
    fold_scores.append((fold_f1, fold_accuracy))
    print(f"fold {fold}: f1={fold_f1:.4f} accuracy={fold_accuracy:.4f}")

print("mean f1:", sum(f1 for f1, _ in fold_scores) / len(fold_scores))
print("mean accuracy:", sum(acc for _, acc in fold_scores) / len(fold_scores))

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data passed into the function below will have the same column layout as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

    The function accepts a dataframe as input and returns an iterable (list)
    of binary classes as output.

    The function should be coded to test on hidden data
    and should include any preprocessing functions needed for your model to perform.

    All relevant code MUST be included in this function.

    Steps performed:
        1. Copy the input and remove the ``clntnum`` column if present.
        2. Fill numeric gaps with the column median and non-numeric gaps
           with the column's most frequent value.
        3. Replace every object column outside the nominal list with
           integer category codes.
        4. Load the fitted pipeline from ``random_forest_model_fold_0.joblib``
           (must exist in the working directory) and predict.

    Args:
        hidden_data: Frame with the same column layout as the training data.

    Returns:
        A Python list with one predicted class per input row.
    '''
    # Local import so the function does not depend on another cell having run.
    import joblib

    # Columns the pipeline one-hot encodes as strings; this must stay identical to the
    # list used when the model was trained.
    nominal_columns = ['race_desc', 'ctrycode_desc', 'clttype', 'stat_flag', 'min_occ_date',
                       'cltdob_fix', 'cltsex_fix']

    # Work on a copy so the caller's frame is never modified.
    df = hidden_data.copy()

    # No column is dropped based on the hidden data's own missing-value rate: which
    # columns the model needs was decided at training time, and the pipeline's column
    # transformer picks them by name. Dropping a column here because it happens to be
    # sparse in the hidden data would make prediction fail with a missing-column error.
    df = df.drop(columns=["clntnum"], errors="ignore")

    # Fill numerical NaN with median
    numerical_columns = df.select_dtypes(include='number').columns
    if len(numerical_columns) > 0:
        df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].median())

    # Fill categorical NaN with mode (skipped when there is nothing to compute a mode from)
    categorical_columns = df.select_dtypes(exclude='number').columns
    if len(categorical_columns) > 0:
        modes = df[categorical_columns].mode()
        if not modes.empty:
            df[categorical_columns] = df[categorical_columns].fillna(modes.iloc[0])

    # Convert the remaining object columns to integer category codes.
    # NOTE(review): the codes are derived from the values present in `hidden_data`, so
    # they only line up with training when both frames contain the same set of values
    # per column; persisting the training categories would remove this assumption.
    object_columns = df.select_dtypes(include='object').columns
    columns_to_convert = [col for col in object_columns if col not in nominal_columns]
    if columns_to_convert:
        df[columns_to_convert] = df[columns_to_convert].astype('category').apply(lambda x: x.cat.codes)

    # The artifact is produced by this notebook's training cell; only load files you trust,
    # since joblib/pickle can execute code on load.
    model_trained = joblib.load("random_forest_model_fold_0.joblib")

    res = model_trained.predict(df)
    # Return a plain list, as required by the function contract.
    return res.tolist()




##### Cell to check testing_hidden_data function

In [183]:
# This cell should output a list of predictions.
# Reload the training file and remove the label column before passing it to the function.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
x = testing_hidden_data(test_df)
# Count how many rows were assigned to each predicted class.
np.unique(x, return_counts=True)

(array([0., 1.]), array([17987,     5], dtype=int64))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!