# 0 Installing required packages

##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
#%pip install pandas 
#%pip install pyarrow
#%pip install numpy
#%pip install scikit-learn
#%pip install imbalanced-learn
#%pip install matplotlib
#%pip install seaborn
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

# 1 Preparing data

## 1.1 Reading data

In [8]:
import pandas as pd

# The path MUST stay relative and point into a folder named `data` that contains the parquet file.
filepath = "./data/catB_train.parquet"
df = pd.read_parquet(path=filepath)

## 1.2 Data Cleaning (Null Values)

In [None]:
# Drop columns that are mostly empty, plus the client identifier.
threshold = 0.6  # maximum tolerated share of missing values per column
null_value_proportion = df.isnull().mean()
sparse_columns = null_value_proportion[null_value_proportion > threshold].index

# The target must never be dropped. Filtering it out (instead of `list.remove`) also
# works when the target is NOT among the sparse columns, where `remove` raises ValueError.
null_value_columns = [name for name in sparse_columns if name != "f_purchase_lh"]

additional_columns = ["clntnum"] # unique identifier of client will not affect their insurance purchase decisions and also high variance
columns_to_remove = null_value_columns + additional_columns
df = df.drop(columns=columns_to_remove)

## 1.3 Convert to Numerical Data

In [None]:
# Reduce the clients' date of birth to the birth year; unparseable values are coerced to missing.
dob_as_datetime = pd.to_datetime(df['cltdob_fix'], errors='coerce')
df['cltdob_fix'] = dob_as_datetime.dt.year

# Every column still stored with object dtype is treated as categorical.
object_typed = df.select_dtypes(include=["object"])
categorical_columns = object_typed.columns

In [None]:
# Replace each categorical column by integer codes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# The same encoder object is re-fitted on every column, so only the mapping of the
# last column remains stored on `label_encoder` after the loop.
for column_name in categorical_columns:
    encoded_values = label_encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values

## 1.4 Filling in Missing Data

In [None]:
# No row-wise interpolation is applied here: a bare `df.interpolate()` returns a new
# frame and leaves `df` untouched, so it had no effect on the data. Missing values are
# handled explicitly below instead.

# Target column: a missing label is treated as "no purchase" (0).
df["f_purchase_lh"] = df["f_purchase_lh"].fillna(0)

# Separate the target from the predictors.
target_column = df["f_purchase_lh"]
features = df.drop(columns="f_purchase_lh")

# Fill in remaining null values with the mean of their column.
features = features.fillna(features.mean())

## 1.5 Feature Selection

### 1.5.1 Removing Low-Variance Variables

In [None]:
# A column holding a single distinct value carries no information for the model.
low_var_columns = [
    name for name in features.columns
    if len(features[name].unique()) == 1
]

features = features.drop(columns=low_var_columns)

# Alias for the cleaned feature table.
cleaned_features = features

### 1.5.2 Feature Selection by Decision Tree Importance

In [None]:
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# Rank the features with the importance scores of a single decision tree fitted on all rows.
# random_state is fixed so that the ranking, and therefore the selected top 50, is the
# same on every run; an unseeded tree can break ties differently between runs.
decision_tree_model = DecisionTreeClassifier(random_state=0)
decision_tree_model.fit(features, target_column)

importance_scores = decision_tree_model.feature_importances_

# One row per feature, most important first.
importance_scores_features = pd.DataFrame({'Feature': features.columns, 'Importance': importance_scores})
importance_scores_features = importance_scores_features.sort_values(by='Importance', ascending=False)

top50_features = importance_scores_features.head(50)

plt.figure(figsize=(12, 8))
plt.barh(top50_features['Feature'], top50_features['Importance'], color='skyblue')
plt.xlabel('Importance_scores')
plt.title('Feature Importances')
plt.show()

## 1.6 Sub-conclusion
After cleaning the data and selecting the top 50 features by their importance scores, we conclude that these features will affect customer satisfaction and conversion rates. 

Hence, Singlife should focus on these variables to enhance customer experiences and personalise communication.

Moving forward, we will be investigating the different machine learning models in predicting customer satisfaction and conversion rates. 

# 2 Modelling

## 2.1 Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the 50 selected features, in order of decreasing importance.
features_top50 = features.loc[:, list(top50_features["Feature"])]

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features_top50,
    target_column,
    test_size=0.2,
    random_state=0,
)

## 2.2 KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 2.
knn_clf = KNeighborsClassifier(n_neighbors=2)
knn_clf.fit(X_train, y_train)

knn_train_accuracy = knn_clf.score(X_train, y_train)
knn_val_accuracy = knn_clf.score(X_val, y_val)
print("train accuracy: ", knn_train_accuracy)
print("val accuracy: ", knn_val_accuracy)

### 2.2.1 KNN Model Evaluation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# 5-fold cross-validated macro F1 of the KNN model on the selected features.
knn_f1scores = cross_val_score(knn_clf, features_top50, target_column, cv=5, scoring='f1_macro')
# Only the last expression of a cell is displayed, so the scores are printed explicitly.
print("cv f1_macro scores: ", knn_f1scores)
print("mean cv f1_macro: ", knn_f1scores.mean())

# Per-class precision / recall / F1 on the held-out validation split.
y_val_pred = knn_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

knn_cm = confusion_matrix(y_val, y_val_pred)
knn_disp = ConfusionMatrixDisplay(confusion_matrix=knn_cm,)
knn_disp.plot()

## 2.3 Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the fitted tree, and the accuracies reported below, are identical on
# every run; this matches the seeded split and the seeded MLP used in this notebook.
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)
print("train accuracy: ", dt_clf.score(X_train, y_train))
print("val accuracy: ", dt_clf.score(X_val, y_val))

### 2.3.1 Decision Tree Model Evaluation

In [None]:
# 5-fold cross-validated macro F1 of the decision tree on the selected features.
dt_f1scores = cross_val_score(dt_clf, features_top50, target_column, cv=5, scoring='f1_macro')
# Only the last expression of a cell is displayed, so the scores are printed explicitly.
print("cv f1_macro scores: ", dt_f1scores)
print("mean cv f1_macro: ", dt_f1scores.mean())

# Per-class precision / recall / F1 on the held-out validation split.
y_val_pred = dt_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

dt_cm = confusion_matrix(y_val, y_val_pred)
dt_disp = ConfusionMatrixDisplay(confusion_matrix=dt_cm,)
dt_disp.plot()

## 2.4 Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the bootstrap samples and feature draws, and therefore the reported
# accuracies, are identical on every run.
rndforest_clf = RandomForestClassifier(random_state=0)
rndforest_clf.fit(X_train, y_train)
print("train accuracy: ", rndforest_clf.score(X_train, y_train))
print("val accuracy: ", rndforest_clf.score(X_val, y_val))

### 2.4.1 Random Forest Model Evaluation

In [None]:
# 5-fold cross-validated macro F1 of the random forest on the selected features.
rndforest_f1scores = cross_val_score(rndforest_clf, features_top50, target_column, cv=5, scoring='f1_macro')
# Only the last expression of a cell is displayed, so the scores are printed explicitly.
print("cv f1_macro scores: ", rndforest_f1scores)
print("mean cv f1_macro: ", rndforest_f1scores.mean())

# Per-class precision / recall / F1 on the held-out validation split.
y_val_pred = rndforest_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

rndforest_cm = confusion_matrix(y_val, y_val_pred)
rndforest_disp = ConfusionMatrixDisplay(confusion_matrix=rndforest_cm,)
rndforest_disp.plot()

## 2.5 Dense Network Model 

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with a single hidden layer of 100 units, seeded for reproducibility.
dn_clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=0,
)
dn_clf.fit(X_train, y_train)

dn_train_accuracy = dn_clf.score(X_train, y_train)
dn_val_accuracy = dn_clf.score(X_val, y_val)
print("train accuracy: ", dn_train_accuracy)
print("val accuracy: ", dn_val_accuracy)

### 2.5.1 Dense Network Model Evaluation 

In [None]:
# 5-fold cross-validated macro F1 of the dense network on the selected features.
dn_f1scores = cross_val_score(dn_clf, features_top50, target_column, cv=5, scoring='f1_macro')
# Only the last expression of a cell is displayed, so the scores are printed explicitly.
print("cv f1_macro scores: ", dn_f1scores)
print("mean cv f1_macro: ", dn_f1scores.mean())

# Per-class precision / recall / F1 on the held-out validation split.
y_val_pred = dn_clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

dn_cm = confusion_matrix(y_val, y_val_pred)
dn_disp = ConfusionMatrixDisplay(confusion_matrix=dn_cm,)
dn_disp.plot()

# 3 Conclusion & Final Model

We found the Decision tree model to perform the best with the highest F1 score. Hence, it will be our final model.

## 3.1 train_model Function
This function takes in a data set and trains the Decision tree model on that data set. 
We will call this function in the `testing_hidden_data` function. 

In [None]:
def train_model(features_training_set, target_training_set):
    """Fit the final decision tree model on the given training data.

    Parameters
    ----------
    features_training_set : feature table to train on (one row per sample).
    target_training_set : labels, one per row of ``features_training_set``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    # Fixed seed: the same training data always yields the same tree, so the
    # predictions of the final model are reproducible between calls.
    dt_model = DecisionTreeClassifier(random_state=0)

    dt_model.fit(features_training_set, target_training_set)

    return dt_model

## 3.2 load_model Function
This function takes in a data set and makes predictions using the Decision tree model from 2.3, which was trained on the data sent to us.

This input data set must be cleaned first (i.e. removing sparse columns, converting to numerical data, filling in NAs) and the top 50 features must be selected. All of this follows the same process as sections 1.1 to 1.5.

In [None]:
def load_model(features_dataset):
    """Predict with the decision tree fitted in section 2.3 (``dt_clf``).

    Parameters
    ----------
    features_dataset : already-preprocessed feature table holding the same
        columns the model was trained on.

    Returns
    -------
    list
        One predicted class per row of ``features_dataset``.
    """
    # `.tolist()` yields a flat list of predictions; wrapping the array in `[...]`
    # would instead give a one-element list whose only item is the whole array.
    return dt_clf.predict(features_dataset).tolist()

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [14]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.

Returns exactly one prediction per row of `hidden_data`, in the same row order.

Notebook state this function relies on:
  * `features_top50` - the training features selected in section 2.1
  * `target_column`  - the training labels, row-aligned with `features_top50`
  * `train_model`    - defined in section 3.1
'''
    from sklearn.preprocessing import LabelEncoder

    # Columns the model is trained on, in training order. The hidden data is mapped
    # onto exactly these columns instead of re-deriving a (possibly different)
    # feature set from the hidden data itself.
    model_columns = list(features_top50.columns)

    # `reindex` returns a new frame, so the caller's dataframe is not modified.
    # Columns that are not model inputs (identifier, target, dropped columns) are
    # discarded; a training column absent from hidden_data is created as all-NaN
    # and imputed below.
    prepared = hidden_data.reindex(columns=model_columns)

    if prepared.empty:
        return []

    # Convert clients' DOB to the birth year, as done for the training data.
    if "cltdob_fix" in prepared.columns:
        prepared["cltdob_fix"] = pd.to_datetime(prepared["cltdob_fix"], errors="coerce").dt.year

    # Convert the remaining object-typed (categorical) columns to integer codes.
    # NOTE(review): the encoders fitted on the training data are not kept, so each
    # column is re-fitted here; the codes match the training codes only when the
    # hidden data contains the same set of categories - confirm or persist encoders.
    categorical_columns = prepared.select_dtypes(include=["object"]).columns
    for col in categorical_columns:
        prepared[col] = LabelEncoder().fit_transform(prepared[col])

    # Fill missing values with the column mean; a column with no values at all has
    # no mean, so whatever is still missing afterwards is set to 0.
    prepared = prepared.fillna(prepared.mean())
    prepared = prepared.fillna(0)

    # Train on the training data (never on the hidden rows) and predict EVERY hidden
    # row; no train/validation split is applied to the hidden data.
    model = train_model(features_top50, target_column)
    predictions = model.predict(prepared)

    # Flat list of plain ints (0 / 1) rather than a list wrapping a numpy array.
    result = [int(label) for label in predictions]

    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
raw_test_df = pd.read_parquet(filepath)

# Labels with missing values set to 0; the label column is then removed from the
# frame that is handed to the function.
target_column = raw_test_df["f_purchase_lh"].fillna(0)
test_df = raw_test_df.drop(columns=["f_purchase_lh"])

hidden_predictions = testing_hidden_data(test_df)
print(hidden_predictions)

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!