##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [1]:
#%pip install pandas 
#%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [2]:
# Can have as many cells as you want for code
import pandas as pd
import numpy as np
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [3]:
###...code...###
df = pd.read_parquet(filepath)
#print(df.head())


#print(len(df['clntnum'].unique().tolist()))
# Number of unique client numbers is the same as number of rows in df, ie every row corresponds to a unique person's data, so no grouping required

df.isna().sum()

for c in df.columns:
    if len(df[c].unique()) == 1:
        #print(c, df[c].unique())
        df.drop(c, axis = 1, inplace = True)

df.drop('clntnum', axis = 1, inplace = True)

# As shown, race_desc, ctrycode_desc have some NA values. Since they are relatively small, we will try and impute those for race_desc, as we
# make the assumption that one's race, and thus culture, background, wil impact the way they view their needs and thus whether they need
# insurance.

# Since non-empty entries for ctrycode_desc take only one value, ie Singapore, we make the simplifying assumption that all persons in this
# database are in Singapore, which is not unrealistic since there is no reason to purchase insurance in Singapore if one is not in Singapore.

# Multiple flags for certain GI claims have no entries. As such, these columns will be dropped due to having no or negative impact 
# on further analysis. However, due to the intepretation of these columns, it is perhaps recommended to review the data collection process 
# for these claims, or the specifics of the lack of these claims by relevant experts, etc. to determine if there may be some other conclusion
# to be drawn.

# Similarly, many columns corresponding to specific insurance products, notably if they have ever been bought and the GI claim success rate, are
# either empty or 0. Intepretation is limited due to minimal knowledge of company processes, but the data appears to suggest some policies are
# redundant to the people or something else that may be of interest. These columns will be removed nonetheless.
        
# Many columns corresponding to APE, SUMIN, PREMPAID are also all 0. These correspond to monetary transcations between the company and clients.
# As the goal here is to predict customer satisfaction and changeover rates, the main value provided by these columns are potential correlation
# between prices and customer willingness to pay said price which relates to satisfaction and changeover. However, these are already captured 
# as part of the various flg_ ... _claim columns as we assume clients always consider these when deciding to buy the insurance. Thus they are
# redundant and all columns of form ape_, sumin_ and prempaid_ will be removed.

# We also remove clntnum since it is effectively an index and has no real impact.


df['f_purchase_lh'].fillna(value = 0, inplace = True)
#print(df.info())


In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
for c in df.isna():
    if df[c].isna().sum()/df.shape[0] > 0.4:
        df.drop(c, axis = 1, inplace = True)

y = df['f_purchase_lh']

x_numeric = df.select_dtypes(include = ["int64", "float64"]).columns
df[x_numeric] = df[x_numeric].apply(lambda x: x.fillna(x.median()))
df.drop('f_purchase_lh', axis = 1, inplace = True)

x_non_numeric = df.select_dtypes(exclude = ["int64", "float64"]).columns
#onc = OrdinalEncoder(handle_unknown = "use_encoded_value", unknown_value = np.nan)
#onc.fit(df[x_non_numeric])
#trans = pd.DataFrame(pd.DataFrame(onc.transform(df[x_non_numeric])), copy)
#imputer = KNNImputer(n_neighbors = 3)
#trans = pd.DataFrame(imputer.fit_transform(pd.DataFrame(trans)))
#df2 = pd.concat([df, trans], axis = 1)

#df['f_purchase_lh'].fillna(value = 0, inplace = True)
df.drop(columns = x_non_numeric, inplace = True)


In [5]:
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=0)

print('Before:', Counter(y_train))
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print('After:', Counter(y_train))

Before: Counter({0.0: 13809, 1.0: 584})
After: Counter({1.0: 13809, 0.0: 13809})


In [40]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=0)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units = 50, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 40, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 30, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 20, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

model.fit(X_train, y_train, epochs = 30, batch_size = 32, verbose = 1)

<class 'pandas.core.frame.DataFrame'>
Index: 17992 entries, 19550 to 15795
Data columns (total 60 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   flg_substandard                    17992 non-null  float64
 1   flg_is_borderline_standard         17992 non-null  float64
 2   flg_is_revised_term                17992 non-null  float64
 3   flg_is_rental_flat                 17992 non-null  float64
 4   flg_has_health_claim               17992 non-null  float64
 5   flg_has_life_claim                 17992 non-null  float64
 6   flg_gi_claim                       17992 non-null  float64
 7   flg_is_proposal                    17992 non-null  float64
 8   flg_with_preauthorisation          17992 non-null  float64
 9   flg_is_returned_mail               17992 non-null  float64
 10  is_consent_to_mail                 17992 non-null  float64
 11  is_consent_to_email                17992 non-null  floa

<keras.src.callbacks.History at 0x1305e1da310>

In [42]:
_, accuracy = model.evaluate(X_val, y_val)
print(accuracy)
model.summary()

0.9602667689323425
Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_82 (Dense)            (None, 50)                3050      
                                                                 
 dense_83 (Dense)            (None, 40)                2040      
                                                                 
 dense_84 (Dense)            (None, 30)                1230      
                                                                 
 dense_85 (Dense)            (None, 20)                620       
                                                                 
 dense_86 (Dense)            (None, 1)                 21        
                                                                 
Total params: 6961 (27.19 KB)
Trainable params: 6961 (27.19 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [49]:

def load_model():
    new_model = tf.keras.models.load_model('my_model.keras')
    return new_model

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [46]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    list = ['flg_substandard', 'flg_is_borderline_standard', 'flg_is_revised_term',
       'flg_is_rental_flat', 'flg_has_health_claim', 'flg_has_life_claim',
       'flg_gi_claim', 'flg_is_proposal', 'flg_with_preauthorisation',
       'flg_is_returned_mail', 'is_consent_to_mail', 'is_consent_to_email',
       'is_consent_to_call', 'is_consent_to_sms', 'is_valid_dm',
       'is_valid_email', 'is_housewife_retiree', 'is_sg_pr', 'is_class_1_2',
       'is_dependent_in_at_least_1_policy', 'hh_size',
       'n_months_last_bought_products', 'flg_latest_being_lapse',
       'flg_latest_being_cancel', 'tot_inforce_pols', 'f_hold_839f8a',
       'f_hold_e22a6a', 'f_hold_c4bda5', 'f_hold_ltc', 'f_hold_507c37',
       'f_ever_bought_839f8a', 'f_ever_bought_e22a6a', 'f_ever_bought_c4bda5',
       'f_ever_bought_ltc', 'f_ever_bought_507c37', 'f_ever_bought_gi',
       'f_ever_bought_grp_6fc3e6', 'f_ever_bought_grp_de05ae',
       'f_ever_bought_grp_945b5a', 'f_ever_bought_grp_6a5788',
       'f_ever_bought_ltc_43b9d5', 'f_ever_bought_grp_9cdedf',
       'f_ever_bought_grp_1581d7', 'f_ever_bought_grp_22decf',
       'f_ever_bought_lh_507c37', 'f_ever_bought_lh_839f8a',
       'f_ever_bought_inv_e9f316', 'f_ever_bought_grp_caa6ff',
       'f_ever_bought_grp_fd3bfb', 'f_ever_bought_lh_e22a6a',
       'f_ever_bought_grp_70e1dd', 'f_ever_bought_grp_e04c3a',
       'f_ever_bought_grp_fe5fb8', 'f_ever_bought_grp_94baec',
       'f_ever_bought_grp_e91421', 'f_ever_bought_lh_f852af',
       'f_ever_bought_lh_947b15', 'f_elx', 'f_mindef_mha', 'f_retail']
    x = hidden_data[list]
    result = model.predict(x)
    return result

##### Cell to check testing_hidden_data function

In [47]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

[[0.08646835]
 [0.04748528]
 [0.04005768]
 ...
 [0.03947005]
 [0.02105278]
 [0.05515887]]


### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!