In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tensorflow as tf

In [None]:
# Use pd.read_csv() to read the CSV file into a DataFrame.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_data = train_data.set_index('Id')
test_data = test_data.set_index('Id')

In [None]:
# Visualize the top elements of features
train_data.head()

In [None]:
nulls = train_data.isnull().sum()
train_data.keys()


# 1: EDA

In [None]:
# Identify the datatype we are dealing with
train_data.dtypes

In [None]:
# Visualise the distribution of applicants' risk ratings in the full dataset.
sns.histplot(train_data['Response'], bins=sorted(train_data['Response'].unique()))
plt.xlabel('Response')
plt.ylabel('# of Applicants')
plt.title('Response Distribution')

print(train_data['Response'].unique())

| Variable             | Description                                                                                      |
|----------------------|--------------------------------------------------------------------------------------------------|
| Id                   | A unique identifier associated with an application.                                               |
| Product_Info_1-7     | A set of normalized variables relating to the product applied for.                                |
| Ins_Age              | Normalized age of the applicant.                                                                 |
| Ht                   | Normalized height of the applicant.                                                              |
| Wt                   | Normalized weight of the applicant.                                                              |
| BMI                  | Normalized BMI of the applicant.                                                                 |
| Employment_Info_1-6  | A set of normalized variables relating to the employment history of the applicant.               |
| InsuredInfo_1-6      | A set of normalized variables providing information about the applicant.                            |
| Insurance_History_1-9| A set of normalized variables relating to the insurance history of the applicant.                 |
| Family_Hist_1-5      | A set of normalized variables relating to the family history of the applicant.                    |
| Medical_History_1-41 | A set of normalized variables relating to the medical history of the applicant.                   |
| Medical_Keyword_1-48 | A set of dummy variables relating to the presence/absence of a medical keyword associated with the application. |
| Response             | This is the target variable, an ordinal variable relating to the final decision associated with an application. |


In [None]:
# Select only the columns with floating-point data (float64)
float_columns = train_data.select_dtypes(include=['float64'])

# Create a scatterplot matrix using Seaborn
sns.set(style="ticks")

# Calculate the correlation matrix
correlation_matrix = train_data[float_columns.columns].corr()
mask = np.zeros_like(correlation_matrix)

# Create a heatmap of the correlation matrix
plt.figure(figsize=(8, 6))  # Set the figure size
sns.heatmap(correlation_matrix, annot=False, cmap='RdBu_r', fmt=".2f", square=True)

# sns.pairplot(X_train[float_columns.columns], kind="scatter", diag_kind="kde")

# Show the plot
plt.show()

## Split into train, test, valid datasets

In [None]:
# Assign the features to their own dataframe.
X = train_data.drop(['Response'], axis=1)

# Assign the target variable to its own dataframe.
y = train_data.Response

# Perform a train-test split to obtain the training, validation and test data as separate dataframes.
from sklearn.model_selection import train_test_split

# Split out test/holdout set from full dataset.
# We will set the size of the X/y test datasets to be 20% of the original (full) X/y datasets, via the train_size/test_size parameters.
X_rem, X_test, y_rem, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0, stratify=y)

# Split remaining portion into training/validation sets.
# We will set the size of the X/y train datasets to be 60% of the original (full) X/y datasets, via the train_size/test_size parameters.
X_train, X_valid, y_train, y_valid = train_test_split(X_rem, y_rem, train_size=0.75, test_size=0.25, random_state=0, stratify=y_rem)

## Handle missing values 

In [None]:
# Determine which columns contain nulls/missing values.
X_train_cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Summarise how many missing values are present in each column.
X_train[X_train_cols_with_missing].isna().sum()

In [None]:
# Calculate the proportion of zeroes relative to non-zero values.
for col in X_train_cols_with_missing:
    sum = X_train[col].isna().sum()
    length = len(X_train[col].index)
    ratio = sum/length
    print('Proportion of zeroes in', col, 'is: ', round(ratio*100,2), '%.')

In [None]:
# These columns have been selected as they contain a high proportion of blanks/missing values (deemed here as >40%) in the TRAINING dataset.
cols_to_delete_due_to_missing_data = [ 'Family_Hist_3', 'Family_Hist_5',
                                      'Medical_History_10', 'Medical_History_15', 'Medical_History_24', 'Medical_History_32']

# Delete columns from ALL datasets where the proportion of zeroes in the TRAINING dataset exceeds a stipulated threshold.
X_train = X_train.drop(cols_to_delete_due_to_missing_data, axis=1)
X_valid = X_valid.drop(cols_to_delete_due_to_missing_data, axis=1)
X_test = X_test.drop(cols_to_delete_due_to_missing_data, axis=1)

In [None]:
# Impute missing values
cols_to_fill = ['Employment_Info_1','Employment_Info_4','Employment_Info_6','Insurance_History_5',
                'Family_Hist_2', 'Family_Hist_4','Medical_History_1']
X_train_copy = X_train.copy()
X_valid_copy = X_valid.copy()
X_test_copy = X_test.copy()

# Fill the null elements with median values
X_train_copy[cols_to_fill].fillna(X_train_copy[cols_to_fill].median(), inplace=True)
X_valid_copy[cols_to_fill].fillna(X_valid_copy[cols_to_fill].median(), inplace=True)
X_test_copy[cols_to_fill].fillna(X_test_copy[cols_to_fill].median(), inplace=True)


X_train_copy

In [None]:
# Plot the distributions kde pre and post filling values
sns.pairplot(X_train_copy[cols_to_fill], diag_kind="kde", diag_kws={'lw':2})

# sns.pairplot(X_train_copy[cols_to_fill], kind="scatter", diag_kind="kde")

plt.show()

In [None]:
sns.pairplot(X_train[cols_to_fill], kind="scatter", diag_kind="kde")

plt.show()

# Handle categorical columns with one-hot encoding

In [99]:
# String columns
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

one_hot_encoded_train = encoder.fit_transform(X_train_copy[['Product_Info_2']])
# Create a new DataFrame with one-hot encoded data
one_hot_train = pd.DataFrame(one_hot_encoded_train, columns=encoder.get_feature_names_out(['Product_Info_2']))
one_hot_encoded_test = encoder.fit_transform(X_test_copy[['Product_Info_2']])
# Create a new DataFrame with one-hot encoded data
one_hot_test = pd.DataFrame(one_hot_encoded_test, columns=encoder.get_feature_names_out(['Product_Info_2']))
one_hot_encoded_valid = encoder.fit_transform(X_valid_copy[['Product_Info_2']])
# Create a new DataFrame with one-hot encoded data
one_hot_valid = pd.DataFrame(one_hot_encoded_valid, columns=encoder.get_feature_names_out(['Product_Info_2']))

# Reset indexes
X_train_copy = X_train_copy.reset_index(drop=True)
one_hot_train = one_hot_train.reset_index(drop=True)
X_test_copy = X_test_copy.reset_index(drop=True)
one_hot_test = one_hot_test.reset_index(drop=True)
X_valid_copy = X_valid_copy.reset_index(drop=True)
one_hot_valid = one_hot_valid.reset_index(drop=True)


# Concatenate the new DataFrame with the original D"ataFrame
X_train_enc = pd.concat([X_train_copy, one_hot_train], axis=1)
X_train_enc = X_train_enc.drop('Product_Info_2',axis=1)
X_test_enc = pd.concat([X_test_copy, one_hot_test], axis=1)
X_test_enc = X_test_enc.drop('Product_Info_2',axis=1)
X_valid_enc = pd.concat([X_valid_copy, one_hot_valid], axis=1)
X_valid_enc = X_valid_enc.drop('Product_Info_2',axis=1)


print(X_train_enc)

       Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0                   1              26        0.076923               2   
1                   1              26        0.076923               2   
2                   1              26        0.076923               2   
3                   1              26        0.128205               2   
4                   1              26        0.128205               2   
...               ...             ...             ...             ...   
35623               1              26        0.487179               2   
35624               1              26        0.230769               2   
35625               1              26        0.230769               2   
35626               1              26        0.025641               2   
35627               1              26        0.076923               2   

       Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt       BMI  \
0                   3               1  0.4

### Rescale data
- We rescale the floating values in the data between -1 to 1.

In [100]:
# Rescale variables
from sklearn.preprocessing import MinMaxScaler  #rescale data
scaler = MinMaxScaler()
# Fit
scaler.fit(X_train_enc)
# Replace the original float columns with the scaled values
X_train_scale = pd.DataFrame(scaler.transform(X_train_enc),
                             index=X_train_enc.index,
                             columns=X_train_enc.columns)

X_valid_scale = pd.DataFrame(scaler.transform(X_valid_enc),
                             index=X_valid_enc.index,
                             columns=X_valid_enc.columns)

X_test_scale = pd.DataFrame(scaler.transform(X_test_enc),
                             index=X_test_enc.index,
                             columns=X_test_enc.columns)
X_train_scale

Unnamed: 0,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,...,Product_Info_2_B2,Product_Info_2_C1,Product_Info_2_C2,Product_Info_2_C3,Product_Info_2_C4,Product_Info_2_D1,Product_Info_2_D2,Product_Info_2_D3,Product_Info_2_D4,Product_Info_2_E1
0,0.0,0.675676,0.076923,0.0,1.0,0.0,0.417910,0.66,0.222222,0.413117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.675676,0.076923,0.0,0.0,0.0,0.462687,0.58,0.188889,0.433988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.675676,0.076923,0.0,1.0,0.0,0.432836,0.78,0.348889,0.496556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.675676,0.128205,0.0,1.0,0.0,0.626866,0.56,0.137778,0.350784,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.675676,0.128205,0.0,1.0,1.0,0.656716,0.76,0.313333,0.463106,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35623,0.0,0.675676,0.487179,0.0,0.0,0.0,0.208955,0.74,0.244444,0.374932,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35624,0.0,0.675676,0.230769,0.0,1.0,0.0,0.597015,0.80,0.422222,0.582750,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
35625,0.0,0.675676,0.230769,0.0,1.0,0.0,0.298507,0.58,0.111111,0.278322,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
35626,0.0,0.675676,0.025641,0.0,1.0,0.0,0.716418,0.62,0.204444,0.421464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 2: Create classifier model

In [None]:
# Create input data for model
train_data = train_data.fillna(0)
data = train_data.drop(['Id','Response','Product_Info_2'],axis=1)
target = train_data['Response']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size = 0.33
)

label_mapping = {8: 7}  # Map label 8 to 7
y_train= [label_mapping.get(label, label) for label in y_train]
y_test = [label_mapping.get(label, label) for label in y_test]


N,D = X_train.shape


In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(D,)),  # Specify input dimension
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')  # 8 output units for 8 classes
])

In [None]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


In [None]:
target.dtype