In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np


In [2]:
# Load the dataset from a bz2-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df= pd.read_pickle("train_test_cat.pkl.bz2",compression="bz2")

In [3]:
# Preview the first five rows, transposed so each column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
emp_title_cat,9,9,9,9,9
home_ownership_cat,4,4,4,4,4
loan_status_cat,2,2,2,2,2
job_cat,3,3,3,3,3
purpose_cat,1,1,1,1,1
application_type_cat,0,0,0,0,0
marital_cat,1,1,1,1,1
education_cat,0,0,0,0,0
mf_preference_cat,0,0,0,0,0
scheme_type_cat,38,34,38,36,34


In [4]:
# Drop the columns that are not used further, modifying `df` in place.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
unused_columns = [
    "mf_id",
    "scheme_cat_cat",
    "risk_profile",
    "scheme_sub_cat_cat",
    "Risk",
]
df.drop(columns=unused_columns, inplace=True)

In [5]:
# Replace the current index with a fresh 0..n-1 index, discarding the old one.
df = df.reset_index(drop=True)

In [6]:
# Convert +/-inf to NaN explicitly so the mean / fillna steps below treat them as missing.
# This replaces the global `use_inf_as_na` option, which is deprecated since pandas 2.1
# and changes NA semantics for every pandas object in the kernel, not just `df`.
df = df.replace([np.inf, -np.inf], np.nan)

In [7]:
# Mean of emp_length_val (missing entries are skipped by default); used as the fill value in the next cell.
mean=df["emp_length_val"].mean()

In [8]:
# Fill missing emp_length_val entries with the column mean.
# The result is assigned back rather than using fillna(inplace=True) on the `df[...]`
# selection: the in-place form acts on an intermediate object, triggers a
# chained-assignment warning on pandas 2.x, and leaves `df` unchanged under copy-on-write.
df["emp_length_val"] = df["emp_length_val"].fillna(mean)

In [9]:
# Preview the first five rows again (transposed) after the column drop and fill steps.
df.head().T

Unnamed: 0,0,1,2,3,4
emp_title_cat,9,9,9,9,9
home_ownership_cat,4,4,4,4,4
loan_status_cat,2,2,2,2,2
job_cat,3,3,3,3,3
purpose_cat,1,1,1,1,1
application_type_cat,0,0,0,0,0
marital_cat,1,1,1,1,1
education_cat,0,0,0,0,0
mf_preference_cat,0,0,0,0,0
scheme_type_cat,38,34,38,36,34


In [10]:
# Number of distinct target labels; one_hot() later uses it as the number of rows of its output.
# NOTE(review): this is counted on the full frame, before the 30% sample taken in the next cell,
# so it may exceed the number of labels actually present after sampling — confirm this is intended.
scheme_names=df["scheme_name"].nunique()
scheme_names

13882

In [11]:
# Continue with a 30% random subset of the rows. The fixed seed (same value as the
# train/test split below) makes the subset, and everything derived from it,
# reproducible on a fresh kernel run.
df=df.sample(frac=0.3, random_state=100)

In [12]:
# Separate the target column from the feature columns.
y = df["scheme_name"]
X = df.drop(columns=["scheme_name"])

In [13]:
# Re-derive the target as integer category codes (this replaces the raw labels assigned to `y` above).
# NOTE: pandas uses code -1 for missing values.
y = df["scheme_name"].astype("category").cat.codes

In [14]:
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the scaler is created here but never fitted or applied in the visible cells,
# so the features reach the network unscaled — confirm whether scaling was intended.
scaler=MinMaxScaler()

In [15]:
# Train / test split: 70% of the rows for training, 30% held out; the seed keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [16]:
# Record the shapes of the training features and the training target.
input_shape=X_train.shape

output_shape=y_train.shape

In [17]:
# Display the training feature shape.
input_shape

(79180, 26)

In [18]:
# Display the training target shape.
output_shape

(79180,)

In [19]:
def one_hot(j, num_classes=None):
    """Convert integer class labels into a one-hot matrix.

    Parameters
    ----------
    j : array-like of int, shape (m,)
        Class index of each of the ``m`` samples (for example a pandas Series
        of category codes). Only the values are used; any index is ignored.
    num_classes : int, optional
        Number of rows (classes) in the result. When omitted, the notebook-level
        ``scheme_names`` count is used.

    Returns
    -------
    numpy.ndarray, shape (num_classes, m), dtype float64
        Column ``i`` holds 1.0 in row ``j[i]`` and 0.0 everywhere else.
        Transpose the result to obtain a (samples, classes) layout.

    Raises
    ------
    IndexError
        If any label lies outside ``[0, num_classes)``. Negative labels (such as
        the -1 code pandas uses for missing categories) are rejected instead of
        silently wrapping around to the last class.
    """
    if num_classes is None:
        num_classes = scheme_names

    labels = np.asarray(j).ravel()
    n = labels.shape[0]
    encoded = np.zeros((num_classes, n))
    if n == 0:
        # Nothing to mark; also avoids min()/max() on an empty array.
        return encoded

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            "class labels must lie in [0, %d); got range [%s, %s]"
            % (num_classes, labels.min(), labels.max())
        )

    # Set every (label, sample) position in one vectorised assignment
    # instead of a per-element Python loop.
    encoded[labels, np.arange(n)] = 1.0
    return encoded

In [20]:
# One-hot encode the training labels; the result has one row per class and one column per sample.
y_train_enc=one_hot(y_train)
y_train_enc

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# Display the encoded target shape: (classes, samples).
y_train_enc.shape

(13882, 79180)

In [22]:
# Flip the encoded target to (samples, classes) so its rows line up with the rows of X_train.
y_train = np.transpose(y_train_enc)

In [23]:
# Display the training feature shape for comparison with the target.
X_train.shape

(79180, 26)

In [24]:
# Width of the one-hot target (its second dimension); used as the size of the final softmax layer.
output=y_train.shape[1]
output

13882

In [25]:
# !pip install -q keras

In [26]:
from tensorflow import keras 

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation ,Flatten
from tensorflow.keras import regularizers

In [28]:
# Create the model: a funnel of fully connected ReLU layers. Each of the first five
# is followed by Dropout and BatchNormalization; a final 32-unit layer feeds a
# softmax with one unit per class.
HIDDEN_UNITS = (1024, 512, 256, 128, 64)  # widths of the dropout + batch-norm hidden layers
DROPOUT_RATE = 0.3

nn_model = Sequential()
for layer_idx, units in enumerate(HIDDEN_UNITS):
    if layer_idx == 0:
        # Take the input width from the training data rather than hard-coding the
        # column count, so the model keeps matching X_train if columns change.
        nn_model.add(Dense(units, input_dim=X_train.shape[1], activation='relu'))
    else:
        nn_model.add(Dense(units, activation='relu'))
    nn_model.add(Dropout(rate=DROPOUT_RATE))
    nn_model.add(BatchNormalization())
nn_model.add(Dense(32, activation='relu'))  # last hidden layer: no dropout / batch norm
nn_model.add(Dense(output, activation='softmax'))  # one probability per class

[2021-10-11 17:03:47.291 tensorflow-2-3-gpu--ml-m5-24xlarge-8bbbe4ee58238535951f6abee41b:95 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-10-11 17:03:47.315 tensorflow-2-3-gpu--ml-m5-24xlarge-8bbbe4ee58238535951f6abee41b:95 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


In [29]:
# Print the layer-by-layer architecture and parameter counts.
nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              27648     
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)               2048      
_________________________________________________________________
dense_2 (Dense)              (None, 256)               1

In [30]:
# !pip install tensorflow-gpu --upgrade

In [31]:
# Metrics tracked during training: categorical accuracy, top-5 categorical accuracy and AUC.
# The `name` values are the labels under which Keras reports each metric.
METRICS = [  
      keras.metrics.CategoricalAccuracy(name='cat_acc'),
      keras.metrics.TopKCategoricalAccuracy(k=5,name='top_k_acc'),
      keras.metrics.AUC(name='auc'),
]

In [32]:
# Configure training: categorical cross-entropy matches the one-hot targets and softmax output.
# METRICS is already a list, so it is passed directly; wrapping it in another list is the
# per-output form intended for multi-output models.
nn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)

In [None]:
# Train for 500 epochs in mini-batches of 100 samples.
# NOTE(review): no validation data or early-stopping callback is passed, and X_test / y_test
# are not used in the visible cells — confirm how the model is meant to be evaluated.
nn_model.fit(X_train, y_train, epochs=500, batch_size=100)

Epoch 1/500