In [1]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense,Embedding,Flatten,concatenate,MaxPooling1D,Dropout,Input,Conv1D,BatchNormalization,Reshape,LSTM,LeakyReLU
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [2]:
#readding data
df=pd.read_csv("pre_processed.csv")
df['teacher_prefix'].fillna(value="Mrs.", inplace=True)
y = df['project_is_approved'].values 
x = df.drop(['project_is_approved'], axis=1)


In [3]:
#train-test split
x_train1,x_test,y_train1,y_test=train_test_split(x,y,test_size=0.2,stratify=y)
x_train,x_cv,y_train,y_cv=train_test_split(x_train1,y_train1,test_size=0.2,stratify=y_train1)

In [4]:

#allocating seperate variables for features in the data
xtr_essay= x_train["essay"].values
xtr_state= x_train["school_state"].values
xtr_grade= x_train["project_grade_category"].values
xtr_cat= x_train["project_subject_categories"].values
xtr_subcat= x_train["project_subject_subcategories"].values
xtr_Tprefix= x_train["teacher_prefix"].values
xtr_num= x_train[['teacher_number_of_previously_posted_projects', 'contains_digit', 'price','quantity']].copy().values

xte_essay= x_test["essay"].values
xte_state= x_test["school_state"].values
xte_grade= x_test["project_grade_category"].values
xte_cat= x_test["project_subject_categories"].values
xte_subcat= x_test["project_subject_subcategories"].values
xte_Tprefix= x_test["teacher_prefix"].values
xte_num= x_test[['teacher_number_of_previously_posted_projects', 'contains_digit', 'price','quantity']].copy().values

xcv_essay= x_cv["essay"].values
xcv_state= x_cv["school_state"].values
xcv_grade= x_cv["project_grade_category"].values
xcv_cat= x_cv["project_subject_categories"].values
xcv_subcat= x_cv["project_subject_subcategories"].values
xcv_Tprefix= x_cv["teacher_prefix"].values
xcv_num= x_cv[['teacher_number_of_previously_posted_projects', 'contains_digit', 'price','quantity']].copy().values

In [5]:
#normalising numerical values
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()

xtr_num=normalizer.fit_transform(xtr_num)
xte_num=normalizer.transform(xte_num)
xcv_num=normalizer.transform(xcv_num)

In [6]:
#tokenising categorical variables
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(xtr_state)
xtr_state = tokenizer.texts_to_sequences(xtr_state)
xcv_state = tokenizer.texts_to_sequences(xcv_state)
xte_state = tokenizer.texts_to_sequences(xte_state)

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(xtr_grade)
xtr_grade = tokenizer.texts_to_sequences(xtr_grade)
xcv_grade = tokenizer.texts_to_sequences(xcv_grade)
xte_grade = tokenizer.texts_to_sequences(xte_grade)

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',oov_token = True)
tokenizer.fit_on_texts(xtr_cat)
xtr_cat = tokenizer.texts_to_sequences(xtr_cat)
xcv_cat = tokenizer.texts_to_sequences(xcv_cat)
xte_cat = tokenizer.texts_to_sequences(xte_cat)

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',oov_token = True)
tokenizer.fit_on_texts(xtr_subcat)
xtr_subcat = tokenizer.texts_to_sequences(xtr_subcat)
xcv_subcat = tokenizer.texts_to_sequences(xcv_subcat)
xte_subcat = tokenizer.texts_to_sequences(xte_subcat)

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(xtr_Tprefix)
xtr_Tprefix = tokenizer.texts_to_sequences(xtr_Tprefix)
xcv_Tprefix = tokenizer.texts_to_sequences(xcv_Tprefix)
xte_Tprefix = tokenizer.texts_to_sequences(xte_Tprefix)

In [7]:
#converting to numpy arrays
xtr_state=np.array(xtr_state)
xcv_state=np.array(xcv_state)
xte_state=np.array(xte_state)

xtr_grade=np.array(xtr_grade)
xcv_grade=np.array(xcv_grade)
xte_grade=np.array(xte_grade)

xtr_cat=np.array(xtr_cat)
xcv_cat=np.array(xcv_cat)
xte_cat=np.array(xte_cat)

xtr_subcat=np.array(xtr_subcat)
xcv_subcat=np.array(xcv_subcat)
#xcv_subcat = np.array(list(x for x in xcv_subcat))
xte_subcat=np.array(xte_subcat)
#xte_subcat = np.array(list(x for x in xte_subcat))

xtr_Tprefix=np.array(xtr_Tprefix)
xcv_Tprefix=np.array(xcv_Tprefix)
xte_Tprefix=np.array(xte_Tprefix)



In [8]:
#https://medium.com/@davidheffernan_99410/an-introduction-to-using-categorical-embeddings-ee686ed7e7f9
cat_vars = ["teacher_prefix","school_state","project_grade_category","project_subject_categories","project_subject_subcategories"]
cat_sizes = {}
cat_embsizes = {}
for cat in cat_vars:
    cat_sizes[cat] = x_train[cat].nunique()
    cat_embsizes[cat] = min(50, cat_sizes[cat]//2+1)

In [9]:
#tokenising essay feature
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(xtr_essay)
xtr_essay = tokenizer.texts_to_sequences(xtr_essay)
xcv_essay = tokenizer.texts_to_sequences(xcv_essay)
xte_essay = tokenizer.texts_to_sequences(xte_essay)

xtr_essay = pad_sequences(xtr_essay, maxlen=300, padding='post')
xcv_essay=pad_sequences(xcv_essay, maxlen=300, padding='post')
xte_essay = pad_sequences(xte_essay, maxlen=300, padding='post')

In [10]:
#one hot encoding target variables
y_train=tf.keras.utils.to_categorical(y_train, 2)
y_cv=tf.keras.utils.to_categorical(y_cv, 2)
y_test=tf.keras.utils.to_categorical(y_test, 2) 

In [11]:
#LOADING PRETRAINIED GLOVE MODEL
embeddings_index = dict()
f = open('glove.6B.300d.txt',encoding="utf8")
for line in tqdm_notebook(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Loaded 400000 word vectors.


In [12]:
#CREATING EMBEDDED MATRIX
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tqdm_notebook(tokenizer.word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=47272.0), HTML(value='')))




In [13]:
#BUILDING MODEL ARCHITECTURE
ins = []
concat = []
tf.keras.backend.clear_session()
inp =  Input(shape=(300,))
ins.append(inp)
e=Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=300, trainable=False)(inp)
l=LSTM(150,kernel_initializer='glorot_normal',recurrent_dropout=0.4,activation='relu',return_sequences=True)(e)
f1=Flatten()(l)
concat.append(f1)

In [14]:
for cat in cat_vars:
    x = Input((1,), name=cat)
    ins.append(x)
    x = Embedding(cat_sizes[cat]+2, cat_embsizes[cat], input_length=1)(x)
    x = Flatten()(x)
    concat.append(x)

In [17]:
inp1= Input(shape=(4,),name='numerical')
ins.append(inp1)
d1 = Dense(128, activation='relu',kernel_initializer='glorot_normal')(inp1)
concat.append(d1)

In [18]:
reg=tf.keras.regularizers.l2(0.001)
x = concatenate(concat)
x=BatchNormalization()(x)
x= Dense(256,kernel_initializer='glorot_normal',activation='relu',kernel_regularizer=reg)(x)
x= Dropout(0.6)(x)
x= Dense(128,kernel_initializer='glorot_normal',activation='relu',kernel_regularizer=reg)(x)
x= Dropout(0.5)(x)
x= Dense(64,kernel_initializer='glorot_normal',activation='relu',kernel_regularizer=reg)(x)
x= Dense(32,kernel_initializer='glorot_normal',activation='relu',kernel_regularizer=reg)(x)
out= Dense(2,activation='softmax',kernel_initializer='glorot_normal',name='final')(x)




In [19]:
model = Model(inputs=ins, outputs=out)

In [20]:
#https://stackoverflow.com/a/51734992
from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [23]:
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=[auroc])

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 300, 300)     14181900    input_1[0][0]                    
__________________________________________________________________________________________________
teacher_prefix (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
school_state (InputLayer)       [(None, 1)]          0                                            
______________________________________________________________________________________________

In [25]:
filepath="./model_save/weights-{epoch:02d}.hdf5" 
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,patience=1, min_lr=0.002,verbose = 1)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_auc', verbose=1, mode='max')
log_dir=".\logs\\fit\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True)
model.fit([xtr_essay,xtr_Tprefix,xtr_state,xtr_grade,xtr_cat,xtr_subcat,xtr_num],y_train,epochs=12,         
batch_size=256,verbose=1,
validation_data=([xcv_essay,xcv_Tprefix,xcv_state,xcv_grade,xcv_cat,xcv_subcat,xcv_num],y_cv) ,callbacks=[tensorboard_callback,checkpoint,reduce_lr] )

Train on 69918 samples, validate on 17480 samples
Epoch 1/12
Epoch 00001: saving model to ./model_save/weights-01.hdf5
Epoch 2/12
Epoch 00002: saving model to ./model_save/weights-02.hdf5
Epoch 3/12
Epoch 00003: saving model to ./model_save/weights-03.hdf5
Epoch 4/12
Epoch 00004: saving model to ./model_save/weights-04.hdf5
Epoch 5/12
Epoch 00005: saving model to ./model_save/weights-05.hdf5
Epoch 6/12
Epoch 00006: saving model to ./model_save/weights-06.hdf5
Epoch 7/12
Epoch 00007: saving model to ./model_save/weights-07.hdf5
Epoch 8/12
Epoch 00008: saving model to ./model_save/weights-08.hdf5
Epoch 9/12
Epoch 00009: saving model to ./model_save/weights-09.hdf5
Epoch 10/12
Epoch 00010: saving model to ./model_save/weights-10.hdf5
Epoch 11/12
Epoch 00011: saving model to ./model_save/weights-11.hdf5
Epoch 12/12
Epoch 00012: saving model to ./model_save/weights-12.hdf5


<tensorflow.python.keras.callbacks.History at 0x219047cb808>

In [34]:
#https://www.tensorflow.org/api_docs/python/tf/keras/models/load_model

#loading best model 
c={"auroc":auroc}
model_tst_1=tf.keras.models.load_model("./model_save/weights-04.hdf5", custom_objects=c)

In [35]:
r=model_tst_1.evaluate([xte_essay,xte_Tprefix,xte_state,xte_grade,xte_cat,xte_subcat,xte_num],y_test,batch_size=256,verbose=0)

In [36]:
print("cross entropy of test data=",r[0])
print("auc-roc of test data=",r[1])

cross entropy of test data= 0.622900012183244
auc-roc of test data= 0.7442839


### Observations:

1. The auc-roc for test data is 0.744
2. After around the 5th epoch, model starts to overfit and it continues
3. The weights for thje final layer are between -0.5 to 0.5

![title](pics/model1_plot.png)


![title](pics/model1_hist.png)
