In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.layers import LSTM,Input,Conv1D, SpatialDropout1D,Dense, BatchNormalization, Activation, MaxPooling1D, GlobalAveragePooling1D, Add,Flatten,Dropout
from tensorflow.keras import Model
from tensorflow.keras.callbacks import LearningRateScheduler,EarlyStopping
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

#### Load data

In [55]:
train_path = "../train.csv"
test_path = "../test.csv"
augment_path = "../Q6_output/Q6_generated.csv"
glove_path = '../glove-global-vectors-for-word-representation/glove.6B.100d.txt'

In [3]:
train = pd.read_csv(train_path) #load train

In [4]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:]).astype(float)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

glove = load_glove_model(glove_path) #glove dictionary

Loading Glove Model
400000 words loaded!


#### Create Word index dictionary

In [5]:
words = list(glove.keys())
word_dic = dict(zip(words,range(len(words))))
# reverse the glove
glove = dict(zip(range(len(words)),list(glove.values())))

#### Split data first

In [6]:
X = train.iloc[:,3]
y = train.iloc[:,4]

#### Clean text and represent using matrix

In [7]:
def map_text(text):
    """
    clean punctuation and map text
    """
    text = text.lower()
    text = "".join(l for l in text if l not in string.punctuation)
    new_text = []
    for i in text.split():
        try:
            new_text.append(word_dic[i])
        except:
            continue
    return new_text

def map_index_to_100d(text):
    ans = []
    for t in text:
        if t == -1:
            tmp = np.zeros((1,100))
        else:
            tmp = glove[t].reshape(1,100)
        if ans == []:
            ans = tmp
        else:
            ans = np.append(ans,tmp,axis=0)
    return ans

# map text to vector
X = X.apply(map_text)

#pad sequence
X = pad_sequences(X,padding='post', maxlen=100,value = -1)

new_X = []
for i in range(X.shape[0]):
    new_X.append(map_index_to_100d(X[i]))
X = np.array(new_X)
print(X.shape)
# train['text'] = train['text'].map(map_index_to_100d)

  if ans == []:


(7613, 100, 100)


#### Split data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6090, 100, 100) (1523, 100, 100) (6090,) (1523,)


In [9]:
### Here we append the augmented text to trainning data and create another pair of training
x = pd.read_csv(augment_path)['text']
x = x.apply(map_text)
x = pad_sequences(x,padding='post', maxlen=100,value = -1)
new_x = []
for i in range(x.shape[0]):
    new_x.append(map_index_to_100d(x[i]))
x = np.array(new_x)
print(x.shape)

X_train_aug = np.append(X_train,x,axis = 0)
y_train_aug = np.append(y_train,np.ones((len(x),)),axis = 0)
print(X_train_aug.shape, y_train_aug.shape)

  if ans == []:


(846, 100, 100)
(6936, 100, 100) (6936,)


#### Build Text Model

In [45]:
def get_model():
    model = Sequential()
    model.add(Input((100,100)))
    model.add(SpatialDropout1D(0.15))

    model.add(Conv1D(32,kernel_size=5,activation='relu'))
    model.add(MaxPooling1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.25))

    model.add(Conv1D(32,kernel_size=5,activation='relu'))
    model.add(MaxPooling1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.25))

    model.add(Conv1D(64,kernel_size=5,activation='relu'))
    model.add(MaxPooling1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.25))

    model.add(GlobalAveragePooling1D())
   
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    return model

In [46]:
def step_decay(epoch):
    """
    a function to decay the learning rate 0.94 every 2 epoch
    """
    initial_lrate = 0.003
    drop = 0.95
    epochs_drop = 2
    end_lr = 0.00001
    lrate = initial_lrate * np.power(drop,  
        np.floor((1+epoch)/epochs_drop))
    if lrate > end_lr:
        return lrate
    else:
        return end_lr
lr_scheduler = LearningRateScheduler(step_decay)
el = EarlyStopping(monitor='val_loss', patience=8)

#parameter setting
epochs = 50
batch_size = 128
optimizer = Adam(0.003)

#model compile
model = get_model()
model.compile(optimizer,loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
spatial_dropout1d_10 (Spatia (None, 100, 100)          0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 96, 32)            16032     
_________________________________________________________________
max_pooling1d_30 (MaxPooling (None, 48, 32)            0         
_________________________________________________________________
batch_normalization_34 (Batc (None, 48, 32)            128       
_________________________________________________________________
dropout_38 (Dropout)         (None, 48, 32)            0         
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 44, 32)            5152      
_________________________________________________________________
max_pooling1d_31 (MaxPooling (None, 22, 32)          

In [47]:
model.fit(X_train,y_train,epochs = epochs, batch_size = batch_size,validation_data=(X_test,y_test),callbacks=[lr_scheduler,el],verbose = 2)

Epoch 1/50
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
48/48 - 1s - loss: 0.5747 - accuracy: 0.7026 - val_loss: 0.5532 - val_accuracy: 0.7630
Epoch 2/50
48/48 - 1s - loss: 0.4825 - accuracy: 0.7787 - val_loss: 0.5184 - val_accuracy: 0.7663
Epoch 3/50
48/48 - 1s - loss: 0.4476 - accuracy: 0.8011 - val_loss: 0.5956 - val_accuracy: 0.6980
Epoch 4/50
48/48 - 1s - loss: 0.4261 - accuracy: 0.8163 - val_loss: 0.4721 - val_accuracy: 0.7840
Epoch 5/50
48/48 - 1s - loss: 0.4153 - accuracy: 0.8199 - val_loss: 0.4730 - val_accuracy: 0.7833
Epoch 6/50
48/48 - 1s - loss: 0.3961 - accuracy: 0.8238 - val_loss: 0.4454 - val_accuracy: 0.8050
Epoch 7/50
48/48 - 1s - loss: 0.3923 - accuracy: 0.8332 - val_loss: 0.4383 - val_accuracy: 0.8004
Epoch 8/50
48/48 - 1s - loss: 0.3754 - accuracy: 0.8407 - val_loss: 0.4590 - val_accuracy: 0.7853
Epoc

<tensorflow.python.keras.callbacks.History at 0x388e31190>

In [48]:
y_pred = model.predict(X_test)
threshold = 0.5
y_pred = (y_pred >= 0.5).astype(int)
print(classification_report(y_test, y_pred))
print(f"Test set AUC score: {roc_auc_score(y_test, y_pred)}")

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       874
           1       0.84      0.65      0.73       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.79      1523
weighted avg       0.81      0.80      0.79      1523

Test set AUC score: 0.7804931367744073


### Use Augmentation data

In [49]:
model_aug = get_model()
model_aug.compile(optimizer,loss='binary_crossentropy', metrics=['accuracy'])

model_aug.fit(X_train_aug,y_train_aug,epochs = epochs, batch_size = batch_size,validation_data=(X_test,y_test),callbacks=[lr_scheduler,el],verbose = 2)

Epoch 1/50
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
55/55 - 1s - loss: 0.5316 - accuracy: 0.7351 - val_loss: 0.4640 - val_accuracy: 0.8109
Epoch 2/50
55/55 - 1s - loss: 0.4555 - accuracy: 0.7917 - val_loss: 0.4697 - val_accuracy: 0.7951
Epoch 3/50
55/55 - 1s - loss: 0.4266 - accuracy: 0.8123 - val_loss: 0.4555 - val_accuracy: 0.7879
Epoch 4/50
55/55 - 1s - loss: 0.4081 - accuracy: 0.8222 - val_loss: 0.4208 - val_accuracy: 0.8181
Epoch 5/50
55/55 - 1s - loss: 0.3875 - accuracy: 0.8322 - val_loss: 0.4291 - val_accuracy: 0.8168
Epoch 6/50
55/55 - 1s - loss: 0.3697 - accuracy: 0.8411 - val_loss: 0.4317 - val_accuracy: 0.8155
Epoch 7/50
55/55 - 1s - loss: 0.3553 - accuracy: 0.8467 - val_loss: 0.4367 - val_accuracy: 0.8102
Epoch 8/50
55/55 - 1s - loss: 0.3478 - accuracy: 0.8534 - val_loss: 0.4506 - val_accuracy: 0.8070
Epoc

<tensorflow.python.keras.callbacks.History at 0x388eeb460>

In [50]:
y_pred = model_aug.predict(X_test)
threshold = 0.5
y_pred = (y_pred >= 0.5).astype(int)
print(classification_report(y_test, y_pred))
print(f"Test set AUC score: {roc_auc_score(y_test, y_pred)}")

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
              precision    recall  f1-score   support

           0       0.79      0.91      0.85       874
           1       0.84      0.68      0.75       649

    accuracy                           0.81      1523
   macro avg       0.82      0.79      0.80      1523
weighted avg       0.81      0.81      0.81      1523

Test set AUC score: 0.7926443780785789


#### Use Full data Train

In [53]:
#### No Augmentated
model = get_model()
model.compile(optimizer,loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X,y,epochs = epochs, batch_size = batch_size,validation_data=(X_test,y_test),callbacks=[lr_scheduler,el],verbose = 0)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


<tensorflow.python.keras.callbacks.History at 0x39ba5e070>

In [54]:
X_aug = np.append(X,x,axis = 0)
y_aug = np.append(y,np.ones((len(x),)),axis = 0)

#### Augmentated
model_aug = get_model()
model_aug.compile(optimizer,loss='binary_crossentropy', metrics=['accuracy'])
model_aug.fit(X_aug,y_aug,epochs = epochs, batch_size = batch_size,validation_data=(X_test,y_test),callbacks=[lr_scheduler,el],verbose = 0)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


<tensorflow.python.keras.callbacks.History at 0x39894c520>

#### Output Prediction File

In [57]:
## Prepare Test data
test = pd.read_csv(test_path) #load train
X_test = test.iloc[:,3]

# map text to vector
X_test = X_test.apply(map_text)

#pad sequence
X_test = pad_sequences(X_test,padding='post', maxlen=100,value = -1)

new_X = []
for i in range(X_test.shape[0]):
    new_X.append(map_index_to_100d(X_test[i]))
X_test = np.array(new_X)
print(X_test.shape)

  if ans == []:


(3263, 100, 100)


In [61]:
pred = (model.predict(X_test) >= threshold).astype(int)
pred_aug = (model_aug.predict(X_test) >= threshold).astype(int)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


In [72]:
pd.DataFrame({'id':test['id'],'target':pred.reshape(1,len(pred))[0]}).to_csv('../Q6_output/Q6_results_plain.csv',index=None)
pd.DataFrame({'id':test['id'],'target':pred_aug.reshape(1,len(pred_aug))[0]}).to_csv('../Q6_output/Q6_results_augmented.csv',index=None)