### Models Included
- SVM
- Bi-LSTM
- BiLSTM with Attention
- LSTM CNN

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, SimpleRNN, Flatten,\
Activation, RepeatVector, Permute, Concatenate, Lambda
#from tensorflow_addons.layers import CRF  # Assuming you're using TensorFlow Addons for CRF
from tensorflow.keras import optimizers
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import random
import subprocess
from sklearn.utils import class_weight
import tensorflow.keras.backend as K
from tensorflow.keras import Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Multiply
from tensorflow.keras.layers import Conv1D, MaxPooling1D




In [2]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Move two levels up
grandparent_directory = os.path.abspath(os.path.join(current_directory, "..", "..", ".."))

# Specify the folder name
folder_name = "data"

# Combine the grandparent directory with the folder name
folder_path = os.path.join(grandparent_directory, folder_name)

# List files in the specified folder
files = os.listdir(folder_path)

# Select the "Task2.csv" file
task2_file = "Task2.csv"

if task2_file in files:
    print(f"The '{task2_file}' file is present in the '{folder_name}' folder.")
else:
    print(f"The '{task2_file}' file is not found in the '{folder_name}' folder.")

file_path=os.path.join(folder_path,task2_file)

print(f"The file path is {file_path}")


The 'Task2.csv' file is present in the 'data' folder.
The file path is c:\Users\awtfh\OneDrive - rwth-aachen.de\Desktop\Inclusion_Exclusion_Phrase_Mining-main\data\Task2.csv


In [3]:
data = pd.read_csv(file_path)
data = data.rename(columns={'id':'Sentence #'})
data = data.dropna(axis=0)
#data = data.drop('Unnamed: 0',axis=1)

In [4]:
data.groupby('type').count()

Unnamed: 0_level_0,sentence_id,sentence,category
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
E,1129,1129,1129
I,1174,1174,1174


In [5]:
data

Unnamed: 0,sentence_id,sentence,category,type
0,0,late afternoon a very hot day which took some ...,time/day/month,E
1,1,concessions were overpriced,price,E
2,2,queues were minimal,queues,I
3,3,of truly healthy options for dining,food,E
4,4,park entry fee is reasonable,price,I
...,...,...,...,...
2298,2151,in the rain,time/day/month,E
2299,2152,without feeling pushed,crowd,I
2300,2152,claustrophobic,claustrophobic,E
2301,2152,hordes of crowds,crowd,E


In [6]:
sentences = []
labels = list(data['type']) 
typs = data['type'].values #Spelling error, there was it was types not type
x = data.apply(lambda l: sentences.append(l['sentence'].split(' ')),axis=1)

In [7]:
words = []
for sent in sentences:
    for wrd in sent:
        words.append(wrd)
words = list(set(words))
tags = list(set(labels))

n_words = len(words)
n_tags = len(tags)

In [8]:
len(typs)

2303

In [9]:
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
max_len = 30
X = [[word2idx[w] for w in s] for s in sentences]    
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)
y = [tag2idx[tg] for tg in labels]
# y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
y = np.array([to_categorical(i, num_classes=n_tags) for i in y])
y = [[y[i],typs[i]] for i in range(len(y))]
random.seed(33333333)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

y_train = np.array([lab[0] for lab in y_train])
typ_test = [lab[1] for lab in y_test]
y_test = np.array([lab[0] for lab  in y_test])

In [10]:
# def loadGloveModel(File):
#     print("Loading Glove Model")
#     f = open(File,'r')
#     gloveModel = {}
#     for line in f:
#         splitLines = line.split()
#         word = splitLines[0]
#         wordEmbedding = np.array([float(value) for value in splitLines[1:]])
#         gloveModel[word] = wordEmbedding
#     print(len(gloveModel)," words loaded!")
#     return gloveModel

# vec_model = loadGloveModel('glove/glove.6B.200d.txt')

In [11]:
def loadGloveModel(File):
    with open(File, 'r', encoding='utf-8') as f:
        gloveModel = {}
        for line in f:
            splitLines = line.split()
            word = splitLines[0]
            wordEmbedding = np.array([float(val) for val in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel), "words loaded!")
    return gloveModel

vec_model = loadGloveModel('glove/glove.6B.200d.txt')


400000 words loaded!


In [12]:
emb_dim = len(vec_model['the'])
embedding_matrix = np.zeros((len(word2idx) + 1, emb_dim))

for word, i in word2idx.items():
    if word not in vec_model:
        continue
    embedding_vector = vec_model[word]
    embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

(1894, 200)


In [13]:
def full_results(y_test,pred):
    
    report = classification_report(y_test,pred.flatten('F'),output_dict=True)
    df = pd.DataFrame(report).transpose()
    display(df)
    
    inc_test = []
    exc_test = []
    inc_pred = []
    exc_pred = []
    for i,t in enumerate(typ_test):
        if t == 'I':
            inc_test.append(y_test[i])
            inc_pred.append(pred[i])
        else:
            exc_test.append(y_test[i])
            exc_pred.append(pred[i])
    print("For inclusions")
    report = classification_report(inc_test,inc_pred,output_dict=True)
    df = pd.DataFrame(report).transpose()
    display(df)
    print("For exclusions")
    report = classification_report(exc_test,exc_pred,output_dict=True)
    df = pd.DataFrame(report).transpose()
    display(df)
    return

def return_report(model,epochs):
    y_flat = list(np.argmax(y_test,1).flatten('F'))
#     class_weights = class_weight.compute_class_weight('balanced',y_flat)
#     class_weights = [1,10,20,10,20]
    model.fit(X_train,y_train,epochs=epochs,verbose=1)
    out = model.predict(X_test)
    pred = np.argmax(out,1)
    full_results(y_flat,pred)
    return pred


In [14]:
def get_bilstm_lstm_model():
    
    input = Input(shape=(max_len,))

    # Add Embedding layer
    model = Embedding(input_dim=n_words + 1, output_dim=emb_dim,
                  input_length=max_len, weights=[embedding_matrix],trainable=False)(input)

    # Add bidirectional LSTM
    model = Bidirectional(LSTM(units=emb_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.1))(model)    
    
    model = TimeDistributed(Dense(100,activation='relu'))(model)
    model = Flatten()(model)
    model = Dense(100,activation='relu')(model)
    # Add timeDistributed Layer
    out = Dense(n_tags, activation="softmax")(model)

    #Optimiser 
    adam = Adam(lr=0.0007)

    # Compile model
    model = Model(input, out)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()
    
    return model

def lstm_attention():
    
    input = Input(shape=(max_len,))

    # Add Embedding layer
    model = Embedding(input_dim=n_words + 1, output_dim=emb_dim,
                  input_length=max_len, weights=[embedding_matrix],trainable=False)(input)

    # Add bidirectional LSTM
    model = Bidirectional(LSTM(units=emb_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.1))(model)

    attention = TimeDistributed(Dense(1, activation='tanh'))(model) 
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(2*emb_dim)(attention)
    attention = Permute([2, 1])(attention)

    # apply the attention
    sent_representation = Multiply()([model, attention])
    sent_representation = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
    probabilities = Dense(3, activation='softmax')(sent_representation)
    
    # Add timeDistributed Layer
    out = Dense(n_tags, activation="softmax")(sent_representation)

    #Optimiser 
    adam = Adam(lr=0.0007)

    # Compile model
    model = Model(input, out)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()
    
    return model

def lstm_cnn():
    input = Input(shape=(max_len,))

    # Add Embedding layer
    model = Embedding(input_dim=n_words + 1, output_dim=emb_dim,
                  input_length=max_len, weights=[embedding_matrix],trainable=False)(input)
    model = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(model)
    model = MaxPooling1D(pool_size=2)(model)
    model = LSTM(100,dropout=0.2,recurrent_dropout=0.2)(model)
    out = Dense(n_tags, activation='softmax')(model)
    model = Model(input,out)
    adam = Adam(lr=0.0007)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    print(model.summary())
    
    return model


In [15]:
# from keras.layers import InputLayer

# def get_bilstm_lstm_model():
    
#     input_layer = Input(shape=(max_len,))

#     # Add Embedding layer
#     model = Embedding(input_dim=n_words + 1, output_dim=emb_dim,
#                   input_length=max_len, weights=[embedding_matrix], trainable=False)(input_layer)

#     # Add bidirectional LSTM
#     model = Bidirectional(LSTM(units=emb_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.1))(model)    
    
#     model = TimeDistributed(Dense(100, activation='relu'))(model)
#     model = Flatten()(model)
#     model = Dense(100, activation='relu')(model)
#     # Add timeDistributed Layer
#     out = Dense(n_tags, activation="softmax")(model)

#     # Optimizer 
#     adam = ko.Adam(lr=0.0007)

#     # Compile model
#     model = Model(input_layer, out)
#     model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
#     model.summary()
    
#     return model



In [16]:
bilistm = get_bilstm_lstm_model()
pred = return_report(bilistm,25)






Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 30)]              0         
                                                                 
 embedding (Embedding)       (None, 30, 200)           378800    
                                                                 
 bidirectional (Bidirection  (None, 30, 400)           641600    
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 30, 100)           40100     
 ributed)                                                        
                                                                 
 flatten (Flatten)           (None, 3000)              0         
                                                                 
 dense_1 (Dense)             (None, 100)               300100










Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


Unnamed: 0,precision,recall,f1-score,support
0,0.847458,0.884956,0.865801,226.0
1,0.884444,0.846809,0.865217,235.0
accuracy,0.86551,0.86551,0.86551,0.86551
macro avg,0.865951,0.865882,0.865509,461.0
weighted avg,0.866312,0.86551,0.865503,461.0


For inclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.846809,0.917051,235.0
accuracy,0.846809,0.846809,0.846809,0.846809
macro avg,0.5,0.423404,0.458525,235.0
weighted avg,1.0,0.846809,0.917051,235.0


For exclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.884956,0.938967,226.0
1,0.0,0.0,0.0,0.0
accuracy,0.884956,0.884956,0.884956,0.884956
macro avg,0.5,0.442478,0.469484,226.0
weighted avg,1.0,0.884956,0.938967,226.0


In [17]:
attn = lstm_attention()
preds = return_report(attn,50)



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 30)]                 0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 30, 200)              378800    ['input_2[0][0]']             
                                                                                                  
 bidirectional_1 (Bidirecti  (None, 30, 400)              641600    ['embedding_1[0][0]']         
 onal)                                                                                            
                                                                                                  
 time_distributed_1 (TimeDi  (None, 30, 1)                401       ['bidirectional_1[0][0]'

Unnamed: 0,precision,recall,f1-score,support
0,0.883598,0.738938,0.804819,226.0
1,0.783088,0.906383,0.840237,235.0
accuracy,0.824295,0.824295,0.824295,0.824295
macro avg,0.833343,0.822661,0.822528,461.0
weighted avg,0.832362,0.824295,0.822874,461.0


For inclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.906383,0.950893,235.0
accuracy,0.906383,0.906383,0.906383,0.906383
macro avg,0.5,0.453191,0.475446,235.0
weighted avg,1.0,0.906383,0.950893,235.0


For exclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.738938,0.849873,226.0
1,0.0,0.0,0.0,0.0
accuracy,0.738938,0.738938,0.738938,0.738938
macro avg,0.5,0.369469,0.424936,226.0
weighted avg,1.0,0.738938,0.849873,226.0


In [18]:
lstmcnn = lstm_cnn()
preds = return_report(lstmcnn,50)







Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 30)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 30, 200)           378800    
                                                                 
 conv1d (Conv1D)             (None, 30, 32)            19232     
                                                                 
 max_pooling1d (MaxPooling1  (None, 15, 32)            0         
 D)                                                              
                                                                 
 lstm_2 (LSTM)               (None, 100)               53200     
                                                                 
 dense_6 (Dense)             (None, 2)                 202       
                                                           

Unnamed: 0,precision,recall,f1-score,support
0,0.842553,0.876106,0.859002,226.0
1,0.876106,0.842553,0.859002,235.0
accuracy,0.859002,0.859002,0.859002,0.859002
macro avg,0.85933,0.85933,0.859002,461.0
weighted avg,0.859657,0.859002,0.859002,461.0


For inclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.842553,0.91455,235.0
accuracy,0.842553,0.842553,0.842553,0.842553
macro avg,0.5,0.421277,0.457275,235.0
weighted avg,1.0,0.842553,0.91455,235.0


For exclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.876106,0.933962,226.0
1,0.0,0.0,0.0,0.0
accuracy,0.876106,0.876106,0.876106,0.876106
macro avg,0.5,0.438053,0.466981,226.0
weighted avg,1.0,0.876106,0.933962,226.0


In [None]:
bilistm.save('task2_models/bi-lstm/model')

INFO:tensorflow:Assets written to: task2_models/bi-lstm/model\assets


INFO:tensorflow:Assets written to: task2_models/bi-lstm/model\assets


### Simple Classifiers

In [None]:
tag2idx = {t: i for i, t in enumerate(tags)}
max_len = 30
emb_dim = len(vec_model['the'])
X = []
y = []

for sent in sentences:
    vec = np.zeros((30,emb_dim))
    for i, word in enumerate(sent):
        if word in vec_model:
            vec[i,:] = vec_model[word]
    X.append(vec)
X = np.array(X)
y = np.array([tag2idx[tg] for tg in labels])
y = [[y[i],typs[i]] for i in range(len(y))]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train = np.array([lab[0] for lab in y_train])
typ_test = [lab[1] for lab in y_test]
y_test = np.array([lab[0] for lab  in y_test])
X_train = X_train.reshape((X_train.shape[0],max_len*emb_dim))
X_test = X_test.reshape((X_test.shape[0],max_len*emb_dim))

In [None]:
from sklearn.svm import SVC

In [None]:
clf = SVC(kernel='poly')
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
full_results(y_test,pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier