In [None]:
# Import statements
import numpy as np
import h5py
import pickle
import re
import scipy.io
import warnings
from copy import deepcopy
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras import backend as K
from keras.layers.core import Dense, Activation
from keras.layers import Embedding, LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import model_from_json
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [None]:
# Normalizer
VOWELS = list('aeiou')


def removeRepeat(string):
    """Shorten every run of two or more identical characters to exactly two.

    For example ``'cooool'`` becomes ``'cool'``; characters that are not
    repeated are left untouched.
    """
    return re.sub(r'(.)\1+', lambda run: run.group(1) * 2, string)


def removeVovels(string):
    """Return ``string`` lower-cased with every character found in VOWELS removed."""
    kept = []
    for letter in string.lower():
        if letter in VOWELS:
            continue
        kept.append(letter)
    return ''.join(kept)


def token(sentence, remove_vowels=False, remove_repeat=False, minchars=2):
    """Split ``sentence`` into lower-cased alphabetic tokens.

    Args:
        sentence: Raw text; it is lower-cased before tokens are extracted.
        remove_vowels: If true, strip vowels from every kept token.
        remove_repeat: If true, collapse repeated characters in every kept token
            (applied after vowel removal).
        minchars: Tokens shorter than this many characters are discarded.

    Returns:
        List of token strings in order of appearance. The length filter is
        applied before the optional normalisation steps, so a returned token
        can end up shorter than ``minchars`` (even empty).
    """
    def _normalise(word):
        if remove_vowels:
            word = removeVovels(word)
        if remove_repeat:
            word = removeRepeat(word)
        return word

    candidates = re.findall("[a-zA-Z]+", sentence.lower())
    return [_normalise(word) for word in candidates if len(word) >= minchars]

In [None]:
# Filenames
# All paths in this notebook are built by plain string concatenation
# (Masterdir + <subdir> + <name>), so every directory value must keep its trailing '/'.
# Root folder of the experiment (a mounted Google Drive path).
Masterdir = '/content/drive/MyDrive/548/Sentimental Analysis/'
# Sub-folder of Masterdir that holds the input dataset and the saved X/Y splits.
Datadir = 'Data/'
# Sub-folder of Masterdir where the model weights and architecture JSON are written.
Modeldir = 'Models/'
# Sub-folder of Masterdir where the extracted layer features are written.
Featuredir = 'Features/'
# Name of the raw dataset file inside Datadir that is handed to parse().
inputdatasetfilename = 'IIITH_Codemixed.txt'
# Tag embedded in the name of every file saved for this experiment.
experiment_details = 'lstm128_subword'
# NOTE(review): not referenced by any module-level code visible in this notebook — confirm it is still needed.
filename = 'match.txt'

In [None]:
# Data I/O formatting
# Field delimiter used to split each line of the dataset file.
SEPERATOR = '\t'
# Zero-based index of the field that holds the sentence text.
DATA_COLUMN = 1
# Zero-based index of the field that holds the class label.
LABEL_COLUMN = 3
# Label strings as they appear in the file; the position in this list is the integer class id.
LABELS = ['0','1','2'] # 0 -> Negative, 1 -> Neutral, 2 -> Positive
# Character <-> integer id lookup tables; empty here and filled in by convert_char2num().
mapping_char2num = {}
mapping_num2char = {}
# Length every character sequence is padded/truncated to; also the Embedding input_length.
MAXLEN = 200

In [None]:
# LSTM Model Parameters
# These values are packed into the positional `args` list passed to RNN() by the training cell.
# Embedding
# Placeholder: overwritten in the training cell with the character count returned by
# convert_char2num(), then used as the Embedding layer's input dimension.
MAX_FEATURES = 0
# Output dimension of the Embedding layer.
embedding_size = 128
# Convolution
# Kernel size of the 1D convolution.
filter_length = 3
# Number of convolution filters.
nb_filter = 128
# Pool size of the 1D max-pooling layer.
pool_length = 3
# LSTM
# Number of units in each of the two LSTM layers.
lstm_output_size = 128
# Training
# Batch size used for both model.fit and model.evaluate.
batch_size = 128
# Number of training epochs.
number_of_epochs = 50
# Number of output classes (one-hot width and size of the final Dense layer).
numclasses = 3
# Fraction held out for validation inside RNN(); the train/test split in the
# training cell hard-codes its own 0.2 instead of reading this value.
test_size = 0.2

In [None]:
# Purpose -> Data I/O
# Input   -> Data file containing sentences and labels along with the global variables
# Output  -> Sentences cleaned up in list of lists format along with the labels as a numpy array
def parse(Masterdir,filename,seperator,datacol,labelcol,labels):
    """Read a delimited dataset file and return character sequences with integer labels.

    The file ``Masterdir + Datadir + filename`` is read as UTF-8 and lower-cased.
    Every non-blank line is split on ``seperator``; the text field is tokenised
    with ``token()`` (default options) and flattened into a list of characters
    with a single space appended after each token.

    Args:
        Masterdir: Root directory (with trailing separator).
        filename: Dataset file name inside ``Datadir``.
        seperator: Field delimiter of the file.
        datacol: Index of the field holding the sentence text.
        labelcol: Index of the field holding the label.
        labels: Sequence of label strings; a label's position is its class id.
            Because lines are lower-cased, labels must be given in lower case.

    Returns:
        ``[X_train, Y_train]`` where ``X_train`` is a list of character lists
        and ``Y_train`` is a 1-D integer numpy array of the same length.

    Raises:
        ValueError: If a non-blank line lacks the requested columns or carries
            a label that is not in ``labels``. Failing here keeps samples and
            labels aligned instead of discovering a mismatch afterwards.
    """
    label_to_index = {label: index for index, label in enumerate(labels)}

    # The context manager guarantees the handle is released even if reading fails.
    with open(Masterdir + Datadir + filename, 'r', encoding='utf-8') as f:
        lines = f.read().lower().split('\n')

    X_train = []
    Y_train = []

    for lineno, raw_line in enumerate(lines, start=1):
        # Blank lines (including the empty string after a trailing newline) carry
        # no sample. Skipping them, rather than unconditionally dropping the last
        # element, keeps the final record of a file without a trailing newline.
        if not raw_line.strip():
            continue

        fields = raw_line.split(seperator)
        try:
            text = fields[datacol]
            label = fields[labelcol]
        except IndexError:
            raise ValueError(
                'Line %d of %s has %d field(s); columns %r and %r are required'
                % (lineno, filename, len(fields), datacol, labelcol)) from None

        if label not in label_to_index:
            raise ValueError(
                'Line %d of %s has unknown label %r (expected one of %r)'
                % (lineno, filename, label, list(labels)))

        # Flatten the tokens into one character sequence, space-terminated per token.
        char_list = []
        for word in token(text):
            char_list.extend(word)
            char_list.append(' ')

        X_train.append(char_list)
        Y_train.append(label_to_index[label])

    # dtype is pinned so an empty dataset still yields an integer array.
    Y_train = np.asarray(Y_train, dtype=int)

    return [X_train,Y_train]

In [None]:
# Purpose -> Convert characters to integers, a unique value for every character
# Input   -> Training data (In list of lists format) along with global variables
# Output  -> Converted training data along with global variables
def convert_char2num(mapping_n2c,mapping_c2n,trainwords,maxlen):
    """Encode character sequences as padded integer sequences.

    Args:
        mapping_n2c: Dict that receives the id -> character table. It is
            cleared and refilled in place.
        mapping_c2n: Dict that receives the character -> id table. It is
            cleared and refilled in place.
        trainwords: List of character lists (one per sample).
        maxlen: Length every encoded sequence is padded/truncated to.

    Returns:
        ``[X_train, mapping_n2c, mapping_c2n, charno]`` where ``X_train`` is the
        array produced by ``pad_sequences`` and ``charno`` is the number of
        distinct characters.
    """
    # Sorting makes the character ids reproducible: iterating a set of strings
    # can yield a different order on every interpreter run.
    allchars = sorted({char for line in trainwords for char in line})

    # Fill the dictionaries supplied by the caller, starting from a clean slate
    # so a re-run cannot leave stale entries behind.
    mapping_c2n.clear()
    mapping_n2c.clear()
    for index, char in enumerate(allchars):
        mapping_c2n[char] = index
        mapping_n2c[index] = char
    charno = len(allchars)

    # Converts the data from characters to numbers using the dictionary
    X_train = [[mapping_c2n[letter] for letter in line] for line in trainwords]
    print(mapping_c2n)
    print(mapping_n2c)

    # Pads the X_train to get a uniform vector
    # NOTE(review): pad_sequences pads with 0 by default, which is also the id of
    # the first character here; consider reserving 0 for padding.
    X_train = pad_sequences(X_train, maxlen=maxlen)
    return [X_train,mapping_n2c,mapping_c2n,charno]

In [None]:
# Purpose -> Define and train the proposed LSTM network
# Input   -> Data, Labels and model hyperparameters
# Output  -> Trained LSTM network
def RNN(X_train,y_train,args):
    """Build and fit the Embedding -> Conv1D -> MaxPool -> LSTM -> LSTM -> Dense -> softmax model.

    Args:
        X_train: Integer-encoded, padded input sequences.
        y_train: Integer class labels; converted to one-hot inside this function.
        args: Positional hyperparameter list, in this order:
            0 max_features, 1 maxlen, 2 embedding_size, 3 filter_length,
            4 nb_filter, 5 pool_length, 6 lstm_output_size, 7 batch_size,
            8 nb_epoch, 9 numclasses, 10 test_size (validation fraction).

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    # Embedding hyperparameters
    max_features, maxlen, embedding_size = args[0], args[1], args[2]
    # Convolution hyperparameters
    filter_length, nb_filter, pool_length = args[3], args[4], args[5]
    # LSTM hyperparameters
    lstm_output_size = args[6]
    # Training hyperparameters
    batch_size, nb_epoch, numclasses, test_size = args[7], args[8], args[9], args[10]

    # Keras expects one-hot targets for categorical cross-entropy
    onehot_labels = np_utils.to_categorical(y_train, numclasses)

    # Hold out a validation subset; the fixed random_state makes the split repeatable
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train, onehot_labels, test_size=test_size, random_state=42)

    print('Build model...')
    # Layer order matters: callers address layers by position (e.g. index 4 is the second LSTM)
    layer_stack = [
        Embedding(max_features, embedding_size, input_length=maxlen),
        Convolution1D(filters=nb_filter,
                      kernel_size=filter_length,
                      padding='valid',
                      activation='relu',
                      strides=1),
        MaxPooling1D(pool_size=pool_length),
        LSTM(lstm_output_size, dropout=0.2, return_sequences=True),
        LSTM(lstm_output_size, dropout=0.2, return_sequences=False),
        Dense(numclasses),
        Activation('softmax'),
    ]
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Adamax optimizer with categorical cross-entropy loss
    model.compile(loss='categorical_crossentropy',
                  optimizer='adamax',
                  metrics=['accuracy'])

    print('Train...')
    # Training data is reshuffled every epoch; the held-out subset is used for validation
    model.fit(X_fit, y_fit,
              batch_size=batch_size,
              shuffle=True,
              epochs=nb_epoch,
              validation_data=(X_valid, y_valid))
    return model

In [None]:
# Purpose -> Saves Keras model files to the given directory
# Input   -> Directory and experiment details to be saved and trained model file
# Output  -> Nil
def save_model(Masterdir,filename,model):
    """Write a Keras model's weights and architecture under ``Masterdir + Modeldir``.

    Two files are produced, both prefixed with ``'LSTM_' + filename``:
    ``*_weights.h5`` (via ``model.save_weights``) and ``*_architecture.json``
    (the string returned by ``model.to_json()``).

    Args:
        Masterdir: Root directory (with trailing separator).
        filename: Experiment tag used in the output file names.
        model: Object exposing ``save_weights(path)`` and ``to_json()``.

    Returns:
        None.
    """
    prefix = Masterdir + Modeldir + 'LSTM_' + filename
    model.save_weights(prefix + '_weights.h5')

    # Serialise before opening the file so a failure in to_json() cannot leave
    # an empty architecture file behind.
    json_string = model.to_json()
    # The context manager closes the handle even if the write raises.
    with open(prefix + '_architecture.json', 'w') as f:
        f.write(json_string)

In [None]:
# Purpose -> Obtains outputs from any layer in Keras
# Input   -> Trained model, layer from which output needs to be extracted & files to be given as input
# Output  -> Features from that layer 
def get_activations(model, layer, X_batch):
    """Run ``X_batch`` through ``model`` and return the output of one layer.

    Args:
        model: Keras model to probe.
        layer: Index into ``model.layers`` of the layer whose output is wanted.
        X_batch: Input data fed to the model's input.

    Returns:
        Whatever the backend function built from ``[model.input]`` to
        ``[model.layers[layer].output]`` returns for ``X_batch``.
    """
    layer_output_fn = K.function([model.input], [model.layers[layer].output])
    return layer_output_fn(X_batch)

In [None]:
# Purpose -> Evaluate any model on the testing data
# Input   -> Testing data and labels, trained model and global variables
# Output  -> Nil
def evaluate_model(X_test,y_test,model,batch_size,numclasses):
    """Evaluate ``model`` on held-out data and print its loss and accuracy.

    Args:
        X_test: Encoded test inputs.
        y_test: Integer class labels; one-hot encoded here before evaluation.
        model: Compiled Keras model whose ``evaluate`` returns (loss, accuracy).
        batch_size: Batch size used during evaluation.
        numclasses: Number of classes for the one-hot encoding.

    Returns:
        None; the results are only printed.
    """
    onehot_labels = np_utils.to_categorical(y_test, numclasses)

    score, acc = model.evaluate(X_test, onehot_labels, batch_size=batch_size)
    for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
        print(caption, value)

In [None]:
# Purpose -> Saves train, test data along with labels and features in the respective directories in the folder
# Input   -> Train and test data, labels and features along with the directory and experiment details to be mentioned
# Output  -> Nil
def _write_h5(path, data):
    """Store ``data`` as the single dataset named 'dataset' in a new HDF5 file at ``path``."""
    # The context manager closes the file even if create_dataset raises.
    with h5py.File(path, 'w') as h5f:
        h5f.create_dataset('dataset', data=data)


def _write_pickle(path, obj):
    """Pickle ``obj`` to a new file at ``path``."""
    with open(path, 'wb') as output:
        pickle.dump(obj, output)


def save_data(Masterdir,filename,X_train,X_test,y_train,y_test,features_train,features_test):
    """Persist the train/test inputs, labels and extracted features of an experiment.

    Files written (``filename`` is the experiment tag):
        * ``Masterdir + Datadir``: ``Xtrain_<tag>.h5`` and ``Xtest_<tag>.h5``
          (HDF5, dataset name 'dataset'); ``Ytrain_<tag>.pkl`` and
          ``Ytest_<tag>.pkl`` (pickle of a one-element list wrapping the labels).
        * ``Masterdir + Featuredir``: ``features_train_<tag>.h5`` and
          ``features_test_<tag>.h5`` (HDF5, dataset name 'dataset').

    Args:
        Masterdir: Root directory (with trailing separator).
        filename: Experiment tag used in every output file name.
        X_train, X_test: Encoded input arrays.
        y_train, y_test: Label arrays.
        features_train, features_test: Layer activations for the two splits.

    Returns:
        None.
    """
    data_prefix = Masterdir + Datadir
    feature_prefix = Masterdir + Featuredir

    _write_h5(data_prefix + 'Xtrain_' + filename + '.h5', X_train)
    _write_h5(data_prefix + 'Xtest_' + filename + '.h5', X_test)

    # Labels stay wrapped in a list, as before; the loader flattens them after unpickling.
    _write_pickle(data_prefix + 'Ytrain_' + filename + '.pkl', [y_train])
    _write_pickle(data_prefix + 'Ytest_' + filename + '.pkl', [y_test])

    _write_h5(feature_prefix + 'features_train_' + filename + '.h5', features_train)
    _write_h5(feature_prefix + 'features_test_' + filename + '.h5', features_test)

In [None]:
# Training
print('Starting RNN Engine...\nModel: Char-level LSTM.\nParsing data files...')
# parse() returns [character sequences, integer labels]
X_train, y_train = parse(Masterdir, inputdatasetfilename, SEPERATOR,
                         DATA_COLUMN, LABEL_COLUMN, LABELS)
print('Parsing complete!')

print('Creating character dictionaries and format conversion in progess...')
# convert_char2num() returns [encoded data, id->char map, char->id map, character count]
X_train, mapping_num2char, mapping_char2num, MAX_FEATURES = convert_char2num(
    mapping_num2char, mapping_char2num, X_train, MAXLEN)
X_train = np.asarray(X_train)
y_train = np.asarray(y_train).flatten()
print('Complete!')

print('Splitting data into train and test...')
# The test fraction is a literal here; the `test_size` constant only controls the
# validation split performed inside RNN().
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Creating LSTM Network...')
# The hyperparameter list is positional; the order must match what RNN() reads from args.
model = RNN(
    deepcopy(X_train),
    deepcopy(y_train),
    [MAX_FEATURES, MAXLEN, embedding_size,
     filter_length, nb_filter, pool_length, lstm_output_size, batch_size,
     number_of_epochs, numclasses, test_size],
)

print('Evaluating model...')
evaluate_model(X_test, deepcopy(y_test), model, batch_size, numclasses)

print('Feature extraction pipeline running...')
# Layer index 4 is the second LSTM in the stack built by RNN().
features_train = np.asarray(get_activations(model, 4, X_train))
features_test = np.asarray(get_activations(model, 4, X_test))
print('Features extracted!')

print('Saving experiment...')
save_model(Masterdir, experiment_details, model)
save_data(Masterdir, experiment_details, X_train, X_test, y_train, y_test,
          features_train, features_test)
print('Saved! Experiment finished!')

Starting RNN Engine...
Model: Char-level LSTM.
Parsing data files...
Parsing complete!
Creating character dictionaries and format conversion in progess...
{'o': 0, 'v': 1, 'u': 2, 'z': 3, 'g': 4, 'l': 5, 'h': 6, 's': 7, 'c': 8, 't': 9, 'n': 10, 'e': 11, 'r': 12, 'q': 13, 'w': 14, 'p': 15, 'f': 16, 'm': 17, 'x': 18, 'b': 19, ' ': 20, 'y': 21, 'a': 22, 'd': 23, 'j': 24, 'k': 25, 'i': 26}
{0: 'o', 1: 'v', 2: 'u', 3: 'z', 4: 'g', 5: 'l', 6: 'h', 7: 's', 8: 'c', 9: 't', 10: 'n', 11: 'e', 12: 'r', 13: 'q', 14: 'w', 15: 'p', 16: 'f', 17: 'm', 18: 'x', 19: 'b', 20: ' ', 21: 'y', 22: 'a', 23: 'd', 24: 'j', 25: 'k', 26: 'i'}
Complete!
Splitting data into train and test...
X_train shape: (3103, 200)
X_test shape: (776, 200)
Creating LSTM Network...
Build model...
Train...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/

In [None]:
# Accuracy
def accuracy(original, predicted):
    """Print the macro F1 score, the confusion matrix and the overall accuracy.

    Args:
        original: True class labels.
        predicted: Predicted class labels.

    Returns:
        None; the three results are only printed. Accuracy is the confusion
        matrix trace divided by the sum of all its entries.
    """
    macro_f1 = f1_score(original, predicted, average='macro')
    print("F1 score is: " + str(macro_f1))

    scores = confusion_matrix(original, predicted)
    print(scores)

    correct = np.trace(scores)
    total = float(np.sum(scores))
    print(correct/total)

In [None]:
# Testing
# Reload the test inputs written by save_data(); the context manager closes the
# HDF5 file even if reading the dataset fails.
with h5py.File(Masterdir + Datadir + 'Xtest_' + experiment_details + '.h5','r') as h5f:
    X_test = h5f['dataset'][:]
print(X_test.shape)

# SECURITY: pickle.load can execute arbitrary code; only load label files that
# this notebook produced itself.
with open(Masterdir + Datadir + 'Ytest_' + experiment_details + '.pkl', 'rb') as inp:
    y_test = pickle.load(inp)
# Labels were pickled wrapped in a list, so flatten back to a 1-D array.
y_test = np.asarray(y_test).flatten()
y_test2 = np_utils.to_categorical(y_test, numclasses)
print(y_test.shape)

# Read-only mode is sufficient here; 'r+' would also demand write permission.
with open(Masterdir + Modeldir + 'LSTM_' + experiment_details + '_architecture.json','r') as f:
    json_string = f.read()
model = model_from_json(json_string)

# Weights and compile settings are not part of the JSON architecture, so restore both.
model.load_weights(Masterdir + Modeldir + 'LSTM_' + experiment_details + '_weights.h5')
model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])

score, acc = model.evaluate(X_test, y_test2, batch_size=batch_size)

# Convert the per-class probabilities into predicted class ids.
y_pred = model.predict(X_test, batch_size=batch_size)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy(y_test,y_pred_classes)

print('Accuracy is: '+str(acc))

(776, 200)
(776,)
F1 score is: 0.610043376378226
[[ 41  67   8]
 [ 11 353  31]
 [  4 129 132]]
0.6778350515463918
Accuracy is: 0.6778350472450256
