###Installation

In [None]:
!pip install transformers
!pip install emoji
!pip install emot
!pip install tensorflow
!pip install tensorflow-gpu
!pip install nltk

###Imports

In [None]:
import os
import random
import re
import string
import unicodedata

import emoji
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tensorflow as tf
from bs4 import BeautifulSoup
from emot.emo_unicode import EMOTICONS
from google.colab import drive
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import ToktokTokenizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.layers import Input, Dropout, Dense, LSTM
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from transformers import TFBertModel, BertConfig, BertTokenizerFast

nltk.download('words')
nltk.download('punkt')

In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read and written
drive.mount('/content/drive')

###Import Data

In [None]:
# Read the training split; column names are supplied explicitly, so every row
# of the file (including its first) is treated as data.
train_csv_path = '/content/drive/My Drive/Sentiment Analysis Fire/data/tamil_sentiment_full_train.csv'
train = pd.read_csv(train_csv_path, names=['category', 'text'])
train.head(6)

In [None]:
# Put the columns in (text, category) order, then plot how many training rows
# fall into each category.
train = train.loc[:, ['text', 'category']]
sns.countplot(data=train, x='category')

In [None]:
# Read the validation (dev) split; column names are supplied explicitly, so every
# row of the file (including its first) is treated as data.
val_csv_path = '/content/drive/My Drive/Sentiment Analysis Fire/data/tamil_sentiment_full_dev.csv'
val = pd.read_csv(val_csv_path, names=['category', 'text'])
val.head(6)

In [None]:
# Put the columns in (text, category) order, then plot how many validation rows
# fall into each category.
val = val.loc[:, ['text', 'category']]
sns.countplot(data=val, x='category')

In [None]:
# Read the unlabeled test split into a single 'text' column.
test_csv_path = '/content/drive/My Drive/Sentiment Analysis Fire/data/tamil_sentiment_full_test_withoutlabels.csv'
test = pd.read_csv(test_csv_path, names=['text'])
test.head(9)

In [None]:
# Keep only the two columns the model needs and drop rows where either is missing.
train = train[['text', 'category']].dropna()
val = val[['text', 'category']].dropna()

# Keep the original label strings as a categorical column. The validation labels
# are encoded against the TRAINING categories so that a given integer code means
# the same class in both frames, even if a class is absent from the validation split.
train['label_label'] = pd.Categorical(train['category'])
val['label_label'] = pd.Categorical(
    val['category'], categories=train['label_label'].cat.categories)

# A validation label never seen in training has no code the model could predict;
# it becomes NaN above and is dropped here rather than being encoded as -1.
val = val.dropna(subset=['label_label'])

# Replace the label strings in 'category' with their integer codes.
train['category'] = train['label_label'].cat.codes
val['category'] = val['label_label'].cat.codes

###Preprocess

In [None]:
def convert_emoticons(text, emoticons=None):
  """Replace every known emoticon in `text` with its description.

  Each key of the emoticon table is used as a regular-expression pattern; every
  match is replaced by the corresponding description with commas removed and
  whitespace collapsed to underscores (e.g. "Happy face, smiley" ->
  "Happy_face_smiley").

  Args:
    text: The string to process.
    emoticons: Optional mapping of regex pattern -> description. Defaults to
      the module-level EMOTICONS table.

  Returns:
    The text with all matching emoticons replaced.
  """
  if emoticons is None:
    emoticons = EMOTICONS
  for pattern, description in emoticons.items():
    replacement = "_".join(description.replace(",", "").split())
    text = re.sub(u'(' + pattern + ')', replacement, text)
  # Return only after every pattern has been applied; returning from inside the
  # loop would stop after the first table entry.
  return text

def preprocess(text):
  """Normalize one raw comment into a whitespace-separated token string.

  Steps, in order:
    1. Strip HTML markup and remove URLs. Both run first because they depend on
       characters ('<', '>', '/', '.') that the punctuation steps below delete.
    2. Convert emoticons to their descriptions, then convert emojis to their
       textual names. Emoticons go first so that the ':name:' codes produced
       by demojize are not themselves matched as emoticons.
    3. Replace . ' " / - _ and commas with spaces, drop numbers that follow a
       space, and drop single-character words.
    4. Delete the remaining punctuation characters.
    5. Tokenize with ToktokTokenizer and re-join the tokens with single spaces.

  Args:
    text: The raw input string.

  Returns:
    The cleaned string.
  """
  # 1. Markup and URLs
  text = BeautifulSoup(text, 'html.parser').get_text()
  to_remove_url = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
      r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
  text = re.sub(to_remove_url, '', text)

  # 2. Emoticons and emojis become words, which may carry sentiment
  text = convert_emoticons(text)
  text = emoji.demojize(text)

  # 3. Separator-like punctuation, numbers and one-letter words
  text = re.sub(r'([\.\'\"\/\-\_\--])', ' ', text)
  text = re.sub(r" \d+", " ", text)
  text = text.replace(",", " ")
  text = re.sub(r'(?:^| )\w(?:$| )', ' ', text).strip()

  # 4. Remaining punctuation; the set is built once, not once per character
  punctuation = set('!!"$%&()*+-/:;<=>?[\\]^_{|}~.')
  text = ''.join(ch for ch in text if ch not in punctuation)

  # 5. Tokenize and normalize whitespace between tokens
  tokens = ToktokTokenizer().tokenize(text)
  return ' '.join(token.strip() for token in tokens)

def clean(df):
  """Run preprocess over every value of df['text'], updating the frame in place.

  Nothing is returned; the caller's DataFrame has its 'text' column replaced.
  """
  cleaned_text = df['text'].apply(preprocess)
  df['text'] = cleaned_text

# Apply the same text cleaning to the train, validation and test frames, in that order.
for frame in (train, val, test):
  clean(frame)

In [None]:
# Configure and load the pretrained multilingual BERT checkpoint

# Checkpoint identifier shared by the config, the tokenizer and the model
model_name = 'bert-base-multilingual-uncased'

# Fetch the checkpoint's configuration and switch hidden-state outputs on
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = True

# Fast tokenizer matching the checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# TensorFlow BERT model built from the same checkpoint and configuration
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

In [None]:
# Build the classifier: BERT main layer -> dropout -> small dense head

# Take the first layer of the loaded transformer model (the BERT main layer)
bert = transformer_model.layers[0]

# Max length of tokens; every input sequence must have exactly this length
max_length = 100

# Number of output classes, derived from the training labels so that the head
# always matches the one-hot targets instead of relying on a hard-coded size
num_classes = len(train['label_label'].cat.categories)

# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Run BERT and keep element 1 of its output
# NOTE(review): element 1 is presumably the pooled [CLS] representation — confirm
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# No explicit `training` flag: Keras then enables dropout during fit() and
# disables it during evaluate()/predict(), keeping inference deterministic.
pooled_output = dropout(bert_model)

# Then build your model output
label = Dense(units=num_classes, kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='category')(pooled_output)
relu_output = Dense(num_classes, activation='relu')(label)
# Softmax yields a probability distribution over mutually exclusive classes,
# which is what CategoricalCrossentropy expects for single-label targets.
softmax_output = Dense(num_classes, activation='softmax')(relu_output)
outputs = {'category': softmax_output}

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='bert-base-multilingual-uncased')

# Take a look at the model
model.summary()

In [None]:
#Train the model

# Set an optimizer
optimizer = Adam(
    learning_rate=1e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics (keyed by the model's output name)
loss = {'category': CategoricalCrossentropy()}
metric = {'category': CategoricalAccuracy('accuracy')}

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# One-hot encode the integer class codes
y_label = to_categorical(train['category'])

# Tokenize the input (takes some time).
# padding='max_length' pads every sequence to exactly max_length tokens, which
# is the fixed input length the model was built with; padding=True would only
# pad to the longest sequence in the data and can produce a shorter tensor.
x = tokenizer(
    text=train['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Fit the model; the last 20% of the training data is held out for validation
history = model.fit(
    x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    y={'category': y_label},
    validation_split=0.2,
    batch_size=64,
    epochs=10)

In [None]:
# Persist the trained model to Google Drive in HDF5 format.
model_save_path = "/content/drive/MyDrive/Sentiment Analysis Fire/models/preprocessed/mBERT-uncased.h5"
model.save(model_save_path)

In [None]:
#Evaluate the model on the validation (dev) split

# One-hot encode the validation class codes. The codes live in val['category'];
# num_classes is fixed to the number of training classes so the target width
# does not shrink when the highest-numbered class is absent from this split.
test_y_label = to_categorical(
    val['category'],
    num_classes=len(train['label_label'].cat.categories))

# Tokenize to exactly max_length tokens, the fixed input length of the model
test_x = tokenizer(
    text=val['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids'], 'attention_mask': test_x['attention_mask']},
    y={'category': test_y_label}
)

In [None]:
# Keep the per-epoch metrics recorded by model.fit and list which metrics are available
history_dict = history.history
history_dict.keys()

In [None]:
#plot the training and validation loss for comparison

# acc, val_acc and epochs are also read by the accuracy plot in the next cell.
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
# Named train_loss (not `loss`) so the loss dictionary passed to model.compile
# is not overwritten, which would break re-running the training cell.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, train_loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Draw training accuracy (dots) against validation accuracy (line) for each epoch
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')

plt.show()

In [None]:
# Tokenize the unlabeled test texts. padding='max_length' pads every sequence to
# exactly max_length tokens, the fixed input length the model was built with;
# padding=True would only pad to the longest test sequence.
test_x = tokenizer(
    text=test['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Per-class scores for every test row, returned under the 'category' output key
predictions = model.predict({'input_ids': test_x['input_ids'], 'attention_mask': test_x['attention_mask']})

In [None]:
# Sanity check: show the test texts and the number of prediction rows produced for them
print(test['text'])
print(len(predictions['category']))

###Write to CSV

In [None]:
#Write out to the csv file

# Class names in the order of the integer codes the model was trained on
# (position i holds the label whose categorical code is i). Taking them from the
# training frame keeps the decoding tied to the encoding step.
class_names = list(train['label_label'].cat.categories)

# One row of class scores per test text; the predicted class of a row is the
# position of that row's own highest score.
label_array = np.asarray(predictions['category'])
predicted_codes = label_array.argmax(axis=1)

# Rows of [1-based id, text, predicted label]
arry = [
    [index, text, class_names[code]]
    for index, (text, code) in enumerate(zip(test['text'], predicted_codes), start=1)
]

# Three values per row need three column names
pre = pd.DataFrame(arry, columns=['id', 'text', 'category'])
pre.to_csv('/content/drive/My Drive/Sentiment Analysis Fire/output/preprocessed/mBERT-uncased.csv', header=None, index=False)