### Installation & Imports

In [None]:
from google.colab import drive
# Mount Google Drive at '/content/drive'; the dataset, tokenizer and model
# files used by later cells are read from and written to paths under it.
drive.mount('/content/drive')

In [None]:
!pip install sentencepiece
!pip install emoji
!pip install emot
!pip install nltk

In [None]:
from fastai.text import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb

from nltk.corpus import words
from bs4 import BeautifulSoup
import nltk
nltk.download('words')
import nltk, string, re, spacy,unicodedata, random
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import ToktokTokenizer
import nltk, string, re, spacy,unicodedata, random

In [None]:
import fastai, torch
# Show the installed fastai and torch versions as the cell output.
fastai.__version__ , torch.__version__

In [None]:
# Select GPU 0 when CUDA is present. On a CPU-only runtime the call is skipped
# (it would raise there) so the rest of the notebook can still execute.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
else:
    print('CUDA is not available; continuing on CPU.')

In [None]:
def random_seed(seed_value, use_cuda):
    """Seed NumPy, PyTorch and Python's `random` with `seed_value`.

    When `use_cuda` is truthy, the CUDA generators are seeded as well and
    cuDNN is switched to deterministic mode with benchmarking disabled.
    Returns None.
    """
    # Same seeding order as before: NumPy, then torch, then the stdlib RNG.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Seed every RNG with 42, including the CUDA generators and cuDNN flags.
random_seed(42, True)

In [None]:
# Directory passed as `path` to the fastai DataBunch constructors below.
path = Path('/tmp')

In [None]:
# Shell command: print the current working directory (diagnostic only).
!pwd

### Import Data

In [None]:
# Training split: read the header-less CSV from Drive, discard every row that
# contains a missing value, then display the resulting frame.
df_train = (
    pd.read_csv(
        '/content/drive/My Drive/Sentiment Analysis Fire/data/tamil_sentiment_full_train.csv',
        header=None,
    )
    .dropna()
)
df_train

In [None]:
# Number of training rows remaining after rows with missing values were dropped.
print(len(df_train))

In [None]:
# Dev split: read the header-less CSV from Drive, discard every row that
# contains a missing value, then display the resulting frame.
df_valid = (
    pd.read_csv(
        '/content/drive/My Drive/Sentiment Analysis Fire/data/tamil_sentiment_full_dev.csv',
        header=None,
    )
    .dropna()
)
df_valid

In [None]:
# Test split: read the header-less CSV from Drive. Unlike the train/dev
# frames, rows with missing values are NOT dropped here.
df_test = pd.read_csv('/content/drive/My Drive/Sentiment Analysis Fire/data/tamil_sentiment_full_test_withflabels.csv', header=None)
df_test

### Preprocessing

In [None]:
def convert_emoticons(text, emoticons=None):
  """Replace every known emoticon in `text` with an underscore-joined description.

  Args:
    text: input string.
    emoticons: optional mapping of emoticon pattern -> description. Each key is
      inserted into a regular expression as-is. Defaults to the module-level
      `EMOTICONS` table.
      NOTE(review): `EMOTICONS` is not defined or imported in the visible part
      of this notebook (presumably emot's emoticon table) -- make sure it is in
      scope before relying on the default.

  Returns:
    `text` where each match of each pattern is replaced by its description
    with commas removed and words joined by "_".
  """
  if emoticons is None:
    emoticons = EMOTICONS
  for emot, meaning in emoticons.items():
    replacement = "_".join(meaning.replace(",", "").split())
    text = re.sub(u'(' + emot + ')', replacement, text)
  # Return only after ALL entries were applied. The return used to sit inside
  # the loop, so only the first table entry was ever substituted (and an empty
  # table produced None).
  return text

def preprocess(text):
  """Normalise one raw comment into a single-space-separated token string.

  Steps, in order:
    1. strip HTML markup,
    2. drop URLs,
    3. turn emojis and emoticons into words,
    4. replace separator characters by spaces; drop space-prefixed digit runs,
       commas and isolated single characters,
    5. delete the remaining punctuation,
    6. tokenize with ToktokTokenizer and re-join with single spaces.

  HTML and URL removal run first on purpose: they used to run after '/', '.',
  '<', '>', '&' and ';' had already been stripped, so neither step could ever
  match anything.

  NOTE(review): `emoji` is used here but is not imported in the visible part
  of this notebook -- make sure it is in scope.
  """
  # Needs the raw '<', '>', '&', ';' characters, so it must precede punctuation removal.
  text = BeautifulSoup(text, 'html.parser').get_text()
  # Needs the raw '://', '.', '/' characters; also keeps ':/' inside a URL
  # from being interpreted as an emoticon below.
  to_remove_url = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
      r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
  text = re.sub(to_remove_url, '', text)  # remove url patterns
  text = emoji.demojize(text)  # convert emojis to their definitions in words
  text = convert_emoticons(text)
  text = re.sub(r'([\.\'\"\/\-\_\--])',' ', text)  # . ' " / - _  ->  space
  text = re.sub(r" \d+", " ", text)  # drop digit runs that follow a space
  text = text.replace(",", " ")
  text = re.sub(r'(?:^| )\w(?:$| )', ' ', text).strip()  # drop isolated single characters
  punctuation = '!!"$%&()*+-/:;<=>?[\\]^_{|}~.'
  # One translation table instead of rebuilding set(punctuation) per character.
  text = text.translate(str.maketrans('', '', punctuation))
  # Tokenize, trim each token and re-join with single spaces.
  tokens = ToktokTokenizer().tokenize(text)
  return ' '.join(token.strip() for token in tokens)

def clean(df, text_col='text'):
  """Apply `preprocess` in place to the text column of `df`.

  Args:
    df: DataFrame to modify in place.
    text_col: label of the column holding the raw text. Defaults to 'text'
      so existing calls keep working.

  Non-string cells (e.g. NaN) are left untouched instead of being passed to
  `preprocess`. Returns None.
  """
  df[text_col] = df[text_col].apply(lambda x: preprocess(x) if isinstance(x, str) else x)

# The frames loaded above are df_train / df_valid / df_test, and because the
# CSVs are read with header=None the text is in column 1 (column 0 holds the
# label). The previous calls referenced undefined names and a missing 'text' column.
clean(df_train, text_col=1)
clean(df_valid, text_col=1)
clean(df_test, text_col=1)

### Model

In [None]:
# (rows, columns) of the train, dev and test frames.
df_train.shape, df_valid.shape, df_test.shape

In [None]:
# Sanity check: shape of the subset of each frame whose column 0 is null.
df_train[df_train[0].isnull()].shape, df_valid[df_valid[0].isnull()].shape, df_test[df_test[0].isnull()].shape

In [None]:
class TamilTokenizer(BaseTokenizer):
    """Tokenizer backend that splits text into SentencePiece pieces.

    The SentencePiece model is loaded from the fixed Drive path each time an
    instance is created. `lang` is stored on the instance but not otherwise
    used by this class.
    """

    def __init__(self, lang:str):
        self.lang = lang
        model_file = '/content/drive/My Drive/Sentiment Analysis Fire/tokenizer/tamil_spm_8k.model'
        processor = spm.SentencePieceProcessor()
        processor.Load(model_file)
        self.sp = processor

    def tokenizer(self, t:str) -> List[str]:
        """Return the SentencePiece pieces of `t`."""
        pieces = self.sp.EncodeAsPieces(t)
        return pieces

In [None]:
# Load the same SentencePiece model and list the pieces for ids 0..7999, so
# that position i in `itos` is the piece with SentencePiece id i.
# NOTE(review): 8000 is hard-coded; presumably the model's vocabulary size -- confirm.
sp = spm.SentencePieceProcessor()
sp.Load('/content/drive/My Drive/Sentiment Analysis Fire/tokenizer/tamil_spm_8k.model')
itos = list(map(sp.IdToPiece, range(8000)))

In [None]:
# fastai Vocab built from the SentencePiece piece list.
tamil_vocab = Vocab(itos)

In [None]:
# fastai Tokenizer that uses TamilTokenizer as its tokenizing class, with lang='ta'.
tokenizer = Tokenizer(tok_func=TamilTokenizer, lang='ta')

In [None]:
# Column 0 of the frames holds the label; passed to the classifier DataBunch below.
label_cols = [0]

In [None]:
# Language-model DataBunch over the train, dev and test frames, tokenized
# with the SentencePiece tokenizer and the fixed Tamil vocabulary.
# NOTE(review): the test frame is included in the LM data -- confirm this is intended.
data_lm = TextLMDataBunch.from_df(
    path=path,
    train_df=df_train,
    valid_df=df_valid,
    test_df=df_test,
    tokenizer=tokenizer,
    vocab=tamil_vocab,
)

In [None]:
# Display a sample batch from the language-model data.
data_lm.show_batch()

In [None]:
# Copy fastai's default AWD-LSTM language-model config with the hidden size
# set to 1150, then build a language-model learner without fastai's
# pretrained weights.
awd_lstm_config = dict(awd_lstm_lm_config, n_hid=1150)
learn = language_model_learner(
    data_lm,
    arch=AWD_LSTM,
    config=awd_lstm_config,
    drop_mult=0.3,
    pretrained=False,
)

In [None]:
# Load previously saved learner weights from Drive.
# NOTE(review): presumably a Tamil Wikipedia-pretrained language model -- confirm.
learn.load('/content/drive/My Drive/Sentiment Analysis Fire/models/wikitalm_8k_447_third')

In [None]:
# Freeze the learner before the first fine-tuning pass.
learn.freeze()

### Training

In [None]:
# Fine-tune the frozen language model: 5 epochs, one-cycle policy, max LR 1e-2.
learn.fit_one_cycle(5, 1e-2)

In [None]:
# Unfreeze the learner so the next pass trains the whole model.
learn.unfreeze()

In [None]:
# Continue fine-tuning the unfrozen language model: 10 epochs, max LR 1e-3.
learn.fit_one_cycle(10, 1e-3)

In [None]:
# Qualitative check: let the language model generate 10 tokens after a Tamil prompt
# (the prompt is written with SentencePiece '▁' word markers).
learn.predict('▁தனது ▁சொந்த',n_words=10)

In [None]:
# Save the fine-tuned encoder to Drive; the classifier loads it back from the same path.
learn.save_encoder('/content/drive/My Drive/Sentiment Analysis Fire/models/preprocessed/ulmfit')

In [None]:
# Classification DataBunch over the same three frames: same tokenizer and
# vocabulary as the language model, labels taken from `label_cols`, batch size 64.
data_clas = TextClasDataBunch.from_df(
    path=path,
    train_df=df_train,
    valid_df=df_valid,
    test_df=df_test,
    tokenizer=tokenizer,
    vocab=tamil_vocab,
    label_cols=label_cols,
    bs=64,
)

In [None]:
# Display a sample batch from the classification data.
data_clas.show_batch()

In [None]:
# Remove 'tie_weights' and 'out_bias' from the config before it is reused for
# the classifier (presumably these are language-model-only settings -- confirm).
# pop(..., None) instead of `del` keeps the cell safe to re-run: a second
# execution no longer raises KeyError.
for lm_only_key in ('tie_weights', 'out_bias'):
    awd_lstm_config.pop(lm_only_key, None)

In [None]:
# Build the AWD-LSTM text classifier on the classification data (replaces the
# language-model learner previously bound to `learn`).
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5, config=awd_lstm_config)

In [None]:
# Load the encoder that was saved after language-model fine-tuning.
learn.load_encoder('/content/drive/My Drive/Sentiment Analysis Fire/models/preprocessed/ulmfit')

In [None]:
# Freeze the classifier before the first training pass.
learn.freeze()

In [None]:
# Inspect the function wrapped by the learner's loss (displayed as cell output).
learn.loss_func.func

In [None]:
# Matthews correlation coefficient metric instance.
mcc = MatthewsCorreff()

In [None]:
# Report both Matthews correlation and accuracy during training.
learn.metrics = [mcc, accuracy]

In [None]:
# Train the frozen classifier: 10 epochs, one-cycle policy, max LR 1e-2.
learn.fit_one_cycle(10, 1e-2)

In [None]:
# Gradual unfreezing: freeze up to layer group -2, then train 10 more epochs at max LR 1e-2.
learn.freeze_to(-2)
learn.fit_one_cycle(10, 1e-2)

In [None]:
# Unfreeze the whole classifier and train 30 epochs at max LR 1e-3.
# SaveModelCallback monitors 'accuracy' and saves a checkpoint named 'final'
# on every improvement.
learn.unfreeze()
learn.fit_one_cycle(30, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])

### Predictions

In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef

# Labels seen in training; each one gets its own probability column.
all_nodes = list(set(df_train[0]))

# Invert fastai's class -> index map so argmax indices can be turned back into labels.
i2c = {class_idx: label for label, class_idx in learn.data.c2i.items()}

# preds[0] is indexed as [row][class] and holds the per-class scores for the
# test set in its original order.
preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
probs = preds[0]
assert len(probs) == df_test.shape[0], 'number of predictions differs from number of test rows'

# Build the result frame column by column. The previous version assigned
# into the `row` objects yielded by DataFrame.iterrows(); those may be copies,
# in which case the writes never reach the frame and every prediction column
# silently stays empty.
df_result = pd.DataFrame({'query': list(df_test[1]), 'actual_label': list(df_test[0])})
df_result['predicted_label'] = [i2c[class_idx] for class_idx in probs.argmax(dim=1).tolist()]
for node in all_nodes:
    df_result[node] = probs[:, learn.data.c2i[node]].tolist()
df_result.head(50)

In [None]:
# Accuracy of the predicted labels against the test labels.
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

In [None]:
# Matthews correlation coefficient of the predicted labels against the test labels.
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

### Write to CSV

In [None]:
# Write (text, predicted label) pairs to Drive without header or index.
# The two-column view gets its own name so `df_result` keeps `actual_label`
# and the metric cells above can still be re-run after this cell.
df_submission = df_result[['query','predicted_label']]
df_submission.to_csv('/content/drive/My Drive/Sentiment Analysis Fire/output/preprocessed/UMLFit.csv', index=False, header=None)