## **Setting up Development Environment**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish


In [4]:
! pip install -U -q transformers
! pip install -U -q sentencepiece

[K     |████████████████████████████████| 2.9 MB 5.1 MB/s 
[K     |████████████████████████████████| 56 kB 4.0 MB/s 
[K     |████████████████████████████████| 895 kB 46.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 37.2 MB/s 
[K     |████████████████████████████████| 636 kB 54.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[?25h

## **Importing Libraries**

In [5]:
import pandas as pd

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import RobertaTokenizer, RobertaModel

import pickle

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Loading Data**

In [6]:
dataframe = pd.read_csv("Dataset/Tamil_hasoc_tanglish_test_withlabels.tsv", sep='\t')

In [7]:
dataframe.head()

Unnamed: 0,id,text,category
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,OFF
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,NOT
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",OFF
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,OFF
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,OFF


In [8]:
dataframe.describe()

Unnamed: 0,id,text,category
count,940,940,940
unique,939,933,2
top,TA_TW13798,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,2,5,475


In [9]:
dataframe['category'].unique()

array(['OFF', 'NOT', nan], dtype=object)

In [10]:
dataframe.drop_duplicates(subset ="id",keep = False, inplace = True)
dataframe.dropna()
dataframe.describe()

Unnamed: 0,id,text,category
count,938,938,938
unique,938,932,2
top,TA_TW15673,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,1,5,473


In [11]:
dataframe['category'].unique()

array(['OFF', 'NOT'], dtype=object)

In [12]:
text = dataframe['text']
text

0      Take it this thevidiya Kandipa indha page admi...
1      enga veetla itha nadakum Athum oru varushama t...
2      Indha Sallli Punda, Dummy Pundalam Orama Iruka...
3      Juriya poola tier 1 la umburan tha kulla punda...
4      Kullans lam umba therila Loosu kuthi maari umb...
                             ...                        
934    Watha  manasaatchi punda irukka daa unakuu... ...
935    Woman with a bhindi picture on a logo promotes...
936    Worst ra Vara vara Namma Society kevalama poit...
937    Yeallarukum reply pandringa namma #TAG RakidaR...
938    Yeva yeva valkaiyelam flashlight adicha maari ...
Name: text, Length: 938, dtype: object

In [13]:
label = dataframe['category']
label

0      OFF
1      NOT
2      OFF
3      OFF
4      OFF
      ... 
934    OFF
935    OFF
936    OFF
937    NOT
938    NOT
Name: category, Length: 938, dtype: object

## **Data Preprocessing**

In [14]:
le = LabelEncoder()
label = le.fit_transform(label)

In [15]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+",' ', text)
    text = re.sub(r"#[A-Za-z0-9]+",' ', text)
    text = re.sub(r": +",' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+",' ', text)
    text = re.sub("RT ", " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [16]:
# Load Transformer Model

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [18]:
print(tokenized_input)

{'input_ids': tensor([[    0, 16111,     5, 21157, 10252,   449,   463,  1588,   102,  9473,
          1999,  1842, 28665,    50,   257,  7628, 30297,   102,   748,  2161,
           857,  2378,  2489,   385,  4134,  3335,   658,   260,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}


In [19]:
print(sample_output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0376,  0.0075,  0.0266,  ..., -0.1027, -0.0101, -0.0068],
         [ 0.0135, -0.0989,  0.0440,  ..., -0.0780,  0.0679,  0.1436],
         [-0.2444,  0.0302,  0.1901,  ..., -0.6560,  0.0514, -0.1104],
         ...,
         [-0.0641, -0.0974,  0.1978,  ..., -0.6803,  0.1431, -0.0845],
         [ 0.0808, -0.2243,  0.0170,  ...,  0.0451,  0.0058,  0.0483],
         [-0.0341, -0.0081,  0.0182,  ..., -0.1366, -0.0037, -0.0236]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-1.6409e-02, -2.2382e-01, -2.1544e-01, -1.0841e-01,  1.3327e-01,
          1.7188e-01,  2.5896e-01, -8.7242e-02, -6.0132e-02, -1.3983e-01,
          2.6041e-01, -1.5709e-02, -9.3618e-02,  8.7311e-02, -1.4475e-01,
          4.8654e-01,  2.0858e-01, -4.6549e-01,  4.3737e-02, -1.3001e-02,
         -2.4173e-01,  3.5561e-02,  4.8237e-01,  3.1196e-01,  1.1478e-01,
          7.4317e-02, -1.3974e-01, -5.4115e-02,  2.1035e-01,  2.2992

In [20]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [21]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [22]:
# Embedding the Input Data

output = []

j = 1

for i in range(len(input)):
    
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./XLM-Roberta-Embedded-TrainData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./XLM-Roberta-Embedded-TrainData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./XLM-Roberta-Embedded-TrainData/output1.pickle done
./XLM-Roberta-Embedded-TrainData/output2.pickle done
./XLM-Roberta-Embedded-TrainData/output3.pickle done
./XLM-Roberta-Embedded-TrainData/output4.pickle done
./XLM-Roberta-Embedded-TrainData/output5.pickle done


In [23]:
# Loading Embedded Input Data from Disk

output = []

for i in range(5):
    file_name = "./XLM-Roberta-Embedded-TrainData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X = output
output = []

./XLM-Roberta-Embedded-TrainData/output1.pickle done
./XLM-Roberta-Embedded-TrainData/output2.pickle done
./XLM-Roberta-Embedded-TrainData/output3.pickle done
./XLM-Roberta-Embedded-TrainData/output4.pickle done
./XLM-Roberta-Embedded-TrainData/output5.pickle done


In [24]:
X = np.array(X)
X.shape

(938, 1, 768)

In [25]:
X = X.reshape(938, 768)
X.shape

(938, 768)

In [26]:
y = label
y.shape

(938,)

## **Test Train Split**

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
X_train.shape

(750, 768)

In [29]:
X_val.shape

(188, 768)

## **Logistic Regression**

In [30]:
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        86
           1       0.71      0.71      0.71       102

    accuracy                           0.68       188
   macro avg       0.68      0.68      0.68       188
weighted avg       0.68      0.68      0.68       188



## **Support Vector Machine**

In [31]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.46      1.00      0.63        86
           1       0.00      0.00      0.00       102

    accuracy                           0.46       188
   macro avg       0.23      0.50      0.31       188
weighted avg       0.21      0.46      0.29       188



  _warn_prf(average, modifier, msg_start, len(result))


## **Naive Bayes**

In [32]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.64      0.57      0.60        86
           1       0.67      0.73      0.69       102

    accuracy                           0.65       188
   macro avg       0.65      0.65      0.65       188
weighted avg       0.65      0.65      0.65       188



## **Stochastic Gradient Descent**

In [33]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.76      0.60      0.68        86
           1       0.72      0.84      0.77       102

    accuracy                           0.73       188
   macro avg       0.74      0.72      0.73       188
weighted avg       0.74      0.73      0.73       188



## **K Nearest Neighbours**

In [34]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.64      0.58      0.61        86
           1       0.67      0.73      0.70       102

    accuracy                           0.66       188
   macro avg       0.66      0.65      0.65       188
weighted avg       0.66      0.66      0.66       188



## **Decision Tree**

In [35]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.60      0.65      0.62        86
           1       0.68      0.63      0.65       102

    accuracy                           0.64       188
   macro avg       0.64      0.64      0.64       188
weighted avg       0.64      0.64      0.64       188



## **Random Forest**

In [36]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.69      0.79      0.74        86
           1       0.80      0.71      0.75       102

    accuracy                           0.74       188
   macro avg       0.75      0.75      0.74       188
weighted avg       0.75      0.74      0.75       188



## **Majority Voting**

In [38]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Including LR, SGD & RF in predictions
    predictions = [rf_pred[i], sgd_pred[i], lr_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.67      0.65      0.66        86
           1       0.71      0.74      0.72       102

    accuracy                           0.70       188
   macro avg       0.69      0.69      0.69       188
weighted avg       0.70      0.70      0.70       188

