## **Setting up Development Environment**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish


In [4]:
! pip install -U -q transformers
! pip install -U -q sentencepiece

[K     |████████████████████████████████| 2.9 MB 5.1 MB/s 
[K     |████████████████████████████████| 895 kB 61.6 MB/s 
[K     |████████████████████████████████| 56 kB 4.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 45.3 MB/s 
[K     |████████████████████████████████| 636 kB 43.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.2 MB/s 
[?25h

## **Importing Libraries**

In [5]:
import pandas as pd

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import BertModel, BertTokenizer

import pickle

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Loading Data**

In [6]:
dataframe = pd.read_csv("Dataset/Tamil_hasoc_tanglish_test_withlabels.tsv", sep='\t')

In [7]:
dataframe.head()

Unnamed: 0,id,text,category
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,OFF
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,NOT
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",OFF
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,OFF
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,OFF


In [8]:
dataframe.describe()

Unnamed: 0,id,text,category
count,940,940,940
unique,939,933,2
top,TA_TW13798,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,2,5,475


In [9]:
dataframe['category'].unique()

array(['OFF', 'NOT', nan], dtype=object)

In [10]:
dataframe.drop_duplicates(subset ="id",keep = False, inplace = True)
dataframe.dropna()
dataframe.describe()

Unnamed: 0,id,text,category
count,938,938,938
unique,938,932,2
top,TA_TW13104,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,1,5,473


In [11]:
dataframe['category'].unique()

array(['OFF', 'NOT'], dtype=object)

In [12]:
text = dataframe['text']
text

0      Take it this thevidiya Kandipa indha page admi...
1      enga veetla itha nadakum Athum oru varushama t...
2      Indha Sallli Punda, Dummy Pundalam Orama Iruka...
3      Juriya poola tier 1 la umburan tha kulla punda...
4      Kullans lam umba therila Loosu kuthi maari umb...
                             ...                        
934    Watha  manasaatchi punda irukka daa unakuu... ...
935    Woman with a bhindi picture on a logo promotes...
936    Worst ra Vara vara Namma Society kevalama poit...
937    Yeallarukum reply pandringa namma #TAG RakidaR...
938    Yeva yeva valkaiyelam flashlight adicha maari ...
Name: text, Length: 938, dtype: object

In [13]:
label = dataframe['category']
label

0      OFF
1      NOT
2      OFF
3      OFF
4      OFF
      ... 
934    OFF
935    OFF
936    OFF
937    NOT
938    NOT
Name: category, Length: 938, dtype: object

## **Data Preprocessing**

In [14]:
le = LabelEncoder()
label = le.fit_transform(label)

In [15]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+",' ', text)
    text = re.sub(r"#[A-Za-z0-9]+",' ', text)
    text = re.sub(r": +",' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+",' ', text)
    text = re.sub("RT ", " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [16]:
# Load Transformer Model

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [18]:
print(tokenized_input)

{'input_ids': tensor([[  101,  2202,  1996, 17258,  8717, 22827,  4305,  4502, 27427,  3270,
          3931,  4748, 10020,  2030,  2226,  8203, 26136,  2850, 17027,  5470,
          5292, 28144,  2319,  9413,  6279,  2319,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}


In [19]:
print(sample_output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.6253, -0.1190,  0.0041,  ..., -0.3485,  0.2976,  0.3229],
         [-0.2394,  0.0336,  0.5398,  ..., -0.0825,  0.4288,  0.0590],
         [-0.8110, -0.7738, -0.5849,  ...,  0.4192,  0.7142, -0.3646],
         ...,
         [ 0.5793,  0.3475, -0.1441,  ..., -0.1654, -0.5770, -0.2730],
         [-0.1861,  0.0129, -0.4758,  ..., -0.3888, -0.2861,  0.2593],
         [ 0.4556,  0.1789, -0.0718,  ...,  0.1174, -0.3462,  0.0113]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.7491, -0.4548, -0.4597,  0.6349,  0.2995, -0.0407,  0.7387,  0.3012,
          0.1202, -0.9999,  0.0898,  0.4543,  0.9732, -0.1028,  0.8099, -0.4019,
         -0.0739, -0.5381,  0.4283, -0.2793,  0.5231,  0.9982,  0.4657,  0.2171,
          0.3105,  0.6480, -0.5481,  0.8431,  0.9205,  0.7028, -0.4274,  0.1808,
         -0.9771, -0.2144, -0.6255, -0.9830,  0.1855, -0.6078, -0.0442,  0.0749,
         -0.7606,  0.2955,  0.999

In [20]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [21]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [22]:
# Embedding the Input Data

output = []

j = 1

for i in range(len(input)):
    
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./MaskedLM-Embedded-TrainData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./MaskedLM-Embedded-TrainData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./MaskedLM-Embedded-TrainData/output1.pickle done
./MaskedLM-Embedded-TrainData/output2.pickle done
./MaskedLM-Embedded-TrainData/output3.pickle done
./MaskedLM-Embedded-TrainData/output4.pickle done
./MaskedLM-Embedded-TrainData/output5.pickle done


In [23]:
# Loading Embedded Input Data from Disk

output = []

for i in range(5):
    file_name = "./MaskedLM-Embedded-TrainData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X = output
output = []

./MaskedLM-Embedded-TrainData/output1.pickle done
./MaskedLM-Embedded-TrainData/output2.pickle done
./MaskedLM-Embedded-TrainData/output3.pickle done
./MaskedLM-Embedded-TrainData/output4.pickle done
./MaskedLM-Embedded-TrainData/output5.pickle done


In [24]:
X = np.array(X)
X.shape

(938, 1, 768)

In [25]:
X = X.reshape(938, 768)
X.shape

(938, 768)

In [26]:
y = label
y.shape

(938,)

## **Test Train Split**

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
X_train.shape

(750, 768)

In [29]:
X_val.shape

(188, 768)

## **Logistic Regression**

In [38]:
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.73      0.78      0.75        86
           1       0.80      0.75      0.78       102

    accuracy                           0.77       188
   macro avg       0.77      0.77      0.77       188
weighted avg       0.77      0.77      0.77       188



## **Support Vector Machine**

In [31]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.60      0.76      0.67        86
           1       0.74      0.58      0.65       102

    accuracy                           0.66       188
   macro avg       0.67      0.67      0.66       188
weighted avg       0.68      0.66      0.66       188



## **Naive Bayes**

In [32]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.65      0.63      0.64        86
           1       0.70      0.72      0.71       102

    accuracy                           0.68       188
   macro avg       0.67      0.67      0.67       188
weighted avg       0.67      0.68      0.68       188



## **Stochastic Gradient Descent**

In [33]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.50      0.97      0.66        86
           1       0.86      0.19      0.31       102

    accuracy                           0.54       188
   macro avg       0.68      0.58      0.48       188
weighted avg       0.70      0.54      0.47       188



## **K Nearest Neighbours**

In [34]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.59      0.66      0.62        86
           1       0.68      0.61      0.64       102

    accuracy                           0.63       188
   macro avg       0.63      0.64      0.63       188
weighted avg       0.64      0.63      0.63       188



## **Decision Tree**

In [35]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.56      0.63      0.59        86
           1       0.65      0.58      0.61       102

    accuracy                           0.60       188
   macro avg       0.60      0.60      0.60       188
weighted avg       0.61      0.60      0.60       188



## **Random Forest**

In [36]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.66      0.77      0.71        86
           1       0.77      0.67      0.72       102

    accuracy                           0.71       188
   macro avg       0.72      0.72      0.71       188
weighted avg       0.72      0.71      0.71       188



## **Majority Voting**

In [39]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Including KNN, SGD & RF in predictions
    predictions = [lr_pred[i], rf_pred[i], nb_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.69      0.79      0.74        86
           1       0.80      0.70      0.74       102

    accuracy                           0.74       188
   macro avg       0.74      0.74      0.74       188
weighted avg       0.75      0.74      0.74       188

