## **Setting up Development Environment**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish


In [3]:
! pip install -U -q transformers
! pip install -U -q sentencepiece

[K     |████████████████████████████████| 2.9 MB 14.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 65.1 MB/s 
[K     |████████████████████████████████| 895 kB 71.7 MB/s 
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
[K     |████████████████████████████████| 636 kB 68.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 12.2 MB/s 
[?25h

## **Importing Libraries**

In [4]:
import pandas as pd

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import AutoTokenizer, AutoModel

import pickle

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Loading Data**

In [5]:
dataframe = pd.read_csv("Dataset/Tamil_hasoc_tanglish_test_withlabels.tsv", sep='\t')

In [6]:
dataframe.head()

Unnamed: 0,id,text,category
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,OFF
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,NOT
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",OFF
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,OFF
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,OFF


In [7]:
dataframe.describe()

Unnamed: 0,id,text,category
count,940,940,940
unique,939,933,2
top,TA_TW13798,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,2,5,475


In [8]:
dataframe['category'].unique()

array(['OFF', 'NOT', nan], dtype=object)

In [9]:
dataframe.drop_duplicates(subset ="id",keep = False, inplace = True)
dataframe.dropna()
dataframe.describe()

Unnamed: 0,id,text,category
count,938,938,938
unique,938,932,2
top,TA_TW14240,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,1,5,473


In [10]:
dataframe['category'].unique()

array(['OFF', 'NOT'], dtype=object)

In [11]:
text = dataframe['text']
text

0      Take it this thevidiya Kandipa indha page admi...
1      enga veetla itha nadakum Athum oru varushama t...
2      Indha Sallli Punda, Dummy Pundalam Orama Iruka...
3      Juriya poola tier 1 la umburan tha kulla punda...
4      Kullans lam umba therila Loosu kuthi maari umb...
                             ...                        
934    Watha  manasaatchi punda irukka daa unakuu... ...
935    Woman with a bhindi picture on a logo promotes...
936    Worst ra Vara vara Namma Society kevalama poit...
937    Yeallarukum reply pandringa namma #TAG RakidaR...
938    Yeva yeva valkaiyelam flashlight adicha maari ...
Name: text, Length: 938, dtype: object

In [12]:
label = dataframe['category']
label

0      OFF
1      NOT
2      OFF
3      OFF
4      OFF
      ... 
934    OFF
935    OFF
936    OFF
937    NOT
938    NOT
Name: category, Length: 938, dtype: object

## **Data Preprocessing**

In [13]:
le = LabelEncoder()
label = le.fit_transform(label)

In [14]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+",' ', text)
    text = re.sub(r"#[A-Za-z0-9]+",' ', text)
    text = re.sub(r": +",' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+",' ', text)
    text = re.sub("RT ", " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [15]:
# Load Transformer Model

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModel.from_pretrained("ai4bharat/indic-bert")

Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/129M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.dense.bias', 'sop_classifier.classifier.weight', 'sop_classifier.classifier.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [17]:
print(tokenized_input)

{'input_ids': tensor([[     2,   1208,     11,  53120,  13690,  89728,    326, 132766,     26,
          29571,  25019,  32658,    388,   3067,  31698,   2804,  93131,      8,
         159891,  23496,  38726,      8,  63446,      8,    669,   3646,   1747,
              3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}


In [18]:
print(sample_output)

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-1.3630e-02,  6.3929e-04, -1.2081e-02,  ..., -2.3023e-02,
           1.9901e-03, -5.9263e-03],
         [ 4.4844e-01, -1.3437e-01, -3.7945e-01,  ...,  1.5648e-01,
           1.0581e-01, -2.6299e-01],
         [-2.4494e-01,  7.3175e-01,  1.4485e-01,  ...,  2.0364e-01,
           8.4302e-02, -8.5538e-02],
         ...,
         [ 1.6319e-02, -5.0222e-01, -4.1493e-02,  ...,  3.3591e-01,
           3.3522e-01,  8.0555e-02],
         [ 5.1483e-01, -6.1866e-02,  2.4022e-02,  ...,  3.1500e-01,
           1.8173e-01, -3.1239e-01],
         [-1.3630e-02,  6.3943e-04, -1.2080e-02,  ..., -2.3023e-02,
           1.9898e-03, -5.9266e-03]]], grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-5.4877e-02,  5.5557e-02,  1.3676e-02, -1.3421e-03,  4.3435e-02,
          8.3593e-02,  4.0740e-02, -1.3805e-02, -1.1765e-02,  6.0204e-02,
         -9.6217e-03,  4.9699e-02,  1.9730e-02, -1.7535e-02,  4.8543e-02,
         -1.8123e-02, -4.5716e-02, -1

In [19]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [20]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [21]:
# Embedding the Input Data

output = []

j = 1

for i in range(len(input)):
    
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./IndicBert-Embedded-TrainData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./IndicBert-Embedded-TrainData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./IndicBert-Embedded-TrainData/output1.pickle done
./IndicBert-Embedded-TrainData/output2.pickle done
./IndicBert-Embedded-TrainData/output3.pickle done
./IndicBert-Embedded-TrainData/output4.pickle done
./IndicBert-Embedded-TrainData/output5.pickle done


In [22]:
# Loading Embedded Input Data from Disk

output = []

for i in range(5):
    file_name = "./IndicBert-Embedded-TrainData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X = output
output = []

./IndicBert-Embedded-TrainData/output1.pickle done
./IndicBert-Embedded-TrainData/output2.pickle done
./IndicBert-Embedded-TrainData/output3.pickle done
./IndicBert-Embedded-TrainData/output4.pickle done
./IndicBert-Embedded-TrainData/output5.pickle done


In [23]:
X = np.array(X)
X.shape

(938, 1, 768)

In [24]:
X = X.reshape(938, 768)
X.shape

(938, 768)

In [25]:
y = label
y.shape

(938,)

## **Test Train Split**

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
X_train.shape

(750, 768)

In [28]:
X_val.shape

(188, 768)

## **Logistic Regression**

In [29]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.55      0.70      0.61        86
           1       0.67      0.51      0.58       102

    accuracy                           0.60       188
   macro avg       0.61      0.60      0.60       188
weighted avg       0.61      0.60      0.59       188



## **Support Vector Machine**

In [30]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.46      1.00      0.63        86
           1       0.00      0.00      0.00       102

    accuracy                           0.46       188
   macro avg       0.23      0.50      0.31       188
weighted avg       0.21      0.46      0.29       188



  _warn_prf(average, modifier, msg_start, len(result))


## **Naive Bayes**

In [31]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60        86
           1       0.66      0.66      0.66       102

    accuracy                           0.63       188
   macro avg       0.63      0.63      0.63       188
weighted avg       0.63      0.63      0.63       188



## **Stochastic Gradient Descent**

In [32]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.69      0.44      0.54        86
           1       0.64      0.83      0.72       102

    accuracy                           0.65       188
   macro avg       0.67      0.64      0.63       188
weighted avg       0.66      0.65      0.64       188



## **K Nearest Neighbours**

In [33]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.67      0.58      0.62        86
           1       0.68      0.75      0.72       102

    accuracy                           0.68       188
   macro avg       0.67      0.67      0.67       188
weighted avg       0.67      0.68      0.67       188



## **Decision Tree**

In [34]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60        86
           1       0.67      0.67      0.67       102

    accuracy                           0.64       188
   macro avg       0.64      0.64      0.64       188
weighted avg       0.64      0.64      0.64       188



## **Random Forest**

In [35]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74        86
           1       0.78      0.79      0.79       102

    accuracy                           0.77       188
   macro avg       0.76      0.76      0.76       188
weighted avg       0.77      0.77      0.77       188



## **Majority Voting**

In [36]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Including KNN, SGD & RF in predictions
    predictions = [knn_pred[i], sgd_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.78      0.62      0.69        86
           1       0.72      0.85      0.78       102

    accuracy                           0.74       188
   macro avg       0.75      0.73      0.74       188
weighted avg       0.75      0.74      0.74       188

