## **Setting up Development Environment**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/HL

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/HL


In [3]:
! pip install -U -q transformers
! pip install -U -q sentencepiece

[K     |████████████████████████████████| 2.9 MB 9.6 MB/s 
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
[K     |████████████████████████████████| 636 kB 56.3 MB/s 
[K     |████████████████████████████████| 895 kB 51.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 42.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 6.6 MB/s 
[?25h

## **Importing Libraries**

In [4]:
import pandas as pd

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import BertModel, BertTokenizer

import pickle

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Loading Data**

In [5]:
dataframe = pd.read_csv("Dataset/Tamil-Codemixed_offensive_data_Training-Tweet-HL.csv")

In [6]:
dataframe.head()

Unnamed: 0,id,text,category
0,TA_HL100,Iyaooo Kovam pattutene sothula visatha vachuru...,NOT
1,TA_HL101,@Asha Apo neenga atha government ku theriya pa...,NOT
2,TA_HL102,@Bala sundar ayyo sorry...antha line ah explai...,NOT
3,TA_HL105,@kalimuthu ne ena lusa...yaaru edhu panaalum e...,NOT
4,TA_HL109,1st baby ku neat ah feed panunga plzz ipdi iru...,NOT


In [7]:
dataframe.describe()

Unnamed: 0,id,text,category
count,4000,4000,4000
unique,3999,3657,2
top,TA_TW1600,@USER Mafia Mari yamthirathinga making vice ve...,NOT
freq,2,2,2020


In [8]:
dataframe['category'].unique()

array(['NOT', 'OFF'], dtype=object)

In [9]:
dataframe.drop_duplicates(subset ="id",keep = False, inplace = True)
dataframe.describe()

Unnamed: 0,id,text,category
count,3998,3998,3998
unique,3998,3656,2
top,TA_TW1684,@USER Watch Reeves's training videos bro. The ...,NOT
freq,1,2,2020


In [10]:
dataframe.shape

(3998, 3)

In [11]:
text = dataframe['text']
text

0       Iyaooo Kovam pattutene sothula visatha vachuru...
1       @Asha Apo neenga atha government ku theriya pa...
2       @Bala sundar ayyo sorry...antha line ah explai...
3       @kalimuthu ne ena lusa...yaaru edhu panaalum e...
4       1st baby ku neat ah feed panunga plzz ipdi iru...
                              ...                        
3995    Yaroda body structure semaya irukum? Sema mood...
3996    Yenda naangala politics varom nu pala varusham...
3997    Yepdithan seruppala adichalum arasiyalvathikku...
3998    @USER Paithiyam ena unga vanthu full ah forwar...
3999    RT @USER : Itha vidaa kevalam veraa irukaa vij...
Name: text, Length: 3998, dtype: object

In [12]:
label = dataframe['category']
label

0       NOT
1       NOT
2       NOT
3       NOT
4       NOT
       ... 
3995    OFF
3996    OFF
3997    OFF
3998    OFF
3999    OFF
Name: category, Length: 3998, dtype: object

## **Data Preprocessing**

In [13]:
le = LabelEncoder()
label = le.fit_transform(label)

In [14]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+",' ', text)
    text = re.sub(r"#[A-Za-z0-9]+",' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+",' ', text)
    text = re.sub("RT ", " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [15]:
# Load Transformer Model

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [17]:
print(tokenized_input)

{'input_ids': tensor([[  101,  1045,  3148,  9541,  2080, 12849,  3567,  2213,  6986, 24518,
          2078,  2061,  2705,  7068,  9425,  8322, 12436, 20760,  6820,  6212,
         16377,  2140,  1012,  1012, 22834,  9541,  1012,  1012,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}


In [18]:
print(sample_output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4045,  0.1203,  0.1953,  ..., -0.3334,  0.7986,  0.5435],
         [-0.1979,  0.1614, -0.3499,  ..., -0.4713,  0.7413,  0.8708],
         [ 0.1189,  1.1128,  0.4292,  ..., -0.0727,  0.1078,  0.5697],
         ...,
         [ 0.3759, -0.8751,  0.9405,  ...,  0.3391,  0.9038, -0.3161],
         [ 0.1426, -0.6380,  0.3954,  ...,  0.4370,  0.5188, -0.1416],
         [ 0.9103,  0.0848,  0.1497,  ..., -0.1298, -0.3478, -0.0284]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.9020, -0.5766, -0.9871,  0.9061,  0.8984, -0.2459,  0.9476,  0.4265,
         -0.9627, -1.0000, -0.7585,  0.9694,  0.9671,  0.8646,  0.9423, -0.8508,
         -0.4874, -0.6445,  0.4881, -0.6217,  0.8461,  1.0000, -0.3983,  0.4419,
          0.6120,  0.9979, -0.8146,  0.9263,  0.9466,  0.6439, -0.8329,  0.2605,
         -0.9825, -0.3722, -0.9916, -0.9873,  0.5257, -0.6695, -0.3626, -0.0141,
         -0.8969,  0.3781,  1.000

In [19]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [20]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [21]:
# Embedding the Input Data

output = []

j = 1

for i in range(len(input)):
    
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./MaskedLM-Embedded-TrainData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./MaskedLM-Embedded-TrainData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./MaskedLM-Embedded-TrainData/output1.pickle done
./MaskedLM-Embedded-TrainData/output2.pickle done
./MaskedLM-Embedded-TrainData/output3.pickle done
./MaskedLM-Embedded-TrainData/output4.pickle done
./MaskedLM-Embedded-TrainData/output5.pickle done
./MaskedLM-Embedded-TrainData/output6.pickle done
./MaskedLM-Embedded-TrainData/output7.pickle done
./MaskedLM-Embedded-TrainData/output8.pickle done
./MaskedLM-Embedded-TrainData/output9.pickle done
./MaskedLM-Embedded-TrainData/output10.pickle done
./MaskedLM-Embedded-TrainData/output11.pickle done
./MaskedLM-Embedded-TrainData/output12.pickle done
./MaskedLM-Embedded-TrainData/output13.pickle done
./MaskedLM-Embedded-TrainData/output14.pickle done
./MaskedLM-Embedded-TrainData/output15.pickle done
./MaskedLM-Embedded-TrainData/output16.pickle done
./MaskedLM-Embedded-TrainData/output17.pickle done
./MaskedLM-Embedded-TrainData/output18.pickle done
./MaskedLM-Embedded-TrainData/output19.pickle done
./MaskedLM-Embedded-TrainData/output20.p

In [22]:
# Loading Embedded Input Data from Disk

output = []

for i in range(20):
    file_name = "./MaskedLM-Embedded-TrainData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X = output
output = []

./MaskedLM-Embedded-TrainData/output1.pickle done
./MaskedLM-Embedded-TrainData/output2.pickle done
./MaskedLM-Embedded-TrainData/output3.pickle done
./MaskedLM-Embedded-TrainData/output4.pickle done
./MaskedLM-Embedded-TrainData/output5.pickle done
./MaskedLM-Embedded-TrainData/output6.pickle done
./MaskedLM-Embedded-TrainData/output7.pickle done
./MaskedLM-Embedded-TrainData/output8.pickle done
./MaskedLM-Embedded-TrainData/output9.pickle done
./MaskedLM-Embedded-TrainData/output10.pickle done
./MaskedLM-Embedded-TrainData/output11.pickle done
./MaskedLM-Embedded-TrainData/output12.pickle done
./MaskedLM-Embedded-TrainData/output13.pickle done
./MaskedLM-Embedded-TrainData/output14.pickle done
./MaskedLM-Embedded-TrainData/output15.pickle done
./MaskedLM-Embedded-TrainData/output16.pickle done
./MaskedLM-Embedded-TrainData/output17.pickle done
./MaskedLM-Embedded-TrainData/output18.pickle done
./MaskedLM-Embedded-TrainData/output19.pickle done
./MaskedLM-Embedded-TrainData/output20.p

In [23]:
X = np.array(X)
X.shape

(3998, 1, 768)

In [24]:
X = X.reshape(3998, 768)
X.shape

(3998, 768)

In [25]:
y = label
y.shape

(3998,)

## **Train Test Split**

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
X_train.shape

(3198, 768)

In [28]:
X_val.shape

(800, 768)

## **Logistic Regression**

In [36]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.77      0.71      0.74       420
           1       0.71      0.76      0.73       380

    accuracy                           0.73       800
   macro avg       0.74      0.74      0.73       800
weighted avg       0.74      0.73      0.74       800



## **Support Vector Machine**

In [30]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.64      0.52      0.57       420
           1       0.56      0.68      0.62       380

    accuracy                           0.60       800
   macro avg       0.60      0.60      0.60       800
weighted avg       0.60      0.60      0.59       800



## **Naive Bayes**

In [31]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.67      0.07      0.13       420
           1       0.48      0.96      0.64       380

    accuracy                           0.49       800
   macro avg       0.58      0.52      0.39       800
weighted avg       0.58      0.49      0.37       800



## **Stochastic Gradient Descent**

In [32]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.77      0.69      0.72       420
           1       0.69      0.77      0.73       380

    accuracy                           0.73       800
   macro avg       0.73      0.73      0.72       800
weighted avg       0.73      0.72      0.72       800



## **K Nearest Neighbours**

In [33]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.65      0.58      0.61       420
           1       0.58      0.66      0.62       380

    accuracy                           0.61       800
   macro avg       0.62      0.62      0.61       800
weighted avg       0.62      0.61      0.61       800



## **Decision Tree**

In [34]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.63      0.64      0.63       420
           1       0.59      0.58      0.59       380

    accuracy                           0.61       800
   macro avg       0.61      0.61      0.61       800
weighted avg       0.61      0.61      0.61       800



## **Random Forest**

In [35]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       420
           1       0.65      0.71      0.68       380

    accuracy                           0.68       800
   macro avg       0.68      0.68      0.68       800
weighted avg       0.68      0.68      0.68       800



## **Majority Voting**

In [37]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Including only LR, SGD & RF in the predictions
    predictions = [lr_pred[i], rf_pred[i], sgd_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.77      0.70      0.73       420
           1       0.70      0.77      0.73       380

    accuracy                           0.73       800
   macro avg       0.73      0.73      0.73       800
weighted avg       0.73      0.73      0.73       800

