## **Setting up the Development Environment**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Task 2

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Task 2


In [3]:
! pip install -U -q transformers
! pip install -U -q sentencepiece

[K     |████████████████████████████████| 2.9 MB 5.0 MB/s 
[K     |████████████████████████████████| 895 kB 43.5 MB/s 
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 42.4 MB/s 
[K     |████████████████████████████████| 636 kB 57.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[?25h

## **Importing Libraries**

In [4]:
import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import BertModel, BertTokenizer

import pickle

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Loading Data**

In [5]:
dataframe = pd.read_csv("Dataset/hasoc_tamil_task2_withlabels.tsv", sep='\t')

In [6]:
dataframe.head()

Unnamed: 0,id,text,category
0,Tam_1,Indha movie ku award tharlana avanga mansanay ...,OFF
1,Tam_2,kritheeck Kookaburra en unaku enachu? Cbsc ah??,NOT
2,Tam_3,Actually Oru particular bus incident thalaiya ...,OFF
3,Tam_4,Small suggestions: mic ah shirt la pottukunga bro,NOT
4,Tam_5,Karnan padathulaa oru pombaa varumlaa athuu en...,NOT


In [7]:
dataframe.describe()

Unnamed: 0,id,text,category
count,1001,1001,1000
unique,1001,994,2
top,Tam_775,Indha movie ku award tharlana avanga mansanay ...,NOT
freq,1,3,605


In [8]:
dataframe['category'].unique()

array(['OFF', 'NOT', nan], dtype=object)

In [9]:
dataframe.shape

(1001, 3)

In [10]:
dataframe = dataframe.dropna()
dataframe['category'].unique()

array(['OFF', 'NOT'], dtype=object)

In [11]:
dataframe.shape

(1000, 3)

In [12]:
text = dataframe['text']
text

0       Indha movie ku award tharlana avanga mansanay ...
1         kritheeck Kookaburra en unaku enachu? Cbsc ah??
2       Actually Oru particular bus incident thalaiya ...
3       Small suggestions: mic ah shirt la pottukunga bro
4       Karnan padathulaa oru pombaa varumlaa athuu en...
                              ...                        
996     Chai spoiler vaya mooduya full kathayum sillit...
997     Unakku thinga soru irukko illayo aduthavangala...
998     Dai seripu oli advingi mala vandavan thana mau...
999     Gomala bule shirt ta nee padam eduthu paaruda ...
1000    Mr. Maran neenga mindvoice nu nenachu sound ah...
Name: text, Length: 1000, dtype: object

In [13]:
label = dataframe['category']
label

0       OFF
1       NOT
2       OFF
3       NOT
4       NOT
       ... 
996     OFF
997     OFF
998     OFF
999     OFF
1000    NOT
Name: category, Length: 1000, dtype: object

## **Data Preprocessing**

In [14]:
le = LabelEncoder()
label = le.fit_transform(label)
# label

In [15]:
text = text.str.replace(r"[+/#@&*$%:]",'')
text = text.to_numpy()
# text

In [16]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [17]:
# Load Transformer Model

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [19]:
print(tokenized_input)

{'input_ids': tensor([[  101, 27427,  3270,  9587,  5737, 13970,  2400, 22794, 12190,  5162,
         10927, 13807, 16042,  5162,  2100,  5665,  2050, 22953,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [20]:
print(sample_output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.7979, -0.0745, -0.1645,  ..., -0.0672,  0.1631,  0.3390],
         [ 0.5946, -0.2868, -0.5829,  ...,  0.0485,  0.9181,  0.7094],
         [-0.2138, -0.1629, -1.0407,  ..., -0.2560, -0.0678,  0.1336],
         ...,
         [-0.6720, -0.1168, -0.7200,  ..., -0.3554,  0.0626,  0.3006],
         [ 0.2998, -0.6461, -0.1986,  ...,  0.2716,  0.2005,  0.2846],
         [ 0.6649,  0.1737, -0.0352,  ...,  0.1816, -0.4609, -0.1850]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-7.8382e-01, -4.1877e-01, -5.5664e-01,  6.5911e-01,  2.9378e-01,
         -3.4248e-03,  7.4705e-01,  2.6147e-01, -1.9658e-01, -9.9994e-01,
         -1.5159e-01,  4.8868e-01,  9.6995e-01,  2.1936e-01,  8.0648e-01,
         -4.6208e-01, -2.7141e-02, -5.9307e-01,  4.6765e-01, -3.0404e-01,
          5.8733e-01,  9.9908e-01,  2.2218e-01,  2.2504e-01,  3.9264e-01,
          7.7983e-01, -6.1656e-01,  8.4127e-01,  9.2879e-01,  6.9300

In [21]:
print(sample_output.pooler_output.cpu().detach().numpy().shape)

(1, 768)


In [22]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [23]:
# Embedding the Input Data

output = []

j = 1

for i in range(len(input)):
    
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    
    # Saving Embedded Input Data to Disk

    if ((i + 1) % 200) == 0:
        
        file_name = "./MaskedLM-Embedded-TrainData/output" + str(j) + ".pickle"
        
        with open(file_name, "wb") as fp:   
            # Pickling
            pickle.dump(output, fp)
        
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./MaskedLM-Embedded-TrainData/output" + str(j) + ".pickle"

with open(file_name, "wb") as fp:   
    # Pickling
    pickle.dump(output, fp)

print(file_name + " done")
output = []

./MaskedLM-Embedded-TrainData/output1.pickle done
./MaskedLM-Embedded-TrainData/output2.pickle done
./MaskedLM-Embedded-TrainData/output3.pickle done
./MaskedLM-Embedded-TrainData/output4.pickle done
./MaskedLM-Embedded-TrainData/output5.pickle done
./MaskedLM-Embedded-TrainData/output6.pickle done


In [48]:
# Loading Embedded Input Data from Disk

output = []

for i in range(6):
    file_name = "./MaskedLM-Embedded-TrainData/output" + str(i + 1) + ".pickle"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

X = output
output = []

./MaskedLM-Embedded-TrainData/output1.pickle done
./MaskedLM-Embedded-TrainData/output2.pickle done
./MaskedLM-Embedded-TrainData/output3.pickle done
./MaskedLM-Embedded-TrainData/output4.pickle done
./MaskedLM-Embedded-TrainData/output5.pickle done
./MaskedLM-Embedded-TrainData/output6.pickle done


In [49]:
X = np.array(X)
X.shape

(1000, 1, 768)

In [50]:
X = X.reshape(1000, 768)
X.shape

(1000, 768)

In [51]:
y = label
y.shape

(1000,)

## **Train Test Split**

In [52]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
X_train.shape

(800, 768)

In [54]:
X_val.shape

(200, 768)

## **Logistic Regression**

In [55]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75       125
           1       0.56      0.37      0.45        75

    accuracy                           0.66       200
   macro avg       0.62      0.60      0.60       200
weighted avg       0.64      0.66      0.64       200



## **Support Vector Machine**

In [56]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.77       125
           1       0.00      0.00      0.00        75

    accuracy                           0.62       200
   macro avg       0.31      0.50      0.38       200
weighted avg       0.39      0.62      0.48       200



  _warn_prf(average, modifier, msg_start, len(result))


## **Naive Bayes**

In [57]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.75      0.36      0.49       125
           1       0.43      0.80      0.56        75

    accuracy                           0.53       200
   macro avg       0.59      0.58      0.52       200
weighted avg       0.63      0.53      0.51       200



## **Stochastic Gradient Descent**

In [58]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.63      1.00      0.77       125
           1       1.00      0.03      0.05        75

    accuracy                           0.64       200
   macro avg       0.82      0.51      0.41       200
weighted avg       0.77      0.64      0.50       200



## **K Nearest Neighbours**

In [59]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.64      0.70      0.67       125
           1       0.42      0.36      0.39        75

    accuracy                           0.57       200
   macro avg       0.53      0.53      0.53       200
weighted avg       0.56      0.57      0.56       200



## **Decision Tree**

In [60]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.64      0.60      0.62       125
           1       0.39      0.43      0.41        75

    accuracy                           0.54       200
   macro avg       0.51      0.51      0.51       200
weighted avg       0.54      0.54      0.54       200



## **Random Forest**

In [61]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.65      0.82      0.73       125
           1       0.48      0.28      0.35        75

    accuracy                           0.61       200
   macro avg       0.57      0.55      0.54       200
weighted avg       0.59      0.61      0.59       200



## **Majority Voting**

In [62]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Only having RF, LR & SGD in predictions 
    # due poor performance of other models
    predictions = [lr_pred[i], sgd_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.66      0.94      0.77       125
           1       0.65      0.17      0.27        75

    accuracy                           0.66       200
   macro avg       0.65      0.56      0.52       200
weighted avg       0.65      0.66      0.59       200

