## **Setting up Development Environment**

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Tanglish


## **Importing Libraries**

In [17]:
import pandas as pd

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Loading Data**

In [18]:
dataframe = pd.read_csv("Dataset/Tamil_hasoc_tanglish_test_withlabels.tsv", sep='\t')

In [19]:
dataframe.head()

Unnamed: 0,id,text,category
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,OFF
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,NOT
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",OFF
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,OFF
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,OFF


In [20]:
dataframe.describe()

Unnamed: 0,id,text,category
count,940,940,940
unique,939,933,2
top,TA_TW13798,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,2,5,475


In [21]:
dataframe['category'].unique()

array(['OFF', 'NOT', nan], dtype=object)

In [22]:
dataframe.drop_duplicates(subset ="id",keep = False, inplace = True)
dataframe.dropna()
dataframe.describe()

Unnamed: 0,id,text,category
count,938,938,938
unique,938,932,2
top,TA_TW9024,RT : Full Day Time Iruku !! 1 person 700 Twee...,OFF
freq,1,5,473


In [23]:
dataframe['category'].unique()

array(['OFF', 'NOT'], dtype=object)

In [24]:
text = dataframe['text']
text

0      Take it this thevidiya Kandipa indha page admi...
1      enga veetla itha nadakum Athum oru varushama t...
2      Indha Sallli Punda, Dummy Pundalam Orama Iruka...
3      Juriya poola tier 1 la umburan tha kulla punda...
4      Kullans lam umba therila Loosu kuthi maari umb...
                             ...                        
934    Watha  manasaatchi punda irukka daa unakuu... ...
935    Woman with a bhindi picture on a logo promotes...
936    Worst ra Vara vara Namma Society kevalama poit...
937    Yeallarukum reply pandringa namma #TAG RakidaR...
938    Yeva yeva valkaiyelam flashlight adicha maari ...
Name: text, Length: 938, dtype: object

In [25]:
label = dataframe['category']
label

0      OFF
1      NOT
2      OFF
3      OFF
4      OFF
      ... 
934    OFF
935    OFF
936    OFF
937    NOT
938    NOT
Name: category, Length: 938, dtype: object

## **Data Preprocessing**

In [26]:
le = LabelEncoder()
label = le.fit_transform(label)

In [27]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+",' ', text)
    text = re.sub(r"#[A-Za-z0-9]+",' ', text)
    text = re.sub(r": +",' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+",' ', text)
    text = re.sub("RT ", " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [29]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(text)
X = X.todense()

In [30]:
X.shape

(938, 419)

In [31]:
y = label
y.shape

(938,)

## **Test Train Split**

In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
X_train.shape

(750, 419)

In [34]:
X_val.shape

(188, 419)

## **Logistic Regression**

In [35]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92        86
           1       0.97      0.89      0.93       102

    accuracy                           0.93       188
   macro avg       0.93      0.93      0.93       188
weighted avg       0.93      0.93      0.93       188



## **Support Vector Machine**

In [36]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93        86
           1       0.98      0.89      0.93       102

    accuracy                           0.93       188
   macro avg       0.93      0.93      0.93       188
weighted avg       0.94      0.93      0.93       188



## **Naive Bayes**

In [37]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82        86
           1       0.88      0.79      0.84       102

    accuracy                           0.83       188
   macro avg       0.83      0.83      0.83       188
weighted avg       0.84      0.83      0.83       188



## **Stochastic Gradient Descent**

In [38]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88        86
           1       0.89      0.92      0.90       102

    accuracy                           0.89       188
   macro avg       0.89      0.89      0.89       188
weighted avg       0.89      0.89      0.89       188



## **K Nearest Neighbours**

In [39]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.70      0.93      0.80        86
           1       0.92      0.66      0.77       102

    accuracy                           0.78       188
   macro avg       0.81      0.79      0.78       188
weighted avg       0.82      0.78      0.78       188



## **Decision Tree**

In [40]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92        86
           1       0.92      0.95      0.93       102

    accuracy                           0.93       188
   macro avg       0.93      0.92      0.92       188
weighted avg       0.93      0.93      0.93       188



## **Random Forest**

In [41]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94        86
           1       0.99      0.91      0.95       102

    accuracy                           0.95       188
   macro avg       0.95      0.95      0.95       188
weighted avg       0.95      0.95      0.95       188



## **Majority Voting**

In [43]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Removed KNN & NB from predictions due poor performance
    predictions = [lr_pred[i], svm_pred[i], rf_pred[i], sgd_pred[i], dt_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        86
           1       0.98      0.91      0.94       102

    accuracy                           0.94       188
   macro avg       0.94      0.94      0.94       188
weighted avg       0.94      0.94      0.94       188

