## **Setting up Development Environment**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/HL

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/HL


## **Importing Libraries**

In [15]:
import pandas as pd

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Loading Data**

In [4]:
dataframe = pd.read_csv("Dataset/Tamil-Codemixed_offensive_data_Training-Tweet-HL.csv")

In [5]:
dataframe.head()

Unnamed: 0,id,text,category
0,TA_HL100,Iyaooo Kovam pattutene sothula visatha vachuru...,NOT
1,TA_HL101,@Asha Apo neenga atha government ku theriya pa...,NOT
2,TA_HL102,@Bala sundar ayyo sorry...antha line ah explai...,NOT
3,TA_HL105,@kalimuthu ne ena lusa...yaaru edhu panaalum e...,NOT
4,TA_HL109,1st baby ku neat ah feed panunga plzz ipdi iru...,NOT


In [6]:
dataframe.describe()

Unnamed: 0,id,text,category
count,4000,4000,4000
unique,3999,3657,2
top,TA_TW1600,@USER Anna bf naa best frd anna thappaana arth...,NOT
freq,2,2,2020


In [7]:
dataframe['category'].unique()

array(['NOT', 'OFF'], dtype=object)

In [10]:
dataframe.drop_duplicates(subset ="id",keep = False, inplace = True)
dataframe.describe()

Unnamed: 0,id,text,category
count,3998,3998,3998
unique,3998,3656,2
top,TA_TW1620,@USER Romba days Apram TL la liya pic Varuthu ...,NOT
freq,1,2,2020


In [11]:
dataframe.shape

(3998, 3)

In [19]:
text = dataframe['text']
text

0       Iyaooo Kovam pattutene sothula visatha vachuru...
1       @Asha Apo neenga atha government ku theriya pa...
2       @Bala sundar ayyo sorry...antha line ah explai...
3       @kalimuthu ne ena lusa...yaaru edhu panaalum e...
4       1st baby ku neat ah feed panunga plzz ipdi iru...
                              ...                        
3995    Yaroda body structure semaya irukum? Sema mood...
3996    Yenda naangala politics varom nu pala varusham...
3997    Yepdithan seruppala adichalum arasiyalvathikku...
3998    @USER Paithiyam ena unga vanthu full ah forwar...
3999    RT @USER : Itha vidaa kevalam veraa irukaa vij...
Name: text, Length: 3998, dtype: object

In [13]:
label = dataframe['category']
label

0       NOT
1       NOT
2       NOT
3       NOT
4       NOT
       ... 
3995    OFF
3996    OFF
3997    OFF
3998    OFF
3999    OFF
Name: category, Length: 3998, dtype: object

## **Data Preprocessing**

In [14]:
le = LabelEncoder()
label = le.fit_transform(label)

In [20]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+",' ', text)
    text = re.sub(r"#[A-Za-z0-9]+",' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+",' ', text)
    text = re.sub("RT ", " ", text)
    text = re.sub("\n", " ", text)
    text = re.sub(r" +", " ", text)
    tokens = []
    for token in text.split():
        if token not in english_stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

text = [clean_text(i) for i in text]

text = np.array(text)

## **Feature Extraction**

In [22]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(text)
X = X.todense()

In [23]:
X.shape

(3998, 2092)

In [24]:
y = label
y.shape

(3998,)

## **Train Test Split**

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train.shape

(3198, 2092)

In [27]:
X_val.shape

(800, 2092)

## **Logistic Regression**

In [28]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       420
           1       0.82      0.83      0.83       380

    accuracy                           0.84       800
   macro avg       0.84      0.84      0.84       800
weighted avg       0.84      0.84      0.84       800



## **Support Vector Machine**

In [29]:
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_val)
print(classification_report(y_val, svm_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       420
           1       0.82      0.85      0.84       380

    accuracy                           0.84       800
   macro avg       0.84      0.84      0.84       800
weighted avg       0.84      0.84      0.84       800



## **Naive Bayes**

In [30]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.81       420
           1       0.79      0.77      0.78       380

    accuracy                           0.80       800
   macro avg       0.79      0.79      0.79       800
weighted avg       0.79      0.80      0.79       800



## **Stochastic Gradient Descent**

In [31]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.86      0.79      0.82       420
           1       0.78      0.86      0.82       380

    accuracy                           0.82       800
   macro avg       0.82      0.82      0.82       800
weighted avg       0.82      0.82      0.82       800



## **K Nearest Neighbours**

In [32]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

           0       0.56      0.45      0.50       420
           1       0.50      0.61      0.55       380

    accuracy                           0.52       800
   macro avg       0.53      0.53      0.52       800
weighted avg       0.53      0.52      0.52       800



## **Decision Tree**

In [33]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.81      0.75      0.78       420
           1       0.75      0.81      0.78       380

    accuracy                           0.78       800
   macro avg       0.78      0.78      0.78       800
weighted avg       0.78      0.78      0.78       800



## **Random Forest**

In [34]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       420
           1       0.81      0.83      0.82       380

    accuracy                           0.83       800
   macro avg       0.83      0.83      0.83       800
weighted avg       0.83      0.83      0.83       800



## **Majority Voting**

In [35]:
mv_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    # Removed KNN & DT from predictions due poor performance
    predictions = [lr_pred[i], svm_pred[i], rf_pred[i], sgd_pred[i], nb_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: mv_pred.append(1)
    else: mv_pred.append(0)

mv_pred = np.array(mv_pred)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.87      0.84      0.85       420
           1       0.83      0.86      0.84       380

    accuracy                           0.85       800
   macro avg       0.85      0.85      0.85       800
weighted avg       0.85      0.85      0.85       800

