## **Setting up the Development Environment**

In [None]:
# Mount Google Drive inside the Colab VM; the files then appear under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Task 2

/content/drive/MyDrive/Colab Notebooks/Hate-Speech-Detection-in-Tamil/HASOC Tamil/Task 2


## **Importing Libraries**

In [None]:
import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Loading Data**

In [None]:
# Load the labelled task-2 data; the file is tab-separated.
dataframe = pd.read_csv(
    "Dataset/hasoc_tamil_task2_withlabels.tsv",
    sep='\t',
)

In [None]:
# Preview the first rows to check the columns (id, text, category).
dataframe.head()

Unnamed: 0,id,text,category
0,Tam_1,Indha movie ku award tharlana avanga mansanay ...,OFF
1,Tam_2,kritheeck Kookaburra en unaku enachu? Cbsc ah??,NOT
2,Tam_3,Actually Oru particular bus incident thalaiya ...,OFF
3,Tam_4,Small suggestions: mic ah shirt la pottukunga bro,NOT
4,Tam_5,Karnan padathulaa oru pombaa varumlaa athuu en...,NOT


In [None]:
# Per-column summary (count / unique / top / freq); a count lower than the
# row count points to missing values in that column.
dataframe.describe()

Unnamed: 0,id,text,category
count,1001,1001,1000
unique,1001,994,2
top,Tam_561,Indha movie ku award tharlana avanga mansanay ...,NOT
freq,1,3,605


In [None]:
# List the distinct label values, including NaN if any label is missing.
dataframe['category'].unique()

array(['OFF', 'NOT', nan], dtype=object)

In [None]:
# (rows, columns) before removing rows with missing values.
dataframe.shape

(1001, 3)

In [None]:
# Discard every row that has a missing value in any column, then
# re-check which label values remain.
dataframe = dataframe.dropna(axis=0, how='any')
dataframe['category'].unique()

array(['OFF', 'NOT'], dtype=object)

In [None]:
# (rows, columns) after removing rows with missing values.
dataframe.shape

(1000, 3)

In [None]:
# Pull out the comment text column; the bare name on the last line displays it.
text = dataframe['text']
text

0       Indha movie ku award tharlana avanga mansanay ...
1         kritheeck Kookaburra en unaku enachu? Cbsc ah??
2       Actually Oru particular bus incident thalaiya ...
3       Small suggestions: mic ah shirt la pottukunga bro
4       Karnan padathulaa oru pombaa varumlaa athuu en...
                              ...                        
996     Chai spoiler vaya mooduya full kathayum sillit...
997     Unakku thinga soru irukko illayo aduthavangala...
998     Dai seripu oli advingi mala vandavan thana mau...
999     Gomala bule shirt ta nee padam eduthu paaruda ...
1000    Mr. Maran neenga mindvoice nu nenachu sound ah...
Name: text, Length: 1000, dtype: object

In [None]:
# Pull out the string labels; the bare name on the last line displays them.
label = dataframe['category']
label

0       OFF
1       NOT
2       OFF
3       NOT
4       NOT
       ... 
996     OFF
997     OFF
998     OFF
999     OFF
1000    NOT
Name: category, Length: 1000, dtype: object

## **Data Preprocessing**

In [None]:
# Turn the string labels into integer class ids.
# Encoding straight from `dataframe['category']` (rather than re-assigning `label`
# from itself) keeps the cell re-runnable: a second execution would otherwise fit
# the encoder on already-encoded integers and lose the original class names in
# `le.classes_`.
# NOTE(review): LabelEncoder orders classes by sorted value, so 'NOT' -> 0 and
# 'OFF' -> 1 is expected here — confirm with `le.classes_`.
le = LabelEncoder()
label = le.fit_transform(dataframe['category'])

In [None]:
# Strip a fixed set of symbol characters (+ / # @ & * $ % :) from every comment.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, so the character class would never match.
# Reading from `dataframe['text']` instead of re-using `text` keeps the cell
# re-runnable, because `text` is an ndarray (no `.str` accessor) after the first run.
text = dataframe['text'].str.replace(r"[+/#@&*$%:]", '', regex=True)
text = text.to_numpy()

In [None]:
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

# Set copy of the stop-word list so membership tests do not scan the list.
_english_stopword_set = frozenset(english_stopwords)


def clean_text(text):
    """Drop English stop words from ``text`` and stem the remaining tokens.

    The input is split on whitespace. A token is discarded when its lowercase
    form is an English stop word; the comparison is case-insensitive so that
    capitalised stop words such as "The" are removed too (the stop-word list is
    lowercase, and the stemmer would otherwise lowercase and keep them).

    Args:
        text: A single document as a string.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    tokens = []
    for token in text.split():
        if token.lower() not in _english_stopword_set:
            tokens.append(english_stemmer.stem(token))
    return " ".join(tokens)


# Clean every document and keep the result as a NumPy array of strings.
text = np.array([clean_text(i) for i in text])

## **Feature Extraction**

In [None]:
# Build TF-IDF features, keeping only terms that occur in at least 5 documents.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/validation split, so the vocabulary and IDF weights are influenced by the
# validation texts; consider fitting on the training portion only.
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(text)
# Convert to a plain ndarray. `todense()` would return an `np.matrix`, which
# newer scikit-learn releases refuse as estimator input.
X = X.toarray()

In [None]:
# (documents, TF-IDF terms kept by the vectorizer).
X.shape

(1000, 367)

In [None]:
# The encoded labels are the target vector; its length should equal the number of rows in X.
y = label
y.shape

(1000,)

## **Train Test Split**

In [None]:
# Hold out 20% of the samples for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Size of the training split (samples, features).
X_train.shape

(800, 367)

In [None]:
# Size of the validation split (samples, features).
X_val.shape

(200, 367)

## **Logistic Regression**

In [None]:
# Logistic regression with default settings: fit on the training split,
# predict the validation split, and print per-class metrics.
lr = LogisticRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_val)
lr_report = classification_report(y_val, lr_pred)
print(lr_report)

              precision    recall  f1-score   support

           0       0.68      0.84      0.75       125
           1       0.56      0.33      0.42        75

    accuracy                           0.65       200
   macro avg       0.62      0.59      0.58       200
weighted avg       0.63      0.65      0.62       200



## **Support Vector Machine**

In [None]:
# Support vector classifier with default settings, evaluated on the validation split.
svm = SVC().fit(X_train, y_train)
svm_pred = svm.predict(X_val)
svm_report = classification_report(y_val, svm_pred)
print(svm_report)

              precision    recall  f1-score   support

           0       0.67      0.83      0.74       125
           1       0.53      0.32      0.40        75

    accuracy                           0.64       200
   macro avg       0.60      0.58      0.57       200
weighted avg       0.62      0.64      0.61       200



## **Naive Bayes**

In [None]:
# Gaussian Naive Bayes with default settings, evaluated on the validation split.
# NOTE(review): a multinomial variant is the more common choice for TF-IDF
# features — worth comparing.
nb = GaussianNB().fit(X_train, y_train)
nb_pred = nb.predict(X_val)
nb_report = classification_report(y_val, nb_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.75      0.34      0.47       125
           1       0.43      0.81      0.56        75

    accuracy                           0.52       200
   macro avg       0.59      0.58      0.52       200
weighted avg       0.63      0.52      0.51       200



## **Stochastic Gradient Descent**

In [None]:
# Linear classifier trained with stochastic gradient descent.
# The training procedure shuffles the data, so the seed is fixed (same value as the
# train/validation split) to make the reported metrics reproducible across runs.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

           0       0.68      0.73      0.71       125
           1       0.49      0.44      0.46        75

    accuracy                           0.62       200
   macro avg       0.59      0.58      0.59       200
weighted avg       0.61      0.62      0.62       200



## **K Nearest Neighbours**

In [None]:
# k-nearest-neighbours classifier with default settings, evaluated on the validation split.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn_pred = knn.predict(X_val)
knn_report = classification_report(y_val, knn_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.61      0.66      0.63       125
           1       0.34      0.29      0.31        75

    accuracy                           0.52       200
   macro avg       0.47      0.47      0.47       200
weighted avg       0.51      0.52      0.51       200



## **Decision Tree**

In [None]:
# Decision tree with default hyper-parameters.
# Tree construction involves randomness, so the seed is fixed (same value as the
# train/validation split) to make the reported metrics reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

           0       0.70      0.75      0.73       125
           1       0.53      0.47      0.50        75

    accuracy                           0.65       200
   macro avg       0.62      0.61      0.61       200
weighted avg       0.64      0.65      0.64       200



## **Random Forest**

In [None]:
# Random forest with default hyper-parameters.
# Bootstrapping and feature sampling are random, so the seed is fixed (same value as
# the train/validation split) to make the reported metrics reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

           0       0.70      0.76      0.73       125
           1       0.53      0.45      0.49        75

    accuracy                           0.65       200
   macro avg       0.61      0.61      0.61       200
weighted avg       0.64      0.65      0.64       200



## **Majority Voting**

In [None]:
# Hard majority vote over five of the classifiers. KNN and Naive Bayes are
# left out of the vote because of their poor performance.
votes = np.array([lr_pred, svm_pred, rf_pred, sgd_pred, dt_pred])

# Per-sample tallies of the votes cast for class 1 and for class 0.
ones = (votes == 1).sum(axis=0)
zeros = (votes == 0).sum(axis=0)

# Class 1 wins only with strictly more votes; anything else resolves to class 0.
mv_pred = np.where(ones > zeros, 1, 0)

print(classification_report(y_val, mv_pred))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75       125
           1       0.56      0.39      0.46        75

    accuracy                           0.66       200
   macro avg       0.62      0.60      0.60       200
weighted avg       0.64      0.66      0.64       200

