## **Environment Setup**

In [1]:
%cd /content/drive/MyDrive/Colab Notebooks/HASOC2021-EnglishHindiMarathi/

/content/drive/MyDrive/Colab Notebooks/HASOC2021-EnglishHindiMarathi


In [2]:
! pip install -q torch
! pip install -q transformers
! pip install -q inltk

[K     |████████████████████████████████| 2.8 MB 5.4 MB/s 
[K     |████████████████████████████████| 636 kB 48.6 MB/s 
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[K     |████████████████████████████████| 895 kB 43.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 23.8 MB/s 
[K     |████████████████████████████████| 233 kB 5.3 MB/s 
[K     |████████████████████████████████| 78 kB 7.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 51.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 46.1 MB/s 
[K     |████████████████████████████████| 142 kB 57.2 MB/s 
[K     |████████████████████████████████| 294 kB 50.0 MB/s 
[?25h  Building wheel for typing (setup.py) ... [?25l[?25hdone


## **Importing Libraries**

In [35]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout,BatchNormalization

import torch

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

In [69]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [70]:
english_stopwords = stopwords.words("english")
with open('marathi_stopwords.txt', encoding = 'utf-8') as f:
    marathi_stopwords = f.readlines()
    for i in range(len(marathi_stopwords)):
        marathi_stopwords[i] = re.sub('\n','',marathi_stopwords[i])
stopwords = english_stopwords + marathi_stopwords
english_stemmer = SnowballStemmer("english")

In [71]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

## **Reading Training Data**

In [72]:
df=pd.read_excel("trainingSet/mr_Hasoc2021_train 2 (1).xlsx")

In [73]:
df.head()

Unnamed: 0,text_id,text,task_1
0,hasoc_mr_1,भारत 15 ऑगस्ट 1947 ला स्वतंत्र झाला आणि त्यानं...,NOT
1,hasoc_mr_2,स्वत ला हवा तसा बाइट किंवा प्रतिक्रिया घेण्यास...,NOT
2,hasoc_mr_3,5 व्या नंबरची अर्थव्यवस्था आहे भारताची जगात 20...,NOT
3,hasoc_mr_4,च्यायला म्हणजे दुबईचा फोन ही पुडीच निघाली की.,HOF
4,hasoc_mr_5,ह्याला खरंतर कधीच आत टाकला पाहिजे होता. पैसा आ...,HOF


In [74]:
tweets = df.text
y = df.task_1

## **Cleaning Training Data**

In [75]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

In [11]:
cleaned_tweets[0]

'भारत ऑगस्ट ला स्वतंत्र त्यानंतर तब्बल वर्षांनी जानेवारी साली भारताला राज्यघ'

## **Loading DistilBert**

In [82]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

## **Embedding Training Data**

In [13]:
X = cleaned_tweets
cleaned_tweets = []

In [14]:
tokenized_input = tokenizer(
        X[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [15]:
print(tokenized_input['input_ids'].cpu().detach().numpy().shape)
print(tokenized_input['attention_mask'].cpu().detach().numpy().shape)
print(sample_output.logits.cpu().detach().numpy().shape)

(1, 49)
(1, 49)
(1, 49, 30522)


In [16]:
tokenized_input = tokenizer(
        X,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

In [17]:
tokenized_input['input_ids'] = tokenized_input['input_ids'].cpu().detach().numpy()
tokenized_input['attention_mask'] = tokenized_input['attention_mask'].cpu().detach().numpy()

In [18]:
pca = PCA(n_components=3)

In [26]:
output = []

j = 1

for i in range(len(X)):
    
    input = {}
    input['input_ids'] = torch.from_numpy(tokenized_input['input_ids'][i].reshape(1, 136))
    input['attention_mask'] = torch.from_numpy(tokenized_input['attention_mask'][i].reshape(1, 136))

    model_output = model(**input)
    model_output = model_output.logits.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        output = np.array(output)
        output = output.reshape(len(output), 4150992)
        output = pca.fit_transform(output)
        file_name = "embed_feat/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1

output = np.array(output)
output = output.reshape(len(output), 4150992)
output = pca.fit_transform(output)        
file_name = "embed_feat/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:
    pickle.dump(output, fp)
print(file_name + " done")
output = []

embed_feat/output1.txt done
embed_feat/output2.txt done
embed_feat/output3.txt done
embed_feat/output4.txt done
embed_feat/output5.txt done
embed_feat/output6.txt done
embed_feat/output7.txt done
embed_feat/output8.txt done
embed_feat/output9.txt done
embed_feat/output10.txt done


## **Loading Embedded Data from Disk**

In [27]:
output = []

for i in range(10):
    file_name = "./embed_feat/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./embed_feat/output1.txt done
./embed_feat/output2.txt done
./embed_feat/output3.txt done
./embed_feat/output4.txt done
./embed_feat/output5.txt done
./embed_feat/output6.txt done
./embed_feat/output7.txt done
./embed_feat/output8.txt done
./embed_feat/output9.txt done
./embed_feat/output10.txt done


## **Test Train Split**

In [30]:
X_train, X_val, y_train, y_val = train_test_split(output, y, test_size=0.2, random_state=42)

## **Logistic Regression**

In [31]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
lr_pred = lr.predict(X_val)
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

         HOF       0.54      0.22      0.32       138
         NOT       0.66      0.89      0.76       237

    accuracy                           0.65       375
   macro avg       0.60      0.56      0.54       375
weighted avg       0.62      0.65      0.60       375



## **Support Vector Machine**

In [36]:
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [37]:
svc_pred = svc.predict(X_val)
print(classification_report(y_val, svc_pred))

              precision    recall  f1-score   support

         HOF       0.57      0.25      0.34       138
         NOT       0.67      0.89      0.76       237

    accuracy                           0.65       375
   macro avg       0.62      0.57      0.55       375
weighted avg       0.63      0.65      0.61       375



## **Naive Bayes**

In [38]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [39]:
nb_pred = nb.predict(X_val)
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

         HOF       0.56      0.25      0.35       138
         NOT       0.67      0.89      0.76       237

    accuracy                           0.65       375
   macro avg       0.62      0.57      0.56       375
weighted avg       0.63      0.65      0.61       375



## **Stochastic Gradient Descent**

In [40]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [41]:
sgd_pred = sgd.predict(X_val)
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

         HOF       0.43      0.51      0.47       138
         NOT       0.68      0.59      0.63       237

    accuracy                           0.57       375
   macro avg       0.55      0.55      0.55       375
weighted avg       0.58      0.57      0.57       375



## **K Nearest Neighbours**

In [42]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [43]:
knn_pred = knn.predict(X_val)
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

         HOF       0.49      0.36      0.41       138
         NOT       0.68      0.78      0.72       237

    accuracy                           0.62       375
   macro avg       0.58      0.57      0.57       375
weighted avg       0.61      0.62      0.61       375



## **Decision Trees**

In [44]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [45]:
dt_pred = dt.predict(X_val)
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

         HOF       0.43      0.41      0.42       138
         NOT       0.67      0.69      0.68       237

    accuracy                           0.59       375
   macro avg       0.55      0.55      0.55       375
weighted avg       0.58      0.59      0.58       375



## **Random Forest**

In [46]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
rf_pred = rf.predict(X_val)
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

         HOF       0.55      0.33      0.42       138
         NOT       0.68      0.84      0.76       237

    accuracy                           0.66       375
   macro avg       0.62      0.59      0.59       375
weighted avg       0.64      0.66      0.63       375



## **Ensemble - Majority Voting**

In [52]:
y_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NOT': zero +=1
    if one > zero: y_pred.append('HOF')
    else: y_pred.append('NOT')

y_pred = np.array(y_pred)

In [53]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.58      0.25      0.35       138
         NOT       0.67      0.89      0.77       237

    accuracy                           0.66       375
   macro avg       0.62      0.57      0.56       375
weighted avg       0.64      0.66      0.61       375



## **Loading Test Data**

In [76]:
df_test=pd.read_csv("testSet/hasoc2021_mr_test-blind-2021.csv")

In [77]:
df_test.head()

Unnamed: 0,text_id,text
0,hasoc_mr_1875,माणसाची सर्वात मोठी समस्या इतिहासाचे दाखले द...
1,hasoc_mr_1876,मुद्दामहून थुकत बसला ते पण दोस्तांना बोलवून म...
2,hasoc_mr_1877,तुझे नोटिफिकेशन ऑन केलेत परंतु नाव दिसलं का शि...
3,hasoc_mr_1878,पांडुरंग हरी वासुदेव हरी तु तुझ्या घरी मी म...
4,hasoc_mr_1879,भेंचोद जेव्हा कोरोना कोरोना चालू होतं तेव्हा ...


In [78]:
test_tweets = df_test.text
tweet_ids = df_test.text_id

## **Cleaning Test Data**

In [79]:
cleaned_test = [clean_tweet(tweet) for tweet in test_tweets]

## **Loading DistilBert**

In [84]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

## **Embedding Test Data**

In [80]:
X = cleaned_test
cleaned_test = []

In [85]:
tokenized_input = tokenizer(
        X[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

In [86]:
print(tokenized_input['input_ids'].cpu().detach().numpy().shape)
print(tokenized_input['attention_mask'].cpu().detach().numpy().shape)
print(sample_output.logits.cpu().detach().numpy().shape)

(1, 63)
(1, 63)
(1, 63, 30522)


In [87]:
tokenized_input = tokenizer(
        X,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

In [88]:
tokenized_input['input_ids'] = tokenized_input['input_ids'].cpu().detach().numpy()
tokenized_input['attention_mask'] = tokenized_input['attention_mask'].cpu().detach().numpy()

In [89]:
pca = PCA(n_components=3)

In [92]:
input = {}
input['input_ids'] = torch.from_numpy(tokenized_input['input_ids'][0].reshape(1, 141))
input['attention_mask'] = torch.from_numpy(tokenized_input['attention_mask'][0].reshape(1, 141))

model_output = model(**input)
model_output = model_output.logits.cpu().detach().numpy()

In [93]:
model_output.shape

(1, 141, 30522)

In [95]:
output = []

j = 1

for i in range(len(X)):
    
    input = {}
    input['input_ids'] = torch.from_numpy(tokenized_input['input_ids'][i].reshape(1, 141))
    input['attention_mask'] = torch.from_numpy(tokenized_input['attention_mask'][i].reshape(1, 141))

    model_output = model(**input)
    model_output = model_output.logits.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        output = np.array(output)
        output = output.reshape(len(output), 4303602)
        output = pca.fit_transform(output)
        file_name = "test_embed_feat/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1

output = np.array(output)
output = output.reshape(len(output), 4303602)
output = pca.fit_transform(output)        
file_name = "test_embed_feat/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:
    pickle.dump(output, fp)
print(file_name + " done")
output = []

test_embed_feat/output1.txt done
test_embed_feat/output2.txt done
test_embed_feat/output3.txt done
test_embed_feat/output4.txt done


## **Loading Embedded Test Data from Disk**

In [96]:
output = []

for i in range(4):
    file_name = "./test_embed_feat/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./test_embed_feat/output1.txt done
./test_embed_feat/output2.txt done
./test_embed_feat/output3.txt done
./test_embed_feat/output4.txt done


## **Obtain Prediction from Model**

In [97]:
X_test = output
output = []

In [102]:
lr_pred = lr.predict(X_test)
svc_pred = svc.predict(X_test)
nb_pred = nb.predict(X_test)
sgd_pred = sgd.predict(X_test)
knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)

In [103]:
y_test = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NOT': zero +=1
    if one > zero: y_test.append('HOF')
    else: y_test.append('NOT')

y_test = np.array(y_test)

In [105]:
submission = {'id':tweet_ids,'label':y_test}
submission = pd.DataFrame(submission)

In [106]:
submission.head()

Unnamed: 0,id,label
0,hasoc_mr_1875,NOT
1,hasoc_mr_1876,NOT
2,hasoc_mr_1877,NOT
3,hasoc_mr_1878,NOT
4,hasoc_mr_1879,NOT


In [108]:
submission.to_csv('marathi_distil_bert_ens.csv', index = False)