## Environment Setup

In [1]:
! pip install -q transformers
! pip install -q sentencepiece
! pip install -q nltk
! git clone https://github.com/bhargav25dave1996/ICHCL_baseline.git
% cd /content/ICHCL_baseline

[K     |████████████████████████████████| 2.6 MB 28.1 MB/s 
[K     |████████████████████████████████| 895 kB 68.0 MB/s 
[K     |████████████████████████████████| 636 kB 62.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 67.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 30.6 MB/s 
[?25hCloning into 'ICHCL_baseline'...
remote: Enumerating objects: 270, done.[K
remote: Counting objects: 100% (270/270), done.[K
remote: Compressing objects: 100% (262/262), done.[K
remote: Total 270 (delta 5), reused 267 (delta 5), pack-reused 0[K
Receiving objects: 100% (270/270), 477.18 KiB | 14.46 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/ICHCL_baseline


In [2]:
! mkdir X_train
! mkdir X_val
! mkdir y_train
! mkdir y_val

## Importing Libraries

In [3]:
import nltk
nltk.download('stopwords')

import pandas as pd
import numpy as np
from glob import glob
import re
import json

import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

import torch
import tensorflow as tf

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

## Reading Data

In [5]:
train_directories = []
for i in glob("data/train/*/"):
    for j in glob(i+'*/'):
        train_directories.append(j)

In [6]:
data = []
for i in train_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))
labels = []
for i in train_directories:
    with open(i+'labels.json', encoding='utf-8') as f:
        labels.append(json.load(f))

In [7]:
def tr_flatten(d,l):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'label':l[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'label':l[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'label':l[j['tweet_id']]
                    })
    return flat_text

def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [8]:
data_label = []
#for train
for i in range(len(labels)):
    for j in tr_flatten(data[i], labels[i]):
        data_label.append(j)
train_len = len(data_label)

In [9]:
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)

In [10]:
tweets = df.text
y = df.label

## Preprocessing

In [11]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            token = hindi_stemmer.hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [12]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

In [13]:
'''
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()
'''

'\nvectorizer = TfidfVectorizer(min_df = 5)\nX = vectorizer.fit_transform(cleaned_tweets)\nX = X.todense()\n'

In [14]:
X = cleaned_tweets

In [15]:
y = y.to_list()

for i in range(len(y)):
    if y[i] == 'HOF':
        y[i] = 1
    else:
        y[i] = 0

## Test-Train Split

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Load Model

In [17]:
from transformers import AutoTokenizer, AutoModel
  
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

model = AutoModel.from_pretrained("ai4bharat/indic-bert")

Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['sop_classifier.classifier.bias', 'sop_classifier.classifier.weight', 'predictions.dense.weight', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Sample Output

In [18]:
tokenized_input = tokenizer(
        X_train[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

print(sample_output)
print(sample_output.pooler_output.cpu().detach().numpy().shape)

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.0107, -0.0023, -0.0206,  ..., -0.0217, -0.0068, -0.0107],
         [ 0.3941, -0.7381,  0.0244,  ...,  0.3811, -0.1700,  0.1687],
         [ 0.1566, -0.4402,  0.2589,  ...,  0.3415, -0.1641,  0.0793],
         ...,
         [ 0.2708, -0.3056, -0.0631,  ...,  0.1474,  0.2061,  0.0272],
         [-0.1811, -0.0760, -0.1348,  ...,  0.0109,  0.0804,  0.0891],
         [-0.0107, -0.0023, -0.0206,  ..., -0.0217, -0.0068, -0.0107]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.0577,  0.0521,  0.0182, -0.0068,  0.0419,  0.0830,  0.0382, -0.0124,
         -0.0127,  0.0590, -0.0062,  0.0485,  0.0181, -0.0147,  0.0536, -0.0135,
         -0.0389,  0.0127, -0.1167, -0.0864,  0.0853,  0.0781, -0.0314, -0.0935,
          0.0068, -0.0357, -0.0524, -0.0108,  0.0235,  0.0055,  0.0228,  0.0018,
          0.0124,  0.0352,  0.0030, -0.0375, -0.0258, -0.0221,  0.1135,  0.0268,
          0.0249, -0.0166, -0.1103,  0.0427, -0.070

## Embedding the Training Data

In [19]:
print(len(X_train))

4592


In [20]:
input = []

for text in X_train:
    tokenized_input = tokenizer(
        text,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

print(len(input))

4592


## Saving X_train to disk

In [21]:
output = []

j = 1

for i in range(len(input)):
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        file_name = "./X_train/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:   #Pickling
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./X_train/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:   #Pickling
    pickle.dump(output, fp)
print(file_name + " done")
output = []

./X_train/output1.txt done
./X_train/output2.txt done
./X_train/output3.txt done
./X_train/output4.txt done
./X_train/output5.txt done
./X_train/output6.txt done
./X_train/output7.txt done
./X_train/output8.txt done
./X_train/output9.txt done
./X_train/output10.txt done
./X_train/output11.txt done
./X_train/output12.txt done
./X_train/output13.txt done
./X_train/output14.txt done
./X_train/output15.txt done
./X_train/output16.txt done
./X_train/output17.txt done
./X_train/output18.txt done
./X_train/output19.txt done
./X_train/output20.txt done
./X_train/output21.txt done
./X_train/output22.txt done
./X_train/output23.txt done


## Loading X_train from disk

In [22]:
output = []

for i in range(23):
    file_name = "./X_train/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./X_train/output1.txt done
./X_train/output2.txt done
./X_train/output3.txt done
./X_train/output4.txt done
./X_train/output5.txt done
./X_train/output6.txt done
./X_train/output7.txt done
./X_train/output8.txt done
./X_train/output9.txt done
./X_train/output10.txt done
./X_train/output11.txt done
./X_train/output12.txt done
./X_train/output13.txt done
./X_train/output14.txt done
./X_train/output15.txt done
./X_train/output16.txt done
./X_train/output17.txt done
./X_train/output18.txt done
./X_train/output19.txt done
./X_train/output20.txt done
./X_train/output21.txt done
./X_train/output22.txt done
./X_train/output23.txt done


In [23]:
X_train = output
output = []

In [24]:
print(len(X_train))

4592


## Embedding the Testing Data

In [25]:
input = []

for text in X_val:
    tokenized_input = tokenizer(
        text,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

print(len(input))

1148


## Saving X_val to disk

In [26]:
output = []

j = 1

for i in range(len(input)):
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        file_name = "./X_val/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:   #Pickling
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./X_val/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:   #Pickling
    pickle.dump(output, fp)
print(file_name + " done")
output = []

./X_val/output1.txt done
./X_val/output2.txt done
./X_val/output3.txt done
./X_val/output4.txt done
./X_val/output5.txt done
./X_val/output6.txt done


## Loading X_val from disk

In [27]:
output = []

for i in range(6):
    file_name = "./X_val/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./X_val/output1.txt done
./X_val/output2.txt done
./X_val/output3.txt done
./X_val/output4.txt done
./X_val/output5.txt done
./X_val/output6.txt done


In [28]:
X_val = output
output = []
print(len(X_val))

1148


## Saving y_train & y_val to disk

In [29]:
with open("./y_train/y_train.txt", "wb") as fp:
    pickle.dump(y_train, fp)

with open("./y_val/y_val.txt", "wb") as fp:
    pickle.dump(y_val, fp)

## Loading y_train & y_val from disk

In [30]:
with open('./y_train/y_train.txt', "rb") as fp:
        y_train = pickle.load(fp)

with open('./y_val/y_val.txt', "rb") as fp:
        y_val = pickle.load(fp)

## Logistic Regression

In [31]:
# Preprocessing

X_train = np.array(X_train)
X_train = X_train.reshape(4592,768)

X_val = np.array(X_val)
X_val = X_val.reshape(1148, 768)

y_train = np.array(y_train)
y_val = np.array(y_val)

In [32]:
# Loading the Model

classifier = LogisticRegression()

In [33]:
# Training the Model

classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# Evaluating the Model

y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.63      0.62       579
           1       0.62      0.60      0.61       569

    accuracy                           0.62      1148
   macro avg       0.62      0.62      0.62      1148
weighted avg       0.62      0.62      0.62      1148



## Support Vector Machine

In [37]:
from sklearn.svm import SVC

In [39]:
classifier = SVC()
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
y_pred = classifier.predict(X_val)

In [41]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.75      0.65       579
           1       0.63      0.44      0.52       569

    accuracy                           0.59      1148
   macro avg       0.60      0.59      0.58      1148
weighted avg       0.60      0.59      0.58      1148



## Naive Bayes

In [43]:
from sklearn.naive_bayes import GaussianNB

In [44]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [45]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.61      0.61       579
           1       0.60      0.61      0.61       569

    accuracy                           0.61      1148
   macro avg       0.61      0.61      0.61      1148
weighted avg       0.61      0.61      0.61      1148



## Stochastic Gradient Descent

In [46]:
from sklearn.linear_model import SGDClassifier

In [47]:
classifier = SGDClassifier()
classifier.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [48]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.57      0.60       579
           1       0.60      0.66      0.63       569

    accuracy                           0.62      1148
   macro avg       0.62      0.62      0.62      1148
weighted avg       0.62      0.62      0.62      1148



## K-Nearest Neighbours

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [50]:
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [51]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.60      0.63       579
           1       0.63      0.68      0.65       569

    accuracy                           0.64      1148
   macro avg       0.64      0.64      0.64      1148
weighted avg       0.64      0.64      0.64      1148



## Decision Tree

In [52]:
from sklearn.tree import DecisionTreeClassifier

In [53]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [54]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.58      0.58       579
           1       0.57      0.57      0.57       569

    accuracy                           0.58      1148
   macro avg       0.58      0.58      0.58      1148
weighted avg       0.58      0.58      0.58      1148



## Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Ensemble - Voting

In [57]:
# Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)

In [58]:
# Support Vector Machine

svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_val)

In [59]:
# Naive Bayes

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)

In [60]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)

In [61]:
# K Nearest Neighbour

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)

In [62]:
# Decision Tree

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)

In [67]:
# Random Forest

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)

In [73]:
# Voting

y_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 1: one += 1
        if pred == 0: zero +=1
    if one > zero: y_pred.append(1)
    else: y_pred.append(0)

y_pred = np.array(y_pred)

In [74]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.61      0.65       579
           1       0.64      0.71      0.68       569

    accuracy                           0.66      1148
   macro avg       0.66      0.66      0.66      1148
weighted avg       0.66      0.66      0.66      1148

