## Environment Setup

In [1]:
# ! pip install -q transformers
# ! pip install -q sentencepiece
# ! pip install -q nltk
# ! git clone https://github.com/bhargav25dave1996/ICHCL_baseline.git
# % cd /content/ICHCL_baseline

## Importing Libraries

In [2]:
import nltk
nltk.download('stopwords')

import pandas as pd
import numpy as np
from glob import glob
import re
import json

import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

import torch
import tensorflow as tf

[nltk_data] Downloading package stopwords to /home/sri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-08-21 18:58:33.906299: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-21 18:58:33.906360: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

## Reading Data

In [4]:
train_directories = []
for i in glob("data/train/*/"):
    for j in glob(i+'*/'):
        train_directories.append(j)

In [5]:
data = []
for i in train_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))
labels = []
for i in train_directories:
    with open(i+'labels.json', encoding='utf-8') as f:
        labels.append(json.load(f))

In [6]:
def tr_flatten(d,l):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'label':l[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'label':l[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'label':l[j['tweet_id']]
                    })
    return flat_text

def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [7]:
data_label = []
#for train
for i in range(len(labels)):
    for j in tr_flatten(data[i], labels[i]):
        data_label.append(j)
train_len = len(data_label)

In [8]:
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)

In [9]:
tweets = df.text
y = df.label

## Preprocessing

In [10]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            token = hindi_stemmer.hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [11]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

In [12]:
'''
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()
'''

'\nvectorizer = TfidfVectorizer(min_df = 5)\nX = vectorizer.fit_transform(cleaned_tweets)\nX = X.todense()\n'

In [13]:
X = cleaned_tweets

In [14]:
y = y.to_list()

for i in range(len(y)):
    if y[i] == 'HOF':
        y[i] = 1
    else:
        y[i] = 0

## Test-Train Split

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Load Model

In [16]:
from transformers import AutoTokenizer, AutoModel, pipeline
  
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

model = AutoModel.from_pretrained("ai4bharat/indic-bert")

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.decoder.bias', 'predictions.bias', 'sop_classifier.classifier.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Sample Output

In [18]:
tokenized_input = tokenizer(
        X_train[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

sample_output = model(**tokenized_input)

print(sample_output)
print(type(sample_output))

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.0135,  0.0037, -0.0251,  ..., -0.0244,  0.0099, -0.0095],
         [ 0.2095, -0.0659, -0.1277,  ...,  0.1147, -0.0254, -0.2286],
         [ 0.3501, -0.4017, -0.1892,  ...,  0.0837,  0.1546, -0.2720],
         ...,
         [ 0.5263, -0.2966, -0.1900,  ...,  0.2064,  0.0903, -0.4016],
         [ 0.2850, -0.1415, -0.3030,  ...,  0.0143,  0.0323, -0.6834],
         [-0.0135,  0.0037, -0.0251,  ..., -0.0244,  0.0099, -0.0095]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.0575,  0.0494,  0.0119, -0.0042,  0.0405,  0.0829,  0.0384, -0.0146,
         -0.0115,  0.0616, -0.0072,  0.0472,  0.0190, -0.0166,  0.0470, -0.0213,
         -0.0416,  0.0014, -0.1177, -0.0831,  0.0768,  0.0778, -0.0334, -0.0972,
          0.0065, -0.0343, -0.0522, -0.0046,  0.0273,  0.0091,  0.0234,  0.0004,
          0.0096,  0.0340, -0.0018, -0.0371, -0.0216, -0.0156,  0.1124,  0.0201,
          0.0282, -0.0235, -0.1167,  0.0445, -0.068

## Embedding the Training Data

In [19]:
print(len(X_train))

4592


In [20]:
input = []

for text in X_train:
    tokenized_input = tokenizer(
        text,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

print(len(input))

4592


In [21]:
output = []

j = 1

for i in range(len(input)):
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        file_name = "./X_train/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:   #Pickling
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./X_train/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:   #Pickling
    pickle.dump(output, fp)
print(file_name + " done")
output = []

./X_train/output1.txt done
./X_train/output2.txt done
./X_train/output3.txt done
./X_train/output4.txt done
./X_train/output5.txt done
./X_train/output6.txt done
./X_train/output7.txt done
./X_train/output8.txt done
./X_train/output9.txt done
./X_train/output10.txt done
./X_train/output11.txt done
./X_train/output12.txt done
./X_train/output13.txt done
./X_train/output14.txt done
./X_train/output15.txt done
./X_train/output16.txt done
./X_train/output17.txt done
./X_train/output18.txt done
./X_train/output19.txt done
./X_train/output20.txt done
./X_train/output21.txt done
./X_train/output22.txt done
./X_train/output23.txt done


In [22]:
output = []

for i in range(23):
    file_name = "./X_train/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./X_train/output1.txt done
./X_train/output2.txt done
./X_train/output3.txt done
./X_train/output4.txt done
./X_train/output5.txt done
./X_train/output6.txt done
./X_train/output7.txt done
./X_train/output8.txt done
./X_train/output9.txt done
./X_train/output10.txt done
./X_train/output11.txt done
./X_train/output12.txt done
./X_train/output13.txt done
./X_train/output14.txt done
./X_train/output15.txt done
./X_train/output16.txt done
./X_train/output17.txt done
./X_train/output18.txt done
./X_train/output19.txt done
./X_train/output20.txt done
./X_train/output21.txt done
./X_train/output22.txt done
./X_train/output23.txt done


In [23]:
X_train = output
output = []

In [24]:
print(len(X_train))

4592


## Embedding the Testing Data

In [25]:
input = []

for text in X_val:
    tokenized_input = tokenizer(
        text,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

print(len(input))

1148


In [26]:
output = []

j = 1

for i in range(len(input)):
    model_output = model(**input[i])
    model_output = model_output.pooler_output.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        file_name = "./X_val/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:   #Pickling
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./X_val/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:   #Pickling
    pickle.dump(output, fp)
print(file_name + " done")
output = []

./X_val/output1.txt done
./X_val/output2.txt done
./X_val/output3.txt done
./X_val/output4.txt done
./X_val/output5.txt done
./X_val/output6.txt done


In [27]:
output = []

for i in range(6):
    file_name = "./X_val/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./X_val/output1.txt done
./X_val/output2.txt done
./X_val/output3.txt done
./X_val/output4.txt done
./X_val/output5.txt done
./X_val/output6.txt done


In [28]:
X_val = output
output = []
print(len(X_val))

1148


## Logistic Regression