## Environment Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
% cd /content/drive/MyDrive/Colab Notebooks/ICHCL_baseline

/content/drive/MyDrive/Colab Notebooks/ICHCL_baseline


## Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import nltk
nltk.download('stopwords')


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
import tensorflow as tf


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Initializing Stopwords and Stemmers

In [4]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

## Reading Data

In [5]:
train_directories = []
for i in glob("data/train/*/"):
    for j in glob(i+'*/'):
        train_directories.append(j)

In [6]:
data = []
for i in train_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))
labels = []
for i in train_directories:
    with open(i+'labels.json', encoding='utf-8') as f:
        labels.append(json.load(f))

In [7]:
def tr_flatten(d,l):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'label':l[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'label':l[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'label':l[j['tweet_id']]
                    })
    return flat_text

def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [8]:
data_label = []
#for train
for i in range(len(labels)):
    for j in tr_flatten(data[i], labels[i]):
        data_label.append(j)
train_len = len(data_label)

In [9]:
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)

In [10]:
tweets = df.text
y = df.label

## Preprocessing

In [11]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            token = hindi_stemmer.hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [12]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

## Featuring Raw Text

In [13]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()

## Test Train Split

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [15]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.72      0.72      0.72       566
        NONE       0.73      0.73      0.73       582

    accuracy                           0.73      1148
   macro avg       0.73      0.73      0.73      1148
weighted avg       0.73      0.73      0.73      1148



## Ensembling-Voting

In [15]:
# Importing Libraries

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)

In [17]:
# Support Vector Machine

svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_val)

In [18]:
# Naive Bayes

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)

In [19]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)

In [20]:
# K Nearest Neighbour

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)

In [21]:
# Decision Tree

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)

In [22]:
# Random Forest

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)

In [23]:
# Voting

y_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    # predictions = [lr_pred[i], nb_pred[i], sgd_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NONE': zero +=1
    if one > zero: y_pred.append('HOF')
    else: y_pred.append('NONE')

y_pred = np.array(y_pred)

In [24]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.73      0.71      0.72       566
        NONE       0.72      0.74      0.73       582

    accuracy                           0.72      1148
   macro avg       0.72      0.72      0.72      1148
weighted avg       0.72      0.72      0.72      1148



## Neural Network 1

In [34]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [35]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy']) #compiling a neural network with 3 layers for classification

In [36]:
model.fit(X_train, y_train, epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fdf031fc890>

In [37]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [38]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.70      0.70       566
           1       0.71      0.72      0.71       582

    accuracy                           0.71      1148
   macro avg       0.71      0.71      0.71      1148
weighted avg       0.71      0.71      0.71      1148



## Neural Network 2

In [62]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold
 
    def on_epoch_end(self, epoch, logs=None): 
        val_acc = logs["val_accuracy"]
        if val_acc >= self.threshold:
            self.model.stop_training = True

In [77]:
callback = MyThresholdCallback(threshold=0.73)

In [78]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dropout(0.8),
        Dense(32, activation="relu"),
        Dropout(0.6),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [79]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 64, validation_data=(X_val, y_val), callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000


<keras.callbacks.History at 0x7fdee3e5b990>

In [80]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))

In [81]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.74      0.73       566
           1       0.74      0.72      0.73       582

    accuracy                           0.73      1148
   macro avg       0.73      0.73      0.73      1148
weighted avg       0.73      0.73      0.73      1148



## Neural Network 3

In [140]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold
 
    def on_epoch_end(self, epoch, logs=None): 
        val_acc = logs["val_accuracy"]
        if val_acc >= self.threshold:
            self.model.stop_training = True

In [141]:
callback = MyThresholdCallback(threshold=0.74)

In [142]:
model = Sequential(
    [
        Dense(32, activation="relu"),
        Dropout(0.8),
        Dense(16, activation="relu"),
        Dropout(0.6),
        Dense(8, activation="sigmoid"),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [143]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 64, validation_data=(X_val, y_val), callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000


<keras.callbacks.History at 0x7fdef8a9e350>

In [144]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))

In [145]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73       566
           1       0.73      0.77      0.75       582

    accuracy                           0.74      1148
   macro avg       0.74      0.74      0.74      1148
weighted avg       0.74      0.74      0.74      1148



In [146]:
model.save('saved_models/nn_3')

INFO:tensorflow:Assets written to: saved_models/nn_3/assets


## Neural Network 4

In [112]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold
 
    def on_epoch_end(self, epoch, logs=None): 
        val_acc = logs["val_accuracy"]
        if val_acc >= self.threshold:
            self.model.stop_training = True

In [120]:
callback = MyThresholdCallback(threshold=0.75)

In [138]:
model = Sequential(
    [
        Dense(32, activation="relu"),
        Dropout(0.8),
        BatchNormalization(),
        Dense(32, activation="relu"),
        Dropout(0.8),
        Dense(16, activation="relu"),
        Dropout(0.6),
        Dense(8, activation="sigmoid"),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [139]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 64, validation_data=(X_val, y_val), callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x7fdef8c7f3d0>

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))

In [None]:
print(classification_report(y_val, y_pred))

## Loading Test Data

In [25]:
test_directories = []
for i in glob("data/test/*/"):
    for j in glob(i+'*/'):
        test_directories.append(j)

In [26]:
test_data = []
for i in test_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))

In [27]:
test_tweetid_data = []
#for test
for i in range(len(labels), len(data)):
    for j in te_flatten(data[i]):
        test_tweetid_data.append(j)

In [28]:
test_df = pd.DataFrame(test_tweetid_data, columns = test_tweetid_data[0].keys(), index = None)

In [29]:
test_df.head()

Unnamed: 0,tweet_id,text
0,1396844054818680835,Bhadva Ramdev was brought to a debate about Al...
1,1396844158283776004,Bhadva Ramdev was brought to a debate about Al...
2,1397043581446098945,Bhadva Ramdev was brought to a debate about Al...
3,1398265913749635073,Bhadva Ramdev was brought to a debate about Al...
4,1396852220268716032,Bhadva Ramdev was brought to a debate about Al...


## Preprocessing Test Data

In [30]:
test_tweets = test_df.text
tweet_ids = test_df.tweet_id

In [31]:
cleaned_test = [clean_tweet(tweet) for tweet in test_tweets]

In [32]:
X_test = vectorizer.transform(cleaned_test)
X_test = X_test.todense()

## Making Prediction from Test Data - Ensemble

In [33]:
lr_pred = lr.predict(X_test)
svc_pred = svc.predict(X_test)
nb_pred = nb.predict(X_test)
sgd_pred = sgd.predict(X_test)
knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)

In [34]:
# Voting

submission_prediction = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    # predictions = [lr_pred[i], nb_pred[i], sgd_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NONE': zero +=1
    if one > zero: submission_prediction.append('HOF')
    else: submission_prediction.append('NONE')

submission_prediction = np.array(submission_prediction)

## Making Prediction from Test Data - Neural Network

In [160]:
submission_prediction = model.predict(X_test)

In [161]:
submission_prediction = (submission_prediction > 0.5).astype('int64')
submission_prediction = submission_prediction.reshape(len(submission_prediction))

In [166]:
submission_prediction = submission_prediction.tolist()

for i in range(len(submission_prediction)):
    if submission_prediction[i] == 1:
        submission_prediction[i] = 'HOF'
    else:
        submission_prediction[i] = 'NONE'

submission_prediction = np.array(submission_prediction)

## Submitting Prediction

In [36]:
submission = {'tweet_id': tweet_ids, 'label':submission_prediction}
submission = pd.DataFrame(submission)

In [37]:
submission.to_csv('data/ensemble_submission.csv', index = False)