### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout


import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

2021-08-20 11:31:34.584486: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-20 11:31:34.584525: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Initialising stop-words and stemmers 

In [3]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

## Reading Data

In [4]:
train_directories = []
for i in glob("data/train/*/"):
    for j in glob(i+'*/'):
        train_directories.append(j)

In [5]:
# train_directories

In [6]:
data = []
for i in train_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))
labels = []
for i in train_directories:
    with open(i+'labels.json', encoding='utf-8') as f:
        labels.append(json.load(f))

In [7]:
def tr_flatten(d,l):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'label':l[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'label':l[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'label':l[j['tweet_id']]
                    })
    return flat_text

def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [8]:
data_label = []

for i in range(len(labels)):
    for j in tr_flatten(data[i], labels[i]):
        data_label.append(j)
train_len = len(data_label)

In [9]:
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)

In [10]:
df.head()

Unnamed: 0,tweet_id,text,label
0,1397101600460529665,Countries which have Banned Twitter\n\n🇨🇳 Chin...,HOF
1,1397101827116703744,Countries which have Banned Twitter\n\n🇨🇳 Chin...,NONE
2,1397101939674869763,Countries which have Banned Twitter\n\n🇨🇳 Chin...,HOF
3,1397102700173488133,Countries which have Banned Twitter\n\n🇨🇳 Chin...,HOF
4,1397102906004754433,Countries which have Banned Twitter\n\n🇨🇳 Chin...,HOF


In [11]:
df['label'].value_counts()

NONE    2899
HOF     2841
Name: label, dtype: int64

In [12]:
tweets = df.text
y = df.label

## Preprocessing

In [13]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            token = hindi_stemmer.hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [14]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

In [15]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()

## Training and evaluating model

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [17]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression()

In [18]:
y_pred = classifier.predict(X_val)

In [19]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.73      0.70      0.72       577
        NONE       0.71      0.74      0.73       571

    accuracy                           0.72      1148
   macro avg       0.72      0.72      0.72      1148
weighted avg       0.72      0.72      0.72      1148



In [20]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression()

<p>Predicting and priting classification metrics for validation set.</p>

In [21]:
y_pred = classifier.predict(X_val)

In [22]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.73      0.70      0.72       577
        NONE       0.71      0.74      0.73       571

    accuracy                           0.72      1148
   macro avg       0.72      0.72      0.72      1148
weighted avg       0.72      0.72      0.72      1148



## SVM

In [23]:
classifier = SVC()

In [24]:
classifier.fit(X_train, y_train)

SVC()

In [25]:
y_pred = classifier.predict(X_val)

In [26]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.74      0.70      0.72       577
        NONE       0.71      0.75      0.73       571

    accuracy                           0.73      1148
   macro avg       0.73      0.73      0.73      1148
weighted avg       0.73      0.73      0.73      1148



## Base Neural Network

In [27]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [28]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy']) #compiling a neural network with 3 layers for classification

2021-08-20 11:32:38.267049: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-20 11:32:38.267939: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-20 11:32:38.268363: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2021-08-20 11:32:38.268755: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2021-08-20 11:32:38.269511: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

In [29]:
model.fit(X_train, y_train, epochs = 5, batch_size = 16, verbose=1)

2021-08-20 11:32:38.404714: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff5bc2865e0>

In [30]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [31]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70       577
           1       0.69      0.75      0.72       571

    accuracy                           0.71      1148
   macro avg       0.71      0.71      0.71      1148
weighted avg       0.71      0.71      0.71      1148



In [32]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [33]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy']) #compiling a neural network with 3 layers for classification

In [34]:
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=2)

Epoch 1/5
144/144 - 1s - loss: 0.6336 - accuracy: 0.6631
Epoch 2/5
144/144 - 0s - loss: 0.5278 - accuracy: 0.7500
Epoch 3/5
144/144 - 0s - loss: 0.4542 - accuracy: 0.7949
Epoch 4/5
144/144 - 0s - loss: 0.3946 - accuracy: 0.8267
Epoch 5/5
144/144 - 0s - loss: 0.3379 - accuracy: 0.8547


<keras.callbacks.History at 0x7ff5b02253a0>

In [35]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [36]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.73      0.72       577
           1       0.72      0.68      0.70       571

    accuracy                           0.71      1148
   macro avg       0.71      0.71      0.71      1148
weighted avg       0.71      0.71      0.71      1148



## Predicting test data and making a sample submission file

In [37]:
test_directories = []
for i in glob("data/test/*/"):
    for j in glob(i+'*/'):
        test_directories.append(j)

In [38]:
test_directories

[]

In [39]:
test_data = []
for i in test_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))

In [40]:
test_tweetid_data = []
#for test
for i in range(len(labels), len(data)):
    for j in te_flatten(data[i]):
        test_tweetid_data.append(j)

In [41]:
test_df = pd.DataFrame(test_tweetid_data, columns = test_tweetid_data[0].keys(), index = None)

IndexError: list index out of range

In [None]:
test_df.head()

In [None]:
test_tweets = test_df.text
tweet_ids = test_df.tweet_id

In [None]:
cleaned_test = [clean_tweet(tweet) for tweet in test_tweets]

In [None]:
X_test = vectorizer.transform(cleaned_test)
X_test = X_test.todense()

In [None]:
submission_prediction = classifier.predict(X_test)
submission = {'tweet_id': tweet_ids, 'label':submission_prediction}
submission = pd.DataFrame(submission)

In [None]:
submission.to_csv('data/sample_submission.csv', index = False)