# **SUBTASK 1 - ENGLISH**

## Environment Setup

In [1]:
% cd /content/drive/MyDrive/Colab Notebooks/HASOC2021-EnglishHindiMarathi 
! pip install -q transformers

/content/drive/MyDrive/Colab Notebooks/HASOC2021-EnglishHindiMarathi
[K     |████████████████████████████████| 2.6 MB 12.3 MB/s 
[K     |████████████████████████████████| 636 kB 66.0 MB/s 
[K     |████████████████████████████████| 895 kB 61.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 61.4 MB/s 
[?25h

## Importing Libraries

In [2]:
import nltk
nltk.download('stopwords')

import pandas as pd
import numpy as np
from glob import glob
import re
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import transformers

import torch

import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Initialising Stopwords and Stemmers

In [3]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

## Reading Training Data

In [4]:
df=pd.read_excel("trainingSet/en_Hasoc2021_train.xlsx")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,text,task_1,task_2
0,4986,60c5d6bf5659ea5e55defa2c,@wealth if you made it through this &amp;&amp;...,HOF,PRFN
1,3394,60c5d6bf5659ea5e55def461,Technically that's still turning back the cloc...,HOF,OFFN
2,1310,60c5d6bf5659ea5e55defaad,@VMBJP @BJP4Bengal @BJP4India @narendramodi @J...,NOT,NONE
3,3390,60c5d6bf5659ea5e55def419,@krtoprak_yigit Soldier of Japan Who has dick ...,HOF,OFFN
4,4626,60c5d6bf5659ea5e55def7fa,@blueheartedly You'd be better off asking who ...,HOF,OFFN


In [6]:
tweets = df.text
y = df.task_1

## Pre-Processing

In [7]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [8]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

## Featuring Raw Text

In [9]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()

## Test Train Split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [None]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.80      0.92      0.86       529
         NOT       0.73      0.50      0.59       240

    accuracy                           0.79       769
   macro avg       0.77      0.71      0.72       769
weighted avg       0.78      0.79      0.77       769



## Ensembling - Voting

In [None]:
# Importing Libraries

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)

In [None]:
print(classification_report(y_val, lr_pred))

              precision    recall  f1-score   support

         HOF       0.80      0.92      0.86       529
         NOT       0.73      0.50      0.59       240

    accuracy                           0.79       769
   macro avg       0.77      0.71      0.72       769
weighted avg       0.78      0.79      0.77       769



In [None]:
# Support Vector Machine

svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_val)

In [None]:
print(classification_report(y_val, svc_pred))

              precision    recall  f1-score   support

         HOF       0.80      0.93      0.86       529
         NOT       0.75      0.50      0.60       240

    accuracy                           0.79       769
   macro avg       0.78      0.71      0.73       769
weighted avg       0.79      0.79      0.78       769



In [None]:
# Naive Bayes

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)

In [None]:
print(classification_report(y_val, nb_pred))

              precision    recall  f1-score   support

         HOF       0.84      0.41      0.55       529
         NOT       0.39      0.83      0.53       240

    accuracy                           0.54       769
   macro avg       0.62      0.62      0.54       769
weighted avg       0.70      0.54      0.55       769



In [None]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)

In [None]:
print(classification_report(y_val, sgd_pred))

              precision    recall  f1-score   support

         HOF       0.82      0.85      0.83       529
         NOT       0.64      0.59      0.61       240

    accuracy                           0.77       769
   macro avg       0.73      0.72      0.72       769
weighted avg       0.76      0.77      0.77       769



In [None]:
# K Nearest Neighbour

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)

In [None]:
print(classification_report(y_val, knn_pred))

              precision    recall  f1-score   support

         HOF       0.91      0.14      0.25       529
         NOT       0.34      0.97      0.50       240

    accuracy                           0.40       769
   macro avg       0.63      0.56      0.37       769
weighted avg       0.74      0.40      0.33       769



In [None]:
# Decision Tree

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)

In [None]:
print(classification_report(y_val, dt_pred))

              precision    recall  f1-score   support

         HOF       0.81      0.75      0.78       529
         NOT       0.53      0.61      0.57       240

    accuracy                           0.71       769
   macro avg       0.67      0.68      0.67       769
weighted avg       0.72      0.71      0.71       769



In [None]:
# Random Forest

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)

In [None]:
print(classification_report(y_val, rf_pred))

              precision    recall  f1-score   support

         HOF       0.83      0.86      0.85       529
         NOT       0.67      0.62      0.65       240

    accuracy                           0.79       769
   macro avg       0.75      0.74      0.75       769
weighted avg       0.78      0.79      0.78       769



In [None]:
# Voting

y_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NOT': zero +=1
    if one > zero: y_pred.append('HOF')
    else: y_pred.append('NOT')

y_pred = np.array(y_pred)

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.83      0.86      0.85       529
         NOT       0.67      0.62      0.64       240

    accuracy                           0.79       769
   macro avg       0.75      0.74      0.75       769
weighted avg       0.78      0.79      0.78       769



In [None]:
# Voting only using lr, svc & sgd as rest have bad scores

y_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    predictions = [lr_pred[i], svc_pred[i], sgd_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NOT': zero +=1
    if one > zero: y_pred.append('HOF')
    else: y_pred.append('NOT')

y_pred = np.array(y_pred)

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.81      0.92      0.86       529
         NOT       0.74      0.52      0.61       240

    accuracy                           0.79       769
   macro avg       0.77      0.72      0.73       769
weighted avg       0.79      0.79      0.78       769



## Label Encoding

In [11]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

## Scaling Features

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)

## Neural Network 1

In [None]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f51f57a6dd0>

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       529
           1       0.57      0.59      0.58       240

    accuracy                           0.73       769
   macro avg       0.69      0.69      0.69       769
weighted avg       0.74      0.73      0.73       769



## Neural Network 2

In [13]:
class MyThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold
 
    def on_epoch_end(self, epoch, logs=None): 
        val_acc = logs["val_accuracy"]
        if val_acc >= self.threshold:
            self.model.stop_training = True

In [14]:
callback = MyThresholdCallback(threshold=0.79)

In [21]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dropout(0.8),
        Dense(32, activation="relu"),
        Dropout(0.6),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [22]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 64, validation_data=(X_val, y_val), callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000


<keras.callbacks.History at 0x7f35312666d0>

In [23]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))

In [24]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       529
           1       0.76      0.50      0.60       240

    accuracy                           0.79       769
   macro avg       0.78      0.71      0.73       769
weighted avg       0.79      0.79      0.78       769



## Neural Network 3

In [None]:
callback = MyThresholdCallback(threshold=0.79)

In [None]:
model = Sequential(
    [
        # Dense(16, activation="relu"),
        # Dropout(0.8),
        # Dense(16, activation="relu"),
        # Dropout(0.6),
        Dense(4, activation="sigmoid"),
        Dropout(0.6),
        Dense(4, activation="sigmoid"),
        Dropout(0.6),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 64, validation_data=(X_val, y_val), callbacks=[callback])

Epoch 1/1000


Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x7f520f001c20>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 546, in __del__
    handle=self._handle, deleter=self._deleter)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1264, in delete_iterator
    _ctx, "DeleteIterator", name, handle, deleter)
KeyboardInterrupt: 


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


<keras.callbacks.History at 0x7f51d5266790>

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))

In [None]:
print(classification_report(y_val, y_pred))

## Distilbert Base Uncased Finetuned SST 2 English

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
tokenized_input = tokenizer(
    cleaned_tweets,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

In [None]:
tokenized_input['input_ids'] = tokenized_input['input_ids'].cpu().detach().numpy()
tokenized_input['attention_mask'] = tokenized_input['attention_mask'].cpu().detach().numpy()

In [None]:
output = []

j = 1

for i in range(len(cleaned_tweets)):
    
    input = {}
    input['input_ids'] = torch.from_numpy(tokenized_input['input_ids'][i].reshape(1, 99))
    input['attention_mask'] = torch.from_numpy(tokenized_input['attention_mask'][i].reshape(1, 99))

    model_output = model(**input)
    # model_output = model_output.last_hidden_state.cpu().detach().numpy()
    output.append(model_output)
    if ((i + 1) % 200) == 0:
        file_name = "./embed_feat/output" + str(j) + ".txt"
        with open(file_name, "wb") as fp:
            pickle.dump(output, fp)
        print(file_name + " done")
        output = []
        j += 1
        
file_name = "./embed_feat/output" + str(j) + ".txt"
with open(file_name, "wb") as fp:
    pickle.dump(output, fp)
print(file_name + " done")
output = []

./embed_feat/output1.txt done
./embed_feat/output2.txt done
./embed_feat/output3.txt done
./embed_feat/output4.txt done
./embed_feat/output5.txt done
./embed_feat/output6.txt done
./embed_feat/output7.txt done
./embed_feat/output8.txt done
./embed_feat/output9.txt done
./embed_feat/output10.txt done
./embed_feat/output11.txt done
./embed_feat/output12.txt done
./embed_feat/output13.txt done
./embed_feat/output14.txt done
./embed_feat/output15.txt done
./embed_feat/output16.txt done
./embed_feat/output17.txt done
./embed_feat/output18.txt done
./embed_feat/output19.txt done
./embed_feat/output20.txt done


In [None]:
output = []

for i in range(20):
    file_name = "./embed_feat/output" + str(i + 1) + ".txt"
    with open(file_name, "rb") as fp:   #Pickling
        file_output = pickle.load(fp)
        for x in file_output:
            output.append(x)
    print(file_name + " done")

./embed_feat/output1.txt done
./embed_feat/output2.txt done
./embed_feat/output3.txt done
./embed_feat/output4.txt done
./embed_feat/output5.txt done
./embed_feat/output6.txt done
./embed_feat/output7.txt done
./embed_feat/output8.txt done
./embed_feat/output9.txt done
./embed_feat/output10.txt done
./embed_feat/output11.txt done
./embed_feat/output12.txt done
./embed_feat/output13.txt done
./embed_feat/output14.txt done
./embed_feat/output15.txt done
./embed_feat/output16.txt done
./embed_feat/output17.txt done
./embed_feat/output18.txt done
./embed_feat/output19.txt done
./embed_feat/output20.txt done


In [None]:
X = output
output = []

In [None]:
len(X)

3843

In [None]:
for i in range(len(X)):
    X[i] = X[i].logits.cpu().detach().numpy()
X = np.array(X)

In [None]:
X = X.reshape(3843, 2)

In [None]:
with open("./embed_label/label.txt", "wb") as fp:
    pickle.dump(y, fp)

In [None]:
with open('./embed_label/label.txt', "rb") as fp:
        y = pickle.load(fp)

In [None]:
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = Sequential(
    [
        Dense(512, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 50, batch_size = 64, validation_data=(X_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f889ceaf410>

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       240
           1       0.69      1.00      0.82       529

    accuracy                           0.69       769
   macro avg       0.34      0.50      0.41       769
weighted avg       0.47      0.69      0.56       769



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [25]:
% cd /content/drive/MyDrive/Colab Notebooks/HASOC2021-EnglishHindiMarathi/testSet

/content/drive/MyDrive/Colab Notebooks/HASOC2021-EnglishHindiMarathi/testSet


In [26]:
df_test=pd.read_csv("en_Hasoc2021_test_task1.csv")

In [27]:
df_test.head()


Unnamed: 0,_id,text
0,60c5d6bf5659ea5e55deffcb,Fewer people coming in for vaccinations. So sa...
1,60c5d6bf5659ea5e55df028c,@MattHancock This may all be true. But... What...
2,60c5d6bf5659ea5e55def377,@Layla_EFC I’ve unfollowed him the wanker
3,60c5d6bf5659ea5e55def4c7,You guys are losing it all over the world. The...
4,60c5d6bf5659ea5e55df01a6,"And thus death laughs... It is sad merriment, ..."


In [28]:
test_tweets = df_test.text
tweet_ids = df_test._id

In [29]:
cleaned_test = [clean_tweet(tweet) for tweet in test_tweets]

In [30]:
X_test = vectorizer.transform(cleaned_test)
X_test = X_test.todense()

In [31]:
y_test=model.predict(X_test)
y_test = (y_test > 0.5).astype('int64')
y_test = y_test.reshape(len(y_test)) 

In [32]:
y_test


array([1, 0, 0, ..., 0, 1, 0])

In [33]:
y_test=y_test.tolist()

for i in range(len(y_test)):
  if y_test[i]==0:
    y_test[i]="HOF"
  else:
    y_test[i]="NOT"

In [34]:
y_test=np.array(y_test)

In [35]:
y_test

array(['NOT', 'HOF', 'HOF', ..., 'HOF', 'NOT', 'HOF'], dtype='<U3')

In [36]:
submission = {'id':tweet_ids,'label':y_test}
submission = pd.DataFrame(submission)

In [37]:
submission.head()

Unnamed: 0,id,label
0,60c5d6bf5659ea5e55deffcb,NOT
1,60c5d6bf5659ea5e55df028c,HOF
2,60c5d6bf5659ea5e55def377,HOF
3,60c5d6bf5659ea5e55def4c7,HOF
4,60c5d6bf5659ea5e55df01a6,NOT


In [38]:
submission.to_csv('eng_nn.csv', index = False)