In [1]:
%cd /content/drive/MyDrive/HASOC2021-EnglishHindiMarathi/trainingSet

/content/drive/MyDrive/HASOC2021-EnglishHindiMarathi/trainingSet


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import tensorflow as tf

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout,BatchNormalization,GRU

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

In [4]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

In [5]:
df=pd.read_excel("hi_Hasoc2021_train.xlsx")

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,tweet_id,text,task_1,task_2
0,998,60c5d7495659ea5e55df0b7b,hi_hasoc_2021_998,@rssurjewala #Hindus DYING #HindusLivesMatter ...,NOT,NONE
1,4049,60c5d7495659ea5e55df1b73,hi_hasoc_2021_4049,सब लोग इतने पैसे डोनेट ही कर रहे हैं फिर भी आम...,NOT,NONE
2,1757,60c5d7495659ea5e55df0d1b,hi_hasoc_2021_1757,शेर-ए- सिवान शहाबुद्दीन साहब से रिश्ता क्या ل...,NOT,NONE
3,5175,60c5d7495659ea5e55df0e45,hi_hasoc_2021_5178,@AskAnshul आसमानी किताब के नाजायज औलाद है।,HOF,OFFN
4,1825,60c5d7495659ea5e55df0ee1,hi_hasoc_2021_1825,@Shikha0222 इसे कहते हैं दोगला पंती जिस सपा की...,NOT,NONE


## **Text preprocessing**

In [7]:
tweets = df.text
y = df.task_1

In [8]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [9]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

In [10]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()

## **Training and testing**

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## **Logistic regression**

In [None]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = classifier.predict(X_val)

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.75      0.42      0.54       297
         NOT       0.77      0.93      0.84       622

    accuracy                           0.77       919
   macro avg       0.76      0.68      0.69       919
weighted avg       0.76      0.77      0.75       919



## **XGBoost**

In [32]:
from xgboost import XGBClassifier

In [41]:
model = XGBClassifier(earning_rate=0.1,n_estimators=1000,max_depth=15,min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,objective= 'binary:logistic',nthread=4,scale_pos_weight=1,seed=27)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, earning_rate=0.1,
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=4, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27,
              silent=None, subsample=0.8, verbosity=1)

In [42]:
y_pred = model.predict(X_val)

In [43]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.65      0.56      0.60       297
         NOT       0.80      0.86      0.83       622

    accuracy                           0.76       919
   macro avg       0.73      0.71      0.71       919
weighted avg       0.75      0.76      0.76       919



## **Neural Network**

In [None]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [None]:
model = Sequential(
    [
        Dense(128, activation="relu"),
        Dropout(0.8),
        Dense(32, activation="relu"),
        Dropout(0.8),
        Dense(32, activation="relu"),
        Dropout(0.8),
        Dense(16, activation="relu"),
        Dropout(0.6),
        Dense(8, activation="sigmoid"),
        Dropout(0.4),
        BatchNormalization(),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 30, batch_size = 32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f6a347eb850>

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred)) 

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.56      0.60       297
           1       0.80      0.85      0.82       622

    accuracy                           0.76       919
   macro avg       0.72      0.70      0.71       919
weighted avg       0.75      0.76      0.75       919



In [None]:
#model.save("nn1_78")

INFO:tensorflow:Assets written to: nn1_78/assets


In [None]:
nn1_78=tf.keras.models.load_model("nn1_78")

In [None]:
y_pred = nn1_78.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred)) 

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.46      0.57       297
           1       0.78      0.93      0.85       622

    accuracy                           0.78       919
   macro avg       0.77      0.69      0.71       919
weighted avg       0.77      0.78      0.76       919



## **Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.36      0.81      0.50       297
         NOT       0.77      0.31      0.44       622

    accuracy                           0.47       919
   macro avg       0.57      0.56      0.47       919
weighted avg       0.64      0.47      0.46       919



## **K Nearest Neighbours**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.41      0.85      0.55       297
         NOT       0.85      0.43      0.57       622

    accuracy                           0.56       919
   macro avg       0.63      0.64      0.56       919
weighted avg       0.71      0.56      0.56       919



## **Decision Trees**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.60      0.57      0.58       297
         NOT       0.80      0.82      0.81       622

    accuracy                           0.74       919
   macro avg       0.70      0.69      0.69       919
weighted avg       0.73      0.74      0.73       919



## **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_classi = RandomForestClassifier()
rf_classi.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = rf_classi.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.72      0.49      0.58       297
         NOT       0.79      0.91      0.84       622

    accuracy                           0.77       919
   macro avg       0.75      0.70      0.71       919
weighted avg       0.77      0.77      0.76       919



## **Support Vector Machine**

In [None]:
from sklearn.svm import SVC

In [None]:
classifier = SVC()
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.75      0.43      0.55       297
         NOT       0.78      0.93      0.85       622

    accuracy                           0.77       919
   macro avg       0.76      0.68      0.70       919
weighted avg       0.77      0.77      0.75       919



## **Stochastic Gradient Descent**

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
classifier = SGDClassifier()
classifier.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
y_pred = classifier.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.64      0.51      0.57       297
         NOT       0.79      0.86      0.82       622

    accuracy                           0.75       919
   macro avg       0.71      0.69      0.69       919
weighted avg       0.74      0.75      0.74       919



## **Ensembling-Voting**

In [None]:
# Importing Libraries

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Logistic Regression

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)


# Support Vector Machine

svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_val)

# Naive Bayes

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_val)

# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_val)

# K Nearest Neighbour

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_val)

# Decision Tree

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_val)

# Random Forest

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)

In [None]:
# Voting

y_pred = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    #predictions = [lr_pred[i], svc_pred[i], sgd_pred[i], dt_pred[i], rf_pred[i]]
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    # predictions = [lr_pred[i], nb_pred[i], sgd_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NOT': zero +=1
    if one > zero: y_pred.append('HOF')
    else: y_pred.append('NOT')

y_pred = np.array(y_pred)

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.70      0.52      0.60       297
         NOT       0.80      0.89      0.84       622

    accuracy                           0.77       919
   macro avg       0.75      0.71      0.72       919
weighted avg       0.76      0.77      0.76       919



## **Loading test data**

In [16]:
%cd /content/drive/MyDrive/HASOC2021-EnglishHindiMarathi/testSet

/content/drive/MyDrive/HASOC2021-EnglishHindiMarathi/testSet


In [17]:
df_test=pd.read_csv("hi_Hasoc2021_test_task1.csv")

In [18]:
df_test.head()

Unnamed: 0,_id,tweet_id,text
0,60c5d7495659ea5e55df0546,hi_hasoc_2021_5,@hemantmkpandya @news24tvchannel @Aloksharmaai...
1,60c5d7495659ea5e55df0591,hi_hasoc_2021_7,वोडाफोन ने एक कुत्ता पाला था बहुत फेमस हुआ फ...
2,60c5d7495659ea5e55df0622,hi_hasoc_2021_12,18-18 घंटे दीमक ने जाकर 70 साल के मज़बूत पेड़ ...
3,60c5d7495659ea5e55df0666,hi_hasoc_2021_13,@dmfatehpur हमारे ग्राम पंचायत सिधांव जिला फते...
4,60c5d7495659ea5e55df067b,hi_hasoc_2021_15,यह मुझे चैन क्यों नहीं पड़ता एक ही शख़्स था जह...


In [19]:
test_tweets = df_test.text
tweet_ids = df_test.tweet_id

In [20]:
cleaned_test = [clean_tweet(tweet) for tweet in test_tweets]

In [21]:

X_test = vectorizer.transform(cleaned_test)
X_test = X_test.todense()


In [None]:
lr_pred = lr.predict(X_test)
svc_pred = svc.predict(X_test)
nb_pred = nb.predict(X_test)
sgd_pred = sgd.predict(X_test)
knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)

In [None]:
# Voting

y_test = []

for i in range(len(lr_pred)):
    one = 0
    zero = 0
    #predictions = [lr_pred[i], svc_pred[i], sgd_pred[i], dt_pred[i], rf_pred[i]]
    predictions = [lr_pred[i], svc_pred[i], nb_pred[i], sgd_pred[i], knn_pred[i], dt_pred[i], rf_pred[i]]
    # predictions = [lr_pred[i], nb_pred[i], sgd_pred[i], dt_pred[i], rf_pred[i]]
    for pred in predictions:
        if pred == 'HOF': one += 1
        if pred == 'NOT': zero +=1
    if one > zero: y_test.append('HOF')
    else: y_test.append('NOT')

y_test = np.array(y_test)

In [44]:
y_test=model.predict(X_test)

In [45]:
print(y_test)

['HOF' 'HOF' 'NOT' ... 'NOT' 'NOT' 'NOT']


In [46]:
id=df_test._id

In [47]:
submission = {'id':tweet_ids,'label':y_test}
submission = pd.DataFrame(submission)

In [48]:
submission.head()

Unnamed: 0,id,label
0,hi_hasoc_2021_5,HOF
1,hi_hasoc_2021_7,HOF
2,hi_hasoc_2021_12,NOT
3,hi_hasoc_2021_13,HOF
4,hi_hasoc_2021_15,NOT


In [49]:
submission.to_csv('hindi_xgb.csv', index = False)

In [None]:
y_test=nn1_78.predict(X_test)
y_test = (y_test > 0.5).astype('int64')
y_test = y_test.reshape(len(y_test)) 

In [None]:
y_test

array([0, 1, 1, ..., 1, 1, 1])

In [None]:
y_test=y_test.tolist()

for i in range(len(y_test)):
  if y_test[i]==0:
    y_test[i]="HOF"
  else:
    y_test[i]="NOT"
    

In [None]:
y_test=np.array(y_test)

In [None]:
y_test

array([0, 1, 1, ..., 1, 1, 1])

In [None]:
y_test

array(['HOF', 'NOT', 'NOT', ..., 'NOT', 'NOT', 'NOT'], dtype='<U3')

In [None]:
submission = {'id':tweet_ids,'label':y_test}
submission = pd.DataFrame(submission)

In [None]:
submission.head()

Unnamed: 0,id,label
0,hi_hasoc_2021_5,HOF
1,hi_hasoc_2021_7,NOT
2,hi_hasoc_2021_12,NOT
3,hi_hasoc_2021_13,NOT
4,hi_hasoc_2021_15,NOT


In [None]:
submission.to_csv('hindi_nn.csv', index = False)