In [None]:
# import all the required libraries.
# These includes libraries for preprocessing, and training the models
import re
import nltk
import keras
import emoji
import torch
import wordsegment
import numpy as np
import transformers
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
from keras.layers import Dense
from sklearn import naive_bayes
from keras.models import Sequential
from sklearn.metrics import f1_score
from wordsegment import load, segment
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
data = pd.read_csv('/content/drive/MyDrive/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')

In [None]:
data_test_a = pd.read_csv('drive/My Drive/OLIDv1.0/testset-levela.tsv', sep='\t')
data_test_b = pd.read_csv('drive/My Drive/OLIDv1.0/testset-levelb.tsv', sep='\t')
data_test_c = pd.read_csv('drive/My Drive/OLIDv1.0/testset-levelc.tsv', sep='\t')

In [None]:
label_test_a = pd.read_csv('drive/My Drive/OLIDv1.0/labels-levela.csv', header=None)
label_test_b = pd.read_csv('drive/My Drive/OLIDv1.0/labels-levelb.csv', header=None)
label_test_c = pd.read_csv('drive/My Drive/OLIDv1.0/labels-levelc.csv', header=None)

In [None]:
# function for preprocessing the tweets
def t_preprocess(x):
    pred = []
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(['url'])
    txt = re.compile(r'[^a-zA-Z]')
    # Removing @ user tags from data
    usr_rmv = re.compile("@[A-Za-z0-9]+")
    # Removing # tags from data 
    hash_rmv = re.compile("#[A-Za-z0-9]+") 
    # Regex pattern for whole-word numbers
    number_pattern = re.compile(r'\b\d+\b')
    enti = []
    # segment the words with #. For eg. #nojustice becomes no justice
    load()
    for i in range(0, len(x)):
      print
      sent_tokens = x[i].split(' ')
      for j, t in enumerate(sent_tokens):
          if t.find('#') == 0:
              sent_tokens[j] = ' '.join(wordsegment.segment(t))
      x[i] = ' '.join(sent_tokens)
    
    # Loop through each tweet in the data list to do the preprocessing
    for j in range(0, len(x)):
        lines = x[j].split("\n")
        for i in range(0, len(lines)):
            lines[i] = usr_rmv.sub('', lines[i])
            # convert emojis to words
            lines[i] = emoji.demojize(lines[i])
            lines[i] = number_pattern.sub('', lines[i])
            filters='!"\'$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
            translate_dict = dict((c, " ") for c in filters)
            translate_map = str.maketrans(translate_dict)
            lines[i] = lines[i].translate(translate_map)
            lines[i] = txt.sub(' ', lines[i])
            lines[i] = lines[i].lower()
            # Remove short words
            lines[i] = ' '.join([w for w in lines[i].split() if len(w) > 2])
            # Remove stopwords
            lines[i] = ' '.join([w for w in lines[i].split() if w not in stopwords])
            # Remove extra spaces, just for beauty
            re.sub('\s\s+', " ", lines[i])
            lines[i] = " ".join(lines[i].split())

        pre = " ".join(lines)
        pred.append(pre)
    return pred

TASK A

In [None]:
# call to the preprocessing function.
data_pre = t_preprocess(data['tweet'].tolist())
data_pre_test = t_preprocess(data_test_a['tweet'].tolist())
y_true = label_test_a[1].tolist()

In [None]:
# get the vectors for the tweets in the train and test data using count vectorizer.
# here processed tweets are used.
vectorizer = CountVectorizer(ngram_range=(1,2))
vectorizer.fit(data_pre)

train_vectors = vectorizer.transform(data_pre)
test_vectors = vectorizer.transform(data_pre_test)

In [None]:
# training logistic regression model for subtask a on count vectorizer
model = LogisticRegression()
model.fit(train_vectors, data['subtask_a'].tolist())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# testing Logistic regression model on test data of subtask a on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for logistic regression model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.7110804097871795

In [None]:
# training naive bayes model for subtask a on count vectorizer
model = naive_bayes.MultinomialNB()
model.fit(train_vectors, data['subtask_a'].tolist())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# testing naive bayes model on test data of subtask a on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for naive bayes model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.6869907602716496

In [None]:
# training Random Forest model for subtask a on count vectorizer
model = RandomForestClassifier()
model.fit(train_vectors, data['subtask_a'].tolist())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# testing Random Forest model on test data of subtask a on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for Random Forest model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.7209965149443407

In [None]:
# training XGBoost model for subtask a on count vectorizer
model = xgb.XGBClassifier()
model.fit(train_vectors, data['subtask_a'].tolist())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# testing XGBoost model on test data of subtask a on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for XGBoost model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.6457193500475417

In [None]:
# get the vectors for the tweets in the train and test data using tfidf vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit(data_pre)

train_vectors = vectorizer.transform(data_pre)
test_vectors = vectorizer.transform(data_pre_test)

In [None]:
# training logistic regression model for subtask a on tfidf vectorizer
model = LogisticRegression()
model.fit(train_vectors, data['subtask_a'].tolist())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# testing Logistic regression model on test data of subtask a on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for logistic regression model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.6822019869317903

In [None]:
# training naive bayes model for subtask a on tfidf vectorizer
model = naive_bayes.MultinomialNB()
model.fit(train_vectors, data['subtask_a'].tolist())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# testing naive bayes model on test data of subtask a on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for naive bayes model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.500204190619797

In [None]:
# training Random Forest model for subtask a on tfidf vectorizer
model = RandomForestClassifier()
model.fit(train_vectors, data['subtask_a'].tolist())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# testing Random Forest model on test data of subtask a on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for Random Forest model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.718479924886163

In [None]:
# training XGBoost model for subtask a on tfidf vectorizer
model = xgb.XGBClassifier()
model.fit(train_vectors, data['subtask_a'].tolist())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# testing XGBoost model on test data of subtask a on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for XGBoost model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.6418892889669746

In [None]:
# get word embeddings of the tweets using distil bert for training set
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

feat_train = []
for i in tqdm(range(len(data_pre))):
    inputs = tokenizer(data_pre[i], return_tensors="pt")
    outputs = model(**inputs)
    feat.append(outputs)

np.save('distilbert_features.npy', feat_train)

In [None]:
# get word embeddings of the tweets using distil bert for test set
feat_test = []
for i in tqdm(range(len(data_pre_test))):
    inputs = tokenizer(data_pre_test[i], return_tensors="pt")
    outputs = model(**inputs)
    feat.append(outputs)

np.save('distilbert_testa_features.npy', feat_test)

In [None]:
feat = np.load("/content/drive/MyDrive/OLIDv1.0/distilbert_features.npy",allow_pickle=True)
feat_test = np.load("/content/drive/MyDrive/OLIDv1.0/distilbert_testa_features.npy",allow_pickle=True)

In [None]:
train_vectors = []
for i in range(len(feat)):
  train_vectors.append(feat[i][0][0][0].detach().numpy())

test_vectors = []
for i in range(len(feat_test)):
  test_vectors.append(feat_test[i][0][0][0].detach().numpy())

In [None]:
y = pd.factorize(data['subtask_a'])[0]

In [None]:
# define the keras model
model = Sequential()
model.add(Dense(2480, input_dim=768, activation='relu'))
model.add(Dense(1260, activation='relu'))
model.add(Dense(640, activation='relu'))
model.add(Dense(320, activation='relu'))
model.add(Dense(160, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# compile keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 2480)              1907120   
_________________________________________________________________
dense_59 (Dense)             (None, 1260)              3126060   
_________________________________________________________________
dense_60 (Dense)             (None, 640)               807040    
_________________________________________________________________
dense_61 (Dense)             (None, 320)               205120    
_________________________________________________________________
dense_62 (Dense)             (None, 160)               51360     
_________________________________________________________________
dense_63 (Dense)             (None, 64)                10304     
_________________________________________________________________
dense_64 (Dense)             (None, 32)               

In [None]:
# reshape the training embeddings obtained
a = np.array(train_vectors)
train_vec = np.vstack(a)

In [None]:
# train the model
model.fit(train_vec, y, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5d0fb26d10>

In [None]:
# reshape test embeddings
a = np.array(test_vectors)
test_vec = np.vstack(a)

In [None]:
# get test labels in binary form
y_test = pd.factorize(y_true)[0]

In [None]:
# get predictions for test set of subtask a
y_pred = model.predict(test_vec)

In [None]:
res=[]
for prediction in y_pred:
    if prediction[0]<0.5:
        res.append(0)
    else:
        res.append(1)

In [None]:
# calculate f1 score for subtask a
f1_score(y_test, res, average='macro')

0.7116849868947721

In [None]:
# define the keras model
model = Sequential()
model.add(Dense(2480, input_dim=768, activation='relu'))
model.add(Dense(1260, activation='relu'))
model.add(Dense(640, activation='relu'))
model.add(Dense(160, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# compile the model
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [None]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_68 (Dense)             (None, 2480)              1907120   
_________________________________________________________________
dense_69 (Dense)             (None, 1260)              3126060   
_________________________________________________________________
dense_70 (Dense)             (None, 640)               807040    
_________________________________________________________________
dense_71 (Dense)             (None, 160)               102560    
_________________________________________________________________
dense_72 (Dense)             (None, 64)                10304     
_________________________________________________________________
dense_73 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_74 (Dense)             (None, 16)               

In [None]:
# train the model
model.fit(train_vec, y, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5d0b3b9e50>

In [None]:
# get test predictions for subtask a
y_pred = model.predict(test_vec)

In [None]:
res=[]
for prediction in y_pred:
    if prediction[0]<0.5:
        res.append(0)
    else:
        res.append(1)

In [None]:
# get f1 score
f1_score(y_test, res, average='macro')

0.6957808078209416

TASK B

In [None]:
# load data for training
data = pd.read_csv('/content/drive/MyDrive/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
data = data[data['subtask_b'].notna()]

In [None]:
# call to the preprocessing function.
data_pre = t_preprocess(data['tweet'].tolist())
data_pre_test = t_preprocess(data_test_b['tweet'].tolist())
y_true = label_test_b[1].tolist()

In [None]:
# get the vectors for the tweets in the train and test data using count vectorizer.
# here processed tweets are used.
vectorizer = CountVectorizer(ngram_range=(1,2))
vectorizer.fit(data_pre)

train_vectors = vectorizer.transform(data_pre)
test_vectors = vectorizer.transform(data_pre_test)

In [None]:
# training logistic regression model for subtask b on count vectorizer
model = LogisticRegression()
model.fit(train_vectors, data['subtask_b'].tolist())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# testing Logistic regression model on test data of subtask b on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for logistic regression model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.4678492239467849

In [None]:
# training naive bayes model for subtask b on count vectorizer
model = naive_bayes.MultinomialNB()
model.fit(train_vectors, data['subtask_b'].tolist())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# testing naive bayes model on test data of subtask b on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for naive bayes model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.4690265486725663

In [None]:
# training Random Forest model for subtask b on count vectorizer
model = RandomForestClassifier()
model.fit(train_vectors, data['subtask_b'].tolist())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# testing Random Forest model on test data of subtask b on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for Random Forest model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.5607077486272117

In [None]:
# training XGBoost model for subtask b on count vectorizer
model = xgb.XGBClassifier()
model.fit(train_vectors, data['subtask_b'].tolist())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# testing XGBoost model on test data of subtask b on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for XGBoost model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.47019867549668876

In [None]:
# get the vectors for the tweets in the train and test data using tfidf vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit(data_pre)

train_vectors = vectorizer.transform(data_pre)
test_vectors = vectorizer.transform(data_pre_test)

In [None]:
# training logistic regression model for subtask b on tfidf vectorizer
model = LogisticRegression()
model.fit(train_vectors, data['subtask_b'].tolist())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# testing Logistic regression model on test data of subtask b on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for logistic regression model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.4690265486725663

In [None]:
# training naive bayes model for subtask b on tfidf vectorizer
model = naive_bayes.MultinomialNB()
model.fit(train_vectors, data['subtask_b'].tolist())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# testing naive bayes model on test data of subtask b on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for naive bayes model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.47019867549668876

In [None]:
# training Random Forest model for subtask b on tfidf vectorizer
model = RandomForestClassifier()
model.fit(train_vectors, data['subtask_b'].tolist())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# testing Random Forest model on test data of subtask b on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for Random Forest model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.4678492239467849

In [None]:
# training XGBoost model for subtask b on tfidf vectorizer
model = xgb.XGBClassifier()
model.fit(train_vectors, data['subtask_b'].tolist())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# testing XGBoost model on test data of subtask b on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask b for XGBoost model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.4690265486725663

In [None]:
# get word embeddings of the tweets using distil bert for training set
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

feat_train = []
for i in tqdm(range(len(data_pre))):
    inputs = tokenizer(data_pre[i], return_tensors="pt")
    outputs = model(**inputs)
    feat.append(outputs)

np.save('distilbert_features.npy', feat_train)

In [None]:
# get word embeddings of the tweets using distil bert for test set
feat_test = []
for i in tqdm(range(len(data_pre_test))):
    inputs = tokenizer(data_pre_test[i], return_tensors="pt")
    outputs = model(**inputs)
    feat.append(outputs)

np.save('distilbert_testb_features.npy', feat_test)

In [None]:
feat = np.load("/content/drive/MyDrive/OLIDv1.0/distilbert_features.npy",allow_pickle=True)
feat_test = np.load("/content/drive/MyDrive/OLIDv1.0/distilbert_testb_features.npy",allow_pickle=True)

In [None]:
data = pd.read_csv('/content/drive/MyDrive/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
data['bert'] = feat

In [None]:
# remove tweets with Null label
data = data[data['subtask_b'].notna()]

In [None]:
feat = data['bert'].tolist()

In [None]:
train_vectors = []
for i in range(len(feat)):
  train_vectors.append(feat[i][0][0][0].detach().numpy())

test_vectors = []
for i in range(len(feat_test)):
  test_vectors.append(feat_test[i][0][0][0].detach().numpy())

In [None]:
y = pd.factorize(data['subtask_b'])[0]

In [None]:
# define the keras model
model = Sequential()
model.add(Dense(2480, input_dim=768, activation='relu'))
model.add(Dense(1260, activation='relu'))
model.add(Dense(640, activation='relu'))
model.add(Dense(320, activation='relu'))
model.add(Dense(160, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# reshape the vector
a = np.array(train_vectors)
train_vec = np.vstack(a)

In [None]:
# train the model
model.fit(train_vec, y, epochs=25, batch_size=64)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f5d0fb75190>

In [None]:
# reshape test embeddings of task b
a = np.array(test_vectors)
test_vec = np.vstack(a)

In [None]:
# get predictions for task b
y_pred = model.predict(test_vec)

In [None]:
y_test = pd.factorize(y_true)[0]

In [None]:
res=[]
for prediction in y_pred:
    if prediction[0]<0.5:
        res.append(1)
    else:
        res.append(1)

In [None]:
# get f1 score for task b
f1_score(y_test, res, average='macro')

0.12402669632925473

In [None]:
# define the keras model
model = Sequential()
model.add(Dense(2480, input_dim=768, activation='relu'))
model.add(Dense(1260, activation='relu'))
model.add(Dense(640, activation='relu'))
model.add(Dense(160, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# compile the model
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [None]:
# train the model
model.fit(train_vec, y, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5d0b1c7450>

In [None]:
# get predictions for subtask b
y_pred = model.predict(test_vec)

In [None]:
res=[]
for prediction in y_pred:
    if prediction[0]<0.5:
        res.append(1)
    else:
        res.append(0)

In [None]:
# get f1 score of subtask b
f1_score(y_test, res, average='macro')

0.47019867549668876

TASK C

In [None]:
# load data
data = pd.read_csv('/content/drive/MyDrive/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
data = data[data['subtask_c'].notna()]

In [None]:
# call to the preprocessing function.
data_pre = t_preprocess(data['tweet'].tolist())
data_pre_test = t_preprocess(data_test_c['tweet'].tolist())
y_true = label_test_c[1].tolist()

In [None]:
# get the vectors for the tweets in the train and test data using count vectorizer.
# here processed tweets are used.
vectorizer = CountVectorizer(ngram_range=(1,2))
vectorizer.fit(data_pre)

train_vectors = vectorizer.transform(data_pre)
test_vectors = vectorizer.transform(data_pre_test)

In [None]:
# training logistic regression model for subtask c on count vectorizer
model = LogisticRegression()
model.fit(train_vectors, data['subtask_c'].tolist())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# testing Logistic regression model on test data of subtask c on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for logistic regression model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.4940509657517045

In [None]:
# training naive bayes model for subtask c on count vectorizer
model = naive_bayes.MultinomialNB()
model.fit(train_vectors, data['subtask_c'].tolist())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# testing naive bayes model on test data of subtask c on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for naive bayes model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.4072622240561172

In [None]:
# training Random Forest model for subtask c on count vectorizer
model = RandomForestClassifier()
model.fit(train_vectors, data['subtask_c'].tolist())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# testing Random Forest model on test data of subtask c on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for Random Forest model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.4783515323275263

In [None]:
# training XGBoost model for subtask c on count vectorizer
model = xgb.XGBClassifier()
model.fit(train_vectors, data['subtask_c'].tolist())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# testing XGBoost model on test data of subtask c on count vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask a for XGBoost model on count vectorizer
f1_score(y_true, y_pred, average='macro')

0.4549436212841302

In [None]:
# get the vectors for the tweets in the train and test data using tfidf vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit(data_pre)

train_vectors = vectorizer.transform(data_pre)
test_vectors = vectorizer.transform(data_pre_test)

In [None]:
# training logistic regression model for subtask c on tfidf vectorizer
model = LogisticRegression()
model.fit(train_vectors, data['subtask_c'].tolist())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# testing Logistic regression model on test data of subtask c on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for logistic regression model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.43885775825803086

In [None]:
# training naive bayes model for subtask c on tfidf vectorizer
model = naive_bayes.MultinomialNB()
model.fit(train_vectors, data['subtask_c'].tolist())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# testing naive bayes model on test data of subtask c on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for naive bayes model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.21436227224008575

In [None]:
# training Random Forest model for subtask c on tfidf vectorizer
model = RandomForestClassifier()
model.fit(train_vectors, data['subtask_c'].tolist())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# testing Random Forest model on test data of subtask c on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for Random Forest model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.4841138659320477

In [None]:
# training XGBoost model for subtask c on tfidf vectorizer
model = xgb.XGBClassifier()
model.fit(train_vectors, data['subtask_c'].tolist())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# testing XGBoost model on test data of subtask c on tfidf vectorizer
y_pred = model.predict(test_vectors)

In [None]:
# get f1-score for subtask c for XGBoost model on tfidf vectorizer
f1_score(y_true, y_pred, average='macro')

0.45999439304737866

In [None]:
# get word embeddings of the tweets using distil bert for training set
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

feat_train = []
for i in tqdm(range(len(data_pre))):
    inputs = tokenizer(data_pre[i], return_tensors="pt")
    outputs = model(**inputs)
    feat.append(outputs)

np.save('distilbert_features.npy', feat_train)

In [None]:
# get word embeddings of the tweets using distil bert for test set
feat_test = []
for i in tqdm(range(len(data_pre_test))):
    inputs = tokenizer(data_pre_test[i], return_tensors="pt")
    outputs = model(**inputs)
    feat.append(outputs)

np.save('distilbert_testc_features.npy', feat_test)

In [None]:
feat = np.load("/content/drive/MyDrive/OLIDv1.0/distilbert_features.npy",allow_pickle=True)
feat_test = np.load("/content/drive/MyDrive/OLIDv1.0/distilbert_testc_features.npy",allow_pickle=True)

In [None]:
data = pd.read_csv('/content/drive/MyDrive/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
data['bert'] = feat

In [None]:
# remove null values
data = data[data['subtask_c'].notna()]

In [None]:
feat = data['bert'].tolist()

In [None]:
train_vectors = []
for i in range(len(feat)):
  train_vectors.append(feat[i][0][0][0].detach().numpy())

test_vectors = []
for i in range(len(feat_test)):
  test_vectors.append(feat_test[i][0][0][0].detach().numpy())

In [None]:
y = pd.factorize(data['subtask_c'])[0]

In [None]:
# define the keras model
model = Sequential()
model.add(Dense(2480, input_dim=768, activation='relu'))
model.add(Dense(1260, activation='relu'))
model.add(Dense(640, activation='relu'))
model.add(Dense(320, activation='relu'))
model.add(Dense(160, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [None]:
# compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# reshape word embeddings of training set
a = np.array(train_vectors)
train_vec = np.vstack(a)

In [None]:
# train model
model.fit(train_vec, y, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5d14bd9e50>

In [None]:
# reshape test embeddings
a = np.array(test_vectors)
test_vec = np.vstack(a)

In [None]:
# predict labels for task c
y_pred = model.predict(test_vec)

In [None]:
y_test = pd.factorize(y_true)[0]

In [None]:
res=[]
for prediction in y_pred:
  max_value = prediction.argmax(axis=0)
  res.append(max_value)

In [None]:
# get f1 score
f1_score(y_test, res, average='macro')

0.2514942974378857

In [None]:
# define the keras model
model = Sequential()
model.add(Dense(2480, input_dim=768, activation='relu'))
model.add(Dense(1260, activation='relu'))
model.add(Dense(640, activation='relu'))
model.add(Dense(160, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(3, activation='sigmoid'))

In [None]:
# compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [None]:
# train the model
model.fit(train_vec, y, epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f5d0b6e4d50>

In [None]:
# get predictions for task c
y_pred = model.predict(test_vec)

In [None]:
res=[]
for prediction in y_pred:
  max_value = prediction.argmax(axis=0)
  res.append(max_value)

In [None]:
# get f1 score for task c
f1_score(y_test, res, average='macro')

0.15476190476190477