In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import argparse
import pandas as pd 
import numpy as np
import time as t
import re
from nltk.stem import PorterStemmer
from sklearn.utils import resample, shuffle
pd.set_option('mode.chained_assignment', None)

import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

import spacy
nlp = spacy.load("en_core_web_sm")

ps = PorterStemmer()

In [3]:

test_df = pd.read_csv('/content/gdrive/MyDrive/Data/test.tsv',sep = "\t", header = 0)

In [4]:
print("Dataset size:", len(test_df))

Dataset size: 859


In [5]:

test_df.columns = ['tweet', 'task']

In [None]:
test_df.head()

Unnamed: 0,tweet,task
0,"#ConstitutionDay is revered by Conservatives, ...",NOT
1,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT
2,#Watching #Boomer getting the news that she is...,NOT
3,#NoPasaran: Unity demo to oppose the far-right...,OFF
4,. . . What the fuck did he do this time?,OFF


In [6]:
def apply_stem(tweet):
	words = [ps.stem(w) for w in tweet.split()]
	return ' '.join(words)

def remove_newline(tweet):
	_tweet = re.sub('\n', '', tweet)
	return _tweet

	
def apply_lemma(tweet):
	doc = nlp(tweet)
	lemma = [token.lemma_ for token in doc]
	return ' '.join(lemma)
 
def remove_stopwords(tweet):
	doc = nlp(tweet)
	stop = [token.text for token in doc if not token.is_stop and not token.is_punct]
	return ' '.join(stop)
 	

def clean_data(tweet):
    splitted_tweet = tweet.lower().split()
    clean_tweet = []
    previous_word = None
    user_count = 0
    for word in splitted_tweet:
        #if word not in spacy_stopwords:
        word = re.sub("[#@]","",word)
        word = re.sub("!"," !",word)
        word = re.sub("[?]"," ?",word)
        
        if(word == "user"):
          user_count += 1
          
        if(word == "user" and previous_word == "user"):
          pass
        else:
          clean_tweet.append(word)
          
        previous_word = word
          
    return " ".join(clean_tweet), user_count

# Calculating number of Global Positioning Entity in a text
def count_gpe(txt):
	return sum([1 for token in nlp(txt).ents if token.label_ == 'GPE'])


# Claculating Number of Organisation in a Text
def count_org(txt):
	return sum([1 for token in nlp(txt).ents if token.label_ == 'ORG'])


# Calculating Number of Sentence in a text
def count_sentence(txt):
	doc = nlp(txt)
	return len([sent.text for sent in doc.sents])


#Extract only Noun and Proper Noun
def extract_noun(tweet):
	doc = nlp(tweet)
	cleaned_doc = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and (token.pos_ == 'NOUN' or token.pos_ == 'PROPN')]
	return ' '.join(cleaned_doc)


# Adding Pos Tag with the corresponding words
def spacy_pos(tweet):
	doc = nlp(tweet)
	cleaned = [token.lemma_ + '_' + token.pos_ for token in doc if not token.is_stop and not token.is_punct]
	return ' '.join(cleaned)


#Normalize the Custom Features
def normalize(df):
	df['sentence_count'] /= df['sentence_count'].max()
	df['gpe_count'] /= df['gpe_count'].max()
	df['org_count'] /= df['org_count'].max()
	
	return df


In [None]:
if __name__ == "__main__":
  test_df = test_df.merge(test_df.tweet.apply(lambda x:pd.Series({'preprocessed':clean_data(x)[0], 'user_count': clean_data(x)[1]})), left_index=True, right_index=True)
  test_df.drop('user_count', inplace=True, axis=1)
  start = t.time()
  test_df['gpe_count'] = [sum([1 for token in nlp(txt).ents if token.label_ == 'GPE']) for txt in test_df['tweet']]
  stop = t.time()
  print("\n Count GPE Time for test set: {}".format(stop - start))
  start = t.time()
  test_df['org_count'] = [sum([1 for token in nlp(txt).ents if token.label_ == 'ORG']) for txt in test_df['tweet']]
  stop = t.time()
  print("\n Count Name_entity Time for test set: {}".format(stop - start))
  start = t.time()
  test_df['sentence_count'] = [len([sent.text for sent in nlp(tweet).sents]) for tweet in test_df['tweet']]
  stop = t.time()
  print("\n Count Sentence for test set: {}".format(stop - start))
  start = t.time()
  test_df['pos_tagged'] = [' '.join([token.lemma_ + '_' + token.pos_ for token in nlp(tweet) if not token.is_stop and not token.is_punct])for tweet in test_df['preprocessed']]
  stop = t.time()
  print("\n Adding Pos Tag to test set: {}".format(stop - start))
  start = t.time()
  test_df['noun'] = [' '.join([token.lemma_ for token in nlp(tweet) if not token.is_stop and not token.is_punct and (token.pos_ == 'NOUN' or token.pos_ == 'PROPN')]) for tweet in test_df['preprocessed']]
  stop = t.time()
  print("\n Noun and Proper Noun Extraction for test set: {}".format(stop - start))
  test_df = normalize(test_df)
  test_df.to_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_custom_test.csv', index=False)



 Count GPE Time for test set: 9.452808856964111

 Count Name_entity Time for test set: 9.230441808700562

 Count Sentence for test set: 8.813258409500122

 Adding Pos Tag to test set: 8.258421659469604

 Noun and Proper Noun Extraction for test set: 7.770461559295654


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
import spacy
from sklearn.preprocessing import LabelBinarizer
nlp = spacy.load("en_core_web_sm")
import argparse
import scipy.sparse as sp
import pickle
import warnings
warnings.filterwarnings("ignore")

In [8]:
def identity(x):
    """Dummy function that just returns the input"""
    return x



def tokenizer(tweet):
    doc = nlp(tweet)
    tokens = [word.text for word in doc]
    return tokens


def get_score(classifier, X_test, Y_test):
    # Given a trained model, predict the label of a new set of data.
    Y_pred = classifier.predict(X_test)
    # Calculates the accuracy score of the trained model by comparing predicted labels with actual labels.
    acc = accuracy_score(Y_test, Y_pred)
    test_preds = pd.DataFrame(Y_pred).to_csv('/content/gdrive/MyDrive/Data/output2.csv')
    print(classification_report(Y_test, Y_pred))

    return acc


def base_model(vec, X_train, Y_train):
    print("Navie Bayes Classification")
    model = MultinomialNB()

    classifier = Pipeline([('vec', vec), ('cls', model)])
    classifier.fit(X_train, Y_train)

    return classifier


def optimize_rf(vec, X_train, Y_train, seed):
    print("Random Forest Classification")
    model = RandomForestClassifier(criterion='gini', n_estimators=233, max_depth=10, max_features=0.064,
                                   random_state=seed)

    classifier = Pipeline([('vec', vec), ('cls', model)])
    classifier.fit(X_train, Y_train)

    return classifier


def optimize_knn(vec, X_train, Y_train):
    print("KNN Classification")
    model = KNeighborsClassifier(n_neighbors=118, weights='uniform', n_jobs=-1)

    classifier = Pipeline([('vec', vec), ('cls', model)])
    classifier.fit(X_train, Y_train)

    return classifier


def optimize_dt(vec, X_train, Y_train):
    print("Decision Tree Classification")
    model = DecisionTreeClassifier(
        splitter='best',
        max_depth=14,
        max_features=0.81,
        criterion='entropy',
        random_state=0
    )

    classifier = Pipeline([('vec', vec), ('cls', model)])
    classifier.fit(X_train, Y_train)

    return classifier


def optimize_svm(vec, X_train, Y_train, seed):
    print("SVM classification")
    if vec is None:
        svm_ = svm.SVC(kernel='linear', C=1.14, random_state=seed)
    else:
        svm_ = Pipeline([('vec', vec), ('cls', svm.SVC(kernel='linear', C=1.14, random_state=seed))])

    svm_.fit(X_train, Y_train)

    return svm_


def custom_feature(row):
    dic = {}
    dic['org_count'] = row['org_count']
    dic['sentence_count'] = row['sentence_count']
    dic['gpe_count'] = row['gpe_count']
    return dic


def ensemble(vec, X_train, Y_train, seed):
    print("Ensemble of Naive Bayes, Random Forest and SVM")

    nb = Pipeline([('vec_cn', vec), ('cls', MultinomialNB())])
    rf = Pipeline([('vec_tf', vec), ('cls', RandomForestClassifier(criterion='gini', n_estimators=233, max_depth=10,
                                                                   max_features=0.064, n_jobs=-1, random_state=seed))])
    svm_ = Pipeline([('vec_tf', vec), ('cls', svm.SVC(kernel='linear', C=1.14, random_state=seed))])

    estimators = [('nb', nb), ('rf', rf), ('svm', svm_)]

    ensemble_classifier = VotingClassifier(estimators, voting='hard')
    classifier = ensemble_classifier.fit(X_train, Y_train)

    return classifier

In [9]:
if __name__ == "__main__":

    """ Below code refactored for the format python LFD_assignment2.py -i <trainset> -ts <testset>.
        Normally, it is used with split_data function to experiment with different classifiers. """

    train = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_train.csv')
    test= pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_custom_test.csv')
    X_train, Y_train = train['preprocessed'], train['task']

    X_test, Y_test = test['preprocessed'], test['task']
    # Create custom features dictionary
    train_dic = [custom_feature(row) for index, row in train.iterrows()]
    test_dic = [custom_feature(row) for index, row in test.iterrows()]

    dic_train_matr = DictVectorizer().fit_transform(train_dic)
    dic_test_matr = DictVectorizer().fit_transform(test_dic)

    # Applying TF-IDF on text
    vec = TfidfVectorizer().fit(train['preprocessed'])

    train_word_mat = vec.transform(train['preprocessed'])
    val_word_mat = vec.transform(test['preprocessed'])

    train_mat = sp.hstack((train_word_mat, dic_train_matr), format='csr')
    val_mat = sp.hstack((val_word_mat, dic_test_matr), format='csr')

    classifier = optimize_svm(None, train_mat, Y_train, 32)

    acc = get_score(classifier, val_mat, Y_test)
    print("\n Accuracy: {}".format(acc))

SVM classification
              precision    recall  f1-score   support

         NOT       0.88      0.69      0.77       620
         OFF       0.48      0.75      0.59       239

    accuracy                           0.71       859
   macro avg       0.68      0.72      0.68       859
weighted avg       0.77      0.71      0.72       859


 Accuracy: 0.7066356228172294
