In [14]:
# Mount Google Drive at /content/gdrive; the CSV paths used later in this notebook
# all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [15]:

import pandas as pd 
import numpy as np
import time as t
import re
from nltk.stem import PorterStemmer
from sklearn.utils import resample, shuffle
# Turn off pandas' chained-assignment check for the whole session.
# NOTE(review): this also hides genuine write-to-a-copy mistakes — confirm it is still needed.
pd.set_option('mode.chained_assignment', None)

import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

import spacy
# Load the small English spaCy pipeline once; the helper functions below all call this `nlp`.
nlp = spacy.load("en_core_web_sm")

# Shared stemmer instance used by apply_stem.
ps = PorterStemmer()

In [4]:
def apply_stem(tweet):
    """Stem every whitespace-separated word of `tweet` with the module-level stemmer `ps`.

    Returns the stemmed words joined back together with single spaces.
    """
    return ' '.join(map(ps.stem, tweet.split()))

def remove_newline(tweet):
    """Return `tweet` with every newline character deleted.

    Nothing is inserted in place of the newline, so the text on either side is
    joined directly.
    """
    return re.sub('\n', '', tweet)

	
def apply_lemma(tweet):
    """Run `tweet` through the spaCy pipeline `nlp` and return the token lemmas.

    The lemma of every token (nothing is filtered out) is joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(tweet))
 
def remove_stopwords(tweet):
    """Return the text of `tweet` without stop words and punctuation tokens.

    Tokens come from the spaCy pipeline `nlp`; the surviving token texts are
    joined with single spaces.
    """
    kept = []
    for token in nlp(tweet):
        # Skip anything spaCy flags as a stop word or as punctuation.
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return ' '.join(kept)
 	

def clean_data(tweet):
    """Lower-case and lightly normalise a tweet.

    Per whitespace-separated token:
      * '#' and '@' characters are removed,
      * every '!' and '?' gets a space inserted in front of it,
      * a token equal to "user" that directly follows another "user" token is dropped.

    Returns:
        (cleaned_text, user_count): the surviving tokens joined by single spaces, and
        the number of tokens equal to "user" (counted before consecutive ones are dropped).
    """
    kept_tokens = []
    mentions = 0
    last_token = None

    for raw in tweet.lower().split():
        token = raw.replace("#", "").replace("@", "")
        token = token.replace("!", " !").replace("?", " ?")

        is_user = token == "user"
        if is_user:
            mentions += 1

        # Only a "user" immediately following another "user" is skipped.
        if not (is_user and last_token == "user"):
            kept_tokens.append(token)

        last_token = token

    return " ".join(kept_tokens), mentions

# Count the geopolitical entities (spaCy label GPE: countries, cities, states) in a text
def count_gpe(txt):
    """Return how many named entities spaCy finds in `txt` whose label is 'GPE'."""
    entities = nlp(txt).ents
    return sum(1 for ent in entities if ent.label_ == 'GPE')


# Count the organisations (spaCy label ORG) in a text
def count_org(txt):
    """Return how many named entities spaCy finds in `txt` whose label is 'ORG'."""
    entities = nlp(txt).ents
    return sum(1 for ent in entities if ent.label_ == 'ORG')


# Count the sentences in a text
def count_sentence(txt):
    """Return the number of sentences the spaCy pipeline `nlp` segments `txt` into."""
    return sum(1 for _ in nlp(txt).sents)


#Extract only Noun and Proper Noun
def extract_noun(tweet):
    """Return the lemmas of the nouns and proper nouns in `tweet`, space-separated.

    Stop words and punctuation tokens are skipped; of the rest, only tokens whose
    spaCy part-of-speech tag is 'NOUN' or 'PROPN' are kept.
    """
    wanted_pos = ('NOUN', 'PROPN')
    lemmas = []
    for token in nlp(tweet):
        if token.is_stop or token.is_punct:
            continue
        if token.pos_ in wanted_pos:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)


# Adding Pos Tag with the corresponding words
def spacy_pos(tweet):
    """Return `tweet` as space-separated '<lemma>_<POS>' items.

    Stop words and punctuation tokens are left out; every other token contributes
    its lemma, an underscore, and its spaCy part-of-speech tag.
    """
    tagged = []
    for token in nlp(tweet):
        if not (token.is_stop or token.is_punct):
            tagged.append('{}_{}'.format(token.lemma_, token.pos_))
    return ' '.join(tagged)


#Normalize the Custom Features
def normalize(df):
    """Scale the custom count features by their column maximum.

    The columns 'sentence_count', 'gpe_count' and 'org_count' are each divided by
    their own maximum. The frame is modified in place and also returned.

    A column whose maximum is 0 (e.g. no entity of that kind in any row) or NaN
    (empty frame) is left at its current values instead of being divided, because
    0/0 would turn the whole column into NaN and poison every downstream model
    input. Such a column is only cast to float so the dtype matches the scaled
    columns.

    Args:
        df: DataFrame containing the three count columns.

    Returns:
        The same DataFrame object, with the three columns as floats.

    Raises:
        KeyError: if one of the three columns is missing.
    """
    for column in ('sentence_count', 'gpe_count', 'org_count'):
        peak = df[column].max()
        if pd.notna(peak) and peak != 0:
            df[column] = df[column] / peak
        else:
            # No usable scale: keep the values, just align the dtype.
            df[column] = df[column].astype(float)

    return df


In [None]:
if __name__ == "__main__":
    # Build the custom-feature version of the test set and write it to Drive.
    #
    # NOTE(review): `test_df` is not created anywhere in the visible cells; it has to be
    # loaded (with at least a 'tweet' column) before this cell can run on a fresh kernel.

    # Plain column assignment instead of apply(pd.Series) + merge:
    #   * clean_data runs once per tweet instead of twice,
    #   * no 'user_count' column is built only to be dropped again,
    #   * re-running the cell overwrites 'preprocessed' rather than making the merge
    #     emit suffixed duplicates ('preprocessed_x' / 'preprocessed_y').
    test_df['preprocessed'] = [clean_data(tweet)[0] for tweet in test_df['tweet']]

    # Each step: (output column, input column, extractor, timing message).
    # The extractors are the helper functions defined earlier in the notebook; they
    # contain exactly the expressions that used to be repeated inline here.
    feature_steps = [
        ('gpe_count', 'tweet', count_gpe, "\n Count GPE Time for test set: {}"),
        ('org_count', 'tweet', count_org, "\n Count Name_entity Time for test set: {}"),
        ('sentence_count', 'tweet', count_sentence, "\n Count Sentence for test set: {}"),
        ('pos_tagged', 'preprocessed', spacy_pos, "\n Adding Pos Tag to test set: {}"),
        ('noun', 'preprocessed', extract_noun, "\n Noun and Proper Noun Extraction for test set: {}"),
    ]
    for column, source, extractor, message in feature_steps:
        start = t.time()
        test_df[column] = [extractor(text) for text in test_df[source]]
        stop = t.time()
        print(message.format(stop - start))

    # NOTE(review): normalize() scales by the maxima of this (test) frame. If the training
    # features were scaled by the training maxima, the two sets are on different scales —
    # confirm this is intended.
    test_df = normalize(test_df)
    test_df.to_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_custom_test.csv', index=False)



 Count GPE Time for test set: 11.76517391204834

 Count Name_entity Time for test set: 8.520888805389404

 Count Sentence for test set: 8.366979837417603

 Adding Pos Tag to test set: 8.46169900894165

 Noun and Proper Noun Extraction for test set: 7.821171998977661


In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
import spacy
from sklearn.preprocessing import LabelBinarizer
nlp = spacy.load("en_core_web_sm")
import argparse
import scipy.sparse as sp
import pickle
import warnings
warnings.filterwarnings("ignore")

In [24]:
def identity(x):
    """Return `x` itself, untouched (a no-op placeholder callable)."""
    return x


def get_score(classifier, X_test, Y_test, output_file,
              output_path='/content/gdrive/MyDrive/Data/output_custfeatures.csv'):
    """Evaluate a fitted classifier on a labelled set.

    Prints the scikit-learn classification report and, when requested, writes the
    predictions to a CSV file.

    Args:
        classifier: fitted estimator exposing ``predict``.
        X_test: features accepted by ``classifier.predict``.
        Y_test: true labels for ``X_test``.
        output_file: if truthy, the predictions are saved to ``output_path``.
        output_path: destination CSV for the predictions; defaults to the location
            that was previously hard-coded.

    Returns:
        The accuracy of the predictions against ``Y_test``.
    """
    Y_pred = classifier.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred)

    if output_file:
        # to_csv returns None when given a path, so the result is not bound to a name.
        pd.DataFrame(Y_pred).to_csv(output_path)

    print(classification_report(Y_test, Y_pred))

    return acc


def optimize_svm(vec, X_train, Y_train, seed):
    """Fit and return a linear-kernel SVC (C=1.14, random_state=seed).

    Args:
        vec: optional vectorizer. When given, the returned model is a Pipeline of
            ('vec', vec) followed by ('cls', SVC); when None, the bare SVC is
            fitted on ``X_train`` directly.
        X_train: training inputs passed to ``fit``.
        Y_train: training labels.
        seed: value passed to the SVC as ``random_state``.

    Returns:
        The fitted SVC or Pipeline.
    """
    print("SVM classification")

    model = svm.SVC(kernel='linear', C=1.14, random_state=seed)
    if vec is not None:
        model = Pipeline([('vec', vec), ('cls', model)])

    model.fit(X_train, Y_train)
    return model


def custom_feature(row):
    """Pick the three custom count features out of `row` as a plain dict.

    `row` is anything indexable by column name. The returned dict has the keys
    'org_count', 'sentence_count' and 'gpe_count', in that order.
    """
    feature_names = ('org_count', 'sentence_count', 'gpe_count')
    return {name: row[name] for name in feature_names}


In [23]:
if __name__ == "__main__":
    # Train a linear SVM on bag-of-words + custom count features and score it on the test set.
    #
    # NOTE(review): this cell calls custom_feature / optimize_svm / get_score, but its recorded
    # execution count is lower than that of the cell defining them; on a fresh kernel the
    # definitions must be run first.
    output_file = False   # set True to also write the predictions to CSV (see get_score)
    Tfidf = False         # True -> TF-IDF weights, False -> raw term counts

    train = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_train.csv')
    test = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_custom_test.csv')

    # A tweet whose preprocessed text is empty is read back from CSV as NaN, which the
    # text vectorizers reject; treat it as an empty document instead.
    X_train, Y_train = train['preprocessed'].fillna(''), train['task']
    X_test, Y_test = test['preprocessed'].fillna(''), test['task']

    # Create custom features dictionary
    train_dic = [custom_feature(row) for index, row in train.iterrows()]
    test_dic = [custom_feature(row) for index, row in test.iterrows()]

    # Fit the feature -> column mapping on the training data only and reuse it for the
    # test data, so both matrices are guaranteed to share the same column layout.
    dict_vec = DictVectorizer()
    dic_train_matr = dict_vec.fit_transform(train_dic)
    dic_test_matr = dict_vec.transform(test_dic)

    # Word features: vocabulary learned from the training text only.
    vec = TfidfVectorizer() if Tfidf else CountVectorizer()
    train_word_mat = vec.fit_transform(X_train)
    val_word_mat = vec.transform(X_test)

    # Word columns first, custom-feature columns appended on the right.
    train_mat = sp.hstack((train_word_mat, dic_train_matr), format='csr')
    val_mat = sp.hstack((val_word_mat, dic_test_matr), format='csr')

    # The matrices are already vectorised, so no vectorizer is passed to the SVM helper.
    classifier = optimize_svm(None, train_mat, Y_train, 32)

    acc = get_score(classifier, val_mat, Y_test, output_file)
    print("\n Accuracy: {}".format(acc))

SVM classification
              precision    recall  f1-score   support

         NOT       0.82      0.69      0.75       647
         OFF       0.56      0.71      0.62       352

    accuracy                           0.70       999
   macro avg       0.69      0.70      0.69       999
weighted avg       0.72      0.70      0.70       999


 Accuracy: 0.6976976976976977
