In [27]:
import pandas as pd
import numpy as np
import random 



np.random.seed(42)
random.seed(42)

## text cleaning
import re
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import nltk
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords' )

import string
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Stop-word sources: NLTK's English list and scikit-learn's built-in English list.
stop_words_en = stopwords.words('english')
stop_words_skt = text.ENGLISH_STOP_WORDS

# One lookup set: both stop-word lists plus every single punctuation character.
combined_stopwords = set(stop_words_en) | set(punctuation) | set(stop_words_skt)

In [28]:
# Load the dataset from CSV, then hold out 20% of the rows as the test split.
filename = "Project_1_dataset_01_01_2022.csv"
df = pd.read_csv(filename)

train, test = train_test_split(df, random_state=42, test_size=0.2)

In [29]:
def clean(text):
    '''
    Strip HTML artefacts and other crawler noise from a raw document string.

    Steps, in order:
      - drop every line that starts with an http(s) URL;
      - turn ``<br />``, newlines and carriage returns into spaces;
      - decode the ``&quot;``, ``&#39;`` and ``&amp;`` entities
        (``&amp;`` becomes the word "and");
      - expand the stand-alone abbreviation " u " to " you ";
      - remove backticks and collapse runs of "!" or "?" to one character;
      - drop non-ASCII characters and any remaining ``<...>`` tags;
      - collapse runs of spaces into a single space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text (possibly the empty string).
    '''
    # Lines that begin with a URL are removed whole, including their line break.
    cleaned = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    cleaned = re.sub(r"<br />", " ", cleaned)
    cleaned = re.sub(r"&quot;", "\"", cleaned)
    # &#39; is the HTML entity for an apostrophe, not for a double quote.
    cleaned = re.sub('&#39;', "'", cleaned)
    cleaned = re.sub(r'[\r\n]', " ", cleaned)
    cleaned = re.sub(' u ', " you ", cleaned)
    cleaned = re.sub('`', "", cleaned)
    cleaned = re.sub(r"(!)\1+", r"!", cleaned)
    cleaned = re.sub(r"(\?)\1+", r"?", cleaned)
    cleaned = re.sub('&amp;', 'and', cleaned)
    cleaned = cleaned.encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'<.*?>', '', cleaned)
    # Collapse spaces last, so gaps left by removed tags and line breaks are
    # normalised as well.
    cleaned = re.sub(' +', ' ', cleaned)
    return cleaned

def get_tokens(doc: str):
    '''
    Tokenise a document after cleaning; no stemming or lemmatization is applied.

    The text is passed through ``clean``, stripped of ASCII punctuation,
    tokenised with NLTK and lower-cased. Purely numeric tokens, tokens found
    in ``combined_stopwords`` and single-character tokens are discarded.
    The surviving tokens are returned, in order, as a generator.
    '''
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    stripped = clean(doc).translate(punctuation_stripper)

    kept = []
    for raw_token in nltk.word_tokenize(stripped):
        token = raw_token.lower()
        if token.isdigit() or token in combined_stopwords or len(token) <= 1:
            continue
        kept.append(token)
    return (token for token in kept)


def data_to_feature_mapping(data: pd.DataFrame, embeddings_dict: dict, embed_size = 300):
    '''
    Build the document-embedding feature matrix for a data set.

    Each row's ``full_text`` is tokenised with ``get_tokens`` and reduced to a
    single vector by ``get_document_vector``.

    Parameters
    ----------
    data : pd.DataFrame
        Data set; must contain a ``full_text`` column.
    embeddings_dict : dict
        Mapping from word to its embedding vector of length ``embed_size``.
    embed_size : int
        Dimensionality of the embeddings (number of features p).

    Returns
    -------
    np.ndarray
        Array of shape (n, embed_size), where n is the number of rows in
        ``data``; rows follow the order of ``data``. An empty ``data`` gives
        an array of shape (0, embed_size).
    '''
    # Collect the rows in a list and stack once: calling np.vstack inside the
    # loop copies the whole matrix again for every document.
    doc_vectors = [
        get_document_vector(get_tokens(doc), embeddings_dict, embed_size)
        for doc in data['full_text']
    ]
    if not doc_vectors:
        return np.zeros((0, embed_size))
    return np.vstack(doc_vectors)

def get_document_vector (doc_keywords:list, embeddings_dict: dict, embed_size = 300 ):
    '''
    Average the embeddings of a document's keywords into one document vector.

    Parameters
    ----------
    doc_keywords : iterable of str
        Keywords representing the document; it is iterated exactly once, so a
        generator is accepted as well as a list.
    embeddings_dict : dict
        Mapping from word to its embedding vector of length ``embed_size``.
    embed_size : int
        Dimensionality of the embeddings.

    Returns
    -------
    np.ndarray
        1-D array of length ``embed_size`` holding the mean embedding of the
        keywords present in ``embeddings_dict``. If no keyword is present
        (including an empty document) the zero vector is returned, rather
        than the NaN vector a division by zero would produce.
    '''
    representation = np.zeros(embed_size)
    count = 0
    for word in doc_keywords:
        try:
            embed = embeddings_dict[word]
        except KeyError:
            # Out-of-vocabulary word: it contributes nothing to the average.
            continue
        # Kept outside the try block so that a genuine problem (e.g. a vector
        # of the wrong length) is raised instead of being silently counted.
        representation = np.add(representation, embed)
        count += 1
    if count == 0:
        return representation
    return representation / count


In [30]:
embed_size = [50,100,200,300]


def load_glove_embeddings(path):
    '''
    Read a GloVe text file into a dict mapping each word to its float32 vector.

    Every non-blank line is split on whitespace: the first field is the word,
    the remaining fields are the vector components.
    '''
    embeddings = {}
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which differs between operating systems.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # skip blank lines instead of failing on values[0]
            embeddings[values[0]] = np.asarray(values[1:], "float32")
    return embeddings


# Logistic regression without a penalty term; the same estimator object is
# re-fitted from scratch for every embedding size.
# NOTE(review): scikit-learn >= 1.2 deprecates penalty='none' in favour of
# penalty=None -- confirm against the installed version before changing it.
clf = LogisticRegression(penalty = 'none',max_iter=10000, random_state=42)
label_encoder = LabelEncoder()

## label encoding (fitted on the training labels only)
y_train = label_encoder.fit_transform(train['root_label'])
y_test = label_encoder.transform(test['root_label'])

## report train and test accuracy for each embedding size
accuracy = []
accuracy_train = []
for size in embed_size:

    embeddings_dict = load_glove_embeddings("glove/glove.6B."+ str(size)+"d.txt")

    X_train = data_to_feature_mapping(train, embeddings_dict, size) ## create feature set using the embeddings
    model = clf.fit(X_train, y_train)

    X_test = data_to_feature_mapping(test,embeddings_dict, size)
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)
    accuracy.append(metrics.accuracy_score(y_test, y_test_pred))
    accuracy_train.append(metrics.accuracy_score(y_train, y_train_pred))

In [31]:
## test-set accuracy, one value per embedding size in embed_size (same order)
accuracy

[0.9566265060240964,
 0.9518072289156626,
 0.9662650602409638,
 0.9662650602409638]

In [32]:
## training-set accuracy, one value per embedding size in embed_size (same order)
accuracy_train

[0.9480989740494871,
 0.9837054918527459,
 0.9981894990947495,
 0.9981894990947495]