## This file contains functions to be used in the LDA Reading Recommender code

Created by Patrick Steeves for Independent Study with Professor Kanungo <br>
George Washington University, 12/23/2017

In [None]:
import pandas as pd
import zipfile
from urllib.request import urlretrieve
import time
import matplotlib.pyplot as plt
import numpy as np
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary

Import the data from GitHub as separate zipped files (the code below fetches `articles0.zip` through `articles6.zip`). Combine the files and compute word counts for all articles

In [None]:
def importData():
    """Download, unzip and combine the article CSV files, then add word counts.

    Fetches ``articles0.zip`` .. ``articles6.zip`` from the GitHub data folder
    into the current working directory, extracts each archive there, reads the
    matching ``articlesN.csv`` and concatenates everything into one DataFrame
    with a fresh 0..n-1 index.

    Returns:
        pandas.DataFrame: the combined articles with an added ``word_count``
        column (whitespace-separated word count of ``content``) and the
        columns 'Unnamed: 0', 'id', 'author', 'date', 'year', 'month' and
        'url' removed.
    """
    url = "https://github.com/psteeves/NLP-projects/raw/master/LDA%20Reading%20Recommender/Data/"

    textnums = range(7)   # Number of files to import
    temp = []
    for i in textnums:
        print("Importing and unzipping file {}/{}...".format(i+1, textnums[-1]+1))
        file = 'articles'+str(i)+'.zip'
        urlretrieve(url+file, filename=file)

        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall()

        # Build the CSV name directly rather than by substring replacement.
        csv_file = 'articles'+str(i)+'.csv'
        temp.append(pd.read_csv(csv_file, encoding = 'utf-8'))

    data = pd.concat(temp).reset_index(drop=True)

    print("Computing word counts")
    data['word_count'] = data.content.apply(lambda x: len(x.split()))
    data = data.drop(['Unnamed: 0','id','author','date','year','month','url'],axis=1)
    return data

<br><br>Clean each article by tokenizing and lemmatizing words and filtering stopwords

In [None]:
def cleanData(data,text_column):
    """Tokenize, stopword-filter and lemmatize every text in ``data[text_column]``.

    Each text is lower-cased and split into ``\\w+`` tokens. Tokens of length 1
    and English stopwords are dropped (the stopword test is applied to the raw
    token, before lemmatization), and the remaining tokens are lemmatized.

    Args:
        data: DataFrame holding the texts; it is modified in place.
        text_column: name of the column containing the raw text strings.

    Returns:
        The same DataFrame, with the cleaned token lists in a ``tokens`` column.
    """
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    # Load the stopword list once and keep it as a set: the original evaluated
    # stopwords.words('english') for every single token and then searched a
    # list, which dominated the running time on large corpora.
    stop_words = set(stopwords.words('english'))

    start = time.time()
    print("Starting cleaning...")
    print("Tokenizing...")
    data['tokens'] = [tokenizer.tokenize(text.lower()) for text in data[text_column]]
    print("Lemmatizing and filtering stopwords...")
    data['tokens'] = [
        [lemmatizer.lemmatize(token) for token in text
         if len(token) > 1 and token not in stop_words]
        for text in data['tokens']
    ]

    print("Took {:.0f} minutes to clean texts".format((time.time()-start)/60))
    return data

<br><br>Recognize and add bigrams to articles, such as New York, North Korea, etc.

In [None]:
def addBigrams(df, token_column, min_count=150):
    """Detect frequent bigrams and append them to each document's token list.

    A gensim ``Phrases`` model is trained on the token lists and frozen into a
    ``Phraser``. Every token the phraser emits that contains an underscore is
    appended to the end of the corresponding document's token list; the
    original tokens are kept.

    Args:
        df: DataFrame whose ``token_column`` holds one list of tokens per row.
            The lists are extended in place.
        token_column: name of the column with the token lists.
        min_count: ``min_count`` passed to ``Phrases``. Defaults to 150, the
            value that was previously hard-coded.

    Returns:
        tuple: ``(bigrams, df)`` -- the fitted ``Phraser`` and the same
        DataFrame.
    """
    phrases = Phrases(df[token_column], min_count=min_count)
    bigrams = Phraser(phrases)
    # Iterate over the stored lists themselves instead of looking rows up by
    # the integer labels 0..n-1, which only worked for a default RangeIndex.
    for tokens in df[token_column]:
        # Collect first, then extend, so the list is not modified while the
        # phraser output derived from it is still being read.
        detected = [token for token in bigrams[tokens] if '_' in token]
        tokens.extend(detected)
    return bigrams, df

<br><br>Transform alphabetical tokenized articles into bag-of-words

In [None]:
def createBOW(df,token_column, no_below=150, no_above=0.6):
    """Build a gensim dictionary from the token lists and add a bag-of-words column.

    Args:
        df: DataFrame whose ``token_column`` holds one list of tokens per row.
            A ``bow`` column is added to it in place.
        token_column: name of the column with the token lists.
        no_below: passed to ``Dictionary.filter_extremes``. Defaults to 150,
            the value that was previously hard-coded.
        no_above: passed to ``Dictionary.filter_extremes``. Defaults to 0.6,
            the value that was previously hard-coded.

    Returns:
        tuple: ``(df, dictionary)`` -- the DataFrame with the new ``bow``
        column and the filtered ``Dictionary``.
    """
    dictionary = Dictionary(df.loc[:,token_column])
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    # doc2bow is applied after filtering, so it uses the filtered dictionary.
    df['bow'] = [dictionary.doc2bow(doc) for doc in df[token_column]]
    return df, dictionary

<br><br>Given a series of articles as bags of words, compute the topic probability distribution (referred to below as the PDF) for each article

In [None]:
def getDocTopics(bow_series, model):
    """Return one array of topic probabilities per bag-of-words document.

    Args:
        bow_series: pandas Series whose elements are passed one by one to
            ``model.get_document_topics``.
        model: object exposing ``get_document_topics(bow, minimum_probability=...)``
            that returns a sequence of pairs; the second item of each pair is
            taken as the probability.

    Returns:
        list: one ``numpy.ndarray`` per document, in the order of ``bow_series``.
    """
    def _topic_pairs(bow):
        # A tiny threshold is passed so that low-probability topics are
        # still reported by the model.
        return model.get_document_topics(bow, minimum_probability = 1e-8)

    pairs_per_doc = bow_series.apply(_topic_pairs)

    topics = []
    for pairs in pairs_per_doc:
        probabilities = [pair[1] for pair in pairs]
        topics.append(np.array(probabilities))
    return topics

<br><br>When given a series of numbers, return the positions of the second, third, and fourth smallest values

In [None]:
def smallestNums(series):
    """Return the positions of the 2nd, 3rd and 4th smallest numbers in ``series``.

    Positions are 0-based iteration counts, not index labels. A number equal
    to the current smallest takes over first place (``<=``), whereas a number
    equal to a lower-ranked value does not displace it (``<``). Ranks that
    were never filled, e.g. for inputs with fewer than four items, come back
    as ``None``.

    Args:
        series: iterable of numbers.

    Returns:
        tuple: ``(position2, position3, position4)``.
    """
    empty = (float('inf'), None)
    # ranked[k] is the (value, position) pair currently holding rank k+1.
    ranked = [empty, empty, empty, empty]

    for step, num in enumerate(series):
        if num <= ranked[0][0]:
            slot = 0
        else:
            slot = next((k for k in (1, 2, 3) if num < ranked[k][0]), None)
            if slot is None:
                # Not among the four smallest seen so far.
                continue
        # Insert at its rank and drop whatever fell off the end.
        ranked.insert(slot, (num, step))
        ranked.pop()

    return ranked[1][1], ranked[2][1], ranked[3][1]

<br><br>When fed the index of an article, compute the Euclidean distance between each article's PDF and the given article's PDF. Then return the positions of the second, third, and fourth smallest distances (the smallest distance is the article's distance to itself, which equals 0)

In [None]:
def similarArticles(articlenum):
    """Return the positions of the articles closest to the given one.

    Reads the module-level ``training_data`` DataFrame, computes the Euclidean
    distance between the ``topics`` entry of row ``articlenum`` and the
    ``topics`` entry of every row, and passes those distances to
    ``smallestNums``.

    Args:
        articlenum: 0-based row position of the reference article in
            ``training_data``.

    Returns:
        tuple: the three positions returned by ``smallestNums``.
    """
    # Select the column by name, as the next line does, rather than by the
    # hard-coded position 6, which silently breaks if columns are added,
    # removed or reordered.
    pdf = training_data.topics.iloc[articlenum]
    diff = training_data.topics.apply(lambda x: np.linalg.norm(pdf - x))
    return smallestNums(diff)