In [1]:
import csv
import pandas as pd

In [2]:
##########################################################################################
# Target date as a 'YYYY-MM-DD' string. In this cell it is only echoed by the final
# summary print, because the date restriction further down is commented out.
processDate = "2016-09-01"

def removeNonASCIICharacters(textString):
    """Return textString with every character whose code point is 128 or higher removed."""
    asciiOnly = [character for character in textString if ord(character) < 128]
    return "".join(asciiOnly)

# Load the raw article dump, then discard duplicated, missing and very short bodies.
# NOTE(review): this is an absolute, machine-specific path, while runParams['input_file']
# (defined in a later cell) names './data/articles.csv' -- confirm which one is intended.
df = pd.read_csv(r'C:\Users\goldm\Capstone\data\articles.csv')
# drop_duplicates returns a new DataFrame; without the assignment the call has no effect.
df = df.drop_duplicates('content')
df = df[~df['content'].isnull()]
# Keep only articles whose body is at least 200 characters long.
df = df[df['content'].str.len()>=200]

# Flag every article whose content begins with the briefing sign-up prompt (the flag is
# stored in the 'NYT summary' column), then keep only the rows that are not flagged.
targetString = "(Want to get this briefing by email?"
df['NYT summary'] = df['content'].map(
    lambda articleText: articleText.startswith(targetString)
)
df = df.loc[df['NYT summary'] == False]

# The following removes a warning that appears in many of the Atlantic articles.
# Since it is commonly at the beginning, it brings a lot of noise to the search for similar articles
# and subsequently to the assessment of sentiment.
# regex=False: the target is literal text, not a pattern (otherwise every '.' would match any
# character, and the default of str.replace differs between pandas versions).
targetString="For us to continue writing great stories, we need to display ads.             Please select the extension that is blocking ads.     Please follow the steps below"
df['content']=df['content'].str.replace(targetString,'',regex=False)

# Drop whole articles that contain any of the following literal markers, in this order:
#   1. The Atlantic Daily newsletter notice
#   2. Politics  Policy Daily newsletter notice
#   3. generic sign-up sentence (for daily summaries with multiple stories contained);
#      it is a substring of the two notices above, so it also covers them
#   4. the daily CNN summary
for targetString in (
    "This article is part of a feature we also send out via email as The Atlantic Daily, a newsletter with stories, ideas, and images from The Atlantic, written specially for subscribers. To sign up, please enter your email address in the field provided here.",
    "This article is part of a feature we also send out via email as Politics  Policy Daily, a daily roundup of events and ideas in American politics written specially for newsletter subscribers. To sign up, please enter your email address in the field provided here.",
    "To sign up, please enter your email address in the field",
    "CNN Student News",
):
    # Literal substring test; '==False' keeps the original treatment of missing values.
    df=df[df['content'].str.contains(targetString,regex=False)==False]

# Report how many articles remain after cleaning, per publisher and per date.
print("\nArticle counts by publisher:", df['publication'].value_counts(), sep="\n")
print("\nArticle counts by date:", df['date'].value_counts(), sep="\n")

# Restrict to articles on the provided input date.
# This date is considered mandatory for topic clustering but is not required for sentiment
# since sentiment only processes a specified list of articles.
# For topic clustering it is essential to have the date as it is
# enormously significant in article matching.
# NOTE(review): the restriction below is commented out, so the "Final dataset" summary
# prints processDate while the frame still holds every date that was loaded.
# if processDate!=None:
#     df=df[df['date']==processDate]
# df.reset_index(inplace=True, drop=True)

# Add an ASCII-only copy of each article body in a separate column.
df['content no nonascii'] = df['content'].map(removeNonASCIICharacters)

print("\nFinal dataset:\n\nDate: " + str(processDate) + " \n")
print(df['publication'].value_counts())


Article counts by publisher:
Breitbart           104
NY Post              61
CNN                  57
Reuters              56
NPR                  54
Washington Post      50
NY Times             50
Atlantic             48
Buzzfeed News        48
Business Insider     41
Guardian             35
National Review      32
Fox News             28
Name: publication, dtype: int64

Article counts by date:
2016-12-02    362
2016-09-01    302
Name: date, dtype: int64

Final dataset:

Date: 2016-09-01 

Breitbart           104
NY Post              61
CNN                  57
Reuters              56
NPR                  54
Washington Post      50
NY Times             50
Atlantic             48
Buzzfeed News        48
Business Insider     41
Guardian             35
National Review      32
Fox News             28
Name: publication, dtype: int64


In [25]:
# Alias, not a copy: article_df and df name the same DataFrame object, so a column added
# through either name is visible through both.
article_df = df

In [12]:
def loadStopWords(stopWordsFileName):
    """Read a stop-word list from a text file that holds one entry per line.

    Parameters
    ----------
    stopWordsFileName : path of the text file to read.

    Returns
    -------
    list of str
        One element per line of the file, in file order, with newline characters
        removed. Blank lines yield empty strings.
    """
    # The context manager closes the file even if reading fails.
    with open(stopWordsFileName,'r') as stopWordsFile:
        return [line.replace('\n', '') for line in stopWordsFile]
# NOTE(review): runParams is defined in a later cell (the one that builds parameterGrid),
# so on a fresh kernel run from top to bottom this line raises NameError.
stop_words = loadStopWords(runParams['stop_words_file'][0])
# NOTE: an alternative to loading a flat file of stop words is to take them from an NLP library, e.g.:
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))

In [36]:
import argparse
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
import csv

# Run configuration. Every value is wrapped in a list so that the whole dict can be
# expanded as a parameter grid; a one-element list means a single setting.
runParams = dict(
    tfidf_maxdf=[0.5],
    input_file=['./data/articles.csv'],
    story_threshold=[0.26],
    process_date=['2016-09-01'],
    parts_of_speech=[['PROPER', 'VERB']],
    lemma_conversion=[False],
    ngram_max=[3],
    tfidf_binary=[False],
    tfidf_norm=['l2'],
    nlp_library=['nltk'],
    max_length=[50],
    stop_words_file=['./data/stopWords.txt'],
    tfidf_mindf=[2],
    display_graph=[True],
    article_stats=[False],
)
# Use parameter grid even if there is only one set of parameters.
# The grid-search cells index it (parameterGrid[0]), take its len() and iterate over it;
# per the scikit-learn documentation each element is a dict mapping a parameter name to
# one single value taken from the corresponding list in runParams.
parameterGrid = ParameterGrid(runParams)

# Tag sets, keyed first by NLP library and then by the coarse part-of-speech name used
# in runParams['parts_of_speech'].
pos_nlp_mapping = {
    'nltk': {
        'VERB': ['VB','VBD','VBG','VBN','VBP','VBZ'],
        'PROPER': ['NNP','NNPS'],
        'COMMON': ['NN','NNS'],
    },
}

# Flatten the tag lists of the requested coarse categories into one list, in request order.
partsOfSpeech = []
for pos in runParams['parts_of_speech'][0]:
    partsOfSpeech.extend(pos_nlp_mapping['nltk'][pos])
print(partsOfSpeech)

import nltk as nl
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance that the grid-search cells pass to preprocessAndVectorize.
wordnet_lemmatizer=WordNetLemmatizer()

# NOTE(review): in the visible cells this global is never read again; stringNLTKProcess
# has a parameter of the same name, which is a separate local variable.
stringToConvert = article_df['content']
# Self-assignments: they change nothing, but they do require both names to exist already
# (partsOfSpeech from this cell, stop_words from the stop-word loading cell).
partsOfSpeech = partsOfSpeech
stop_words = stop_words

['NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


In [23]:
import nltk as nl
from nltk import word_tokenize, sent_tokenize

def stringNLTKProcess(nl, stringToConvert,partsOfSpeech,stop_words,maxWords=None,lemmatizer=None):
    """Reduce a text to a space-joined string of selected, lower-cased tokens.

    Parameters
    ----------
    nl : object providing sent_tokenize, word_tokenize and pos_tag (the nltk module
        in this notebook).
    stringToConvert : the text to process.
    partsOfSpeech : collection of POS tags to keep, or None to keep every token whose
        tag starts with an upper-case ASCII letter (which skips punctuation tags).
    stop_words : collection of lower-case words to drop.
    maxWords : optional cap on the number of tokens in the result; None means no cap.
    lemmatizer : optional object with lemmatize(word, pos=...); when given, each kept
        token is lemmatized as a verb (pos='v').

    Returns
    -------
    str
        The kept tokens, lower-cased (and lemmatized if requested), joined by single
        spaces in their original order.
    """
    # Set membership is O(1); a plain list would be rescanned for every token.
    if isinstance(stop_words, (set, frozenset)):
        stopWordSet = stop_words
    else:
        stopWordSet = frozenset(stop_words)
    # The clitic is emitted by the tokenizer as "'s" (and can be tagged VBZ, so it passes a
    # verb filter). The original compared against "'s'", which never matches that token;
    # both spellings are excluded so nothing that was filtered before slips through.
    cliticTokens = ("'s", "'s'")

    keptWords = []
    for sentence in nl.sent_tokenize(stringToConvert):
        for word, pos in nl.pos_tag(nl.word_tokenize(sentence)):
            if partsOfSpeech is None:
                # No explicit filter: accept any tag that starts with an upper-case letter.
                if not ('A' <= pos[0] <= 'Z'):
                    continue
            elif pos not in partsOfSpeech:
                continue

            wordLower = word.lower()
            if wordLower not in stopWordSet and wordLower not in cliticTokens:
                if maxWords is None or len(keptWords) < maxWords:
                    if lemmatizer is None:
                        keptWords.append(wordLower)
                    else:
                        keptWords.append(lemmatizer.lemmatize(wordLower, pos='v'))
            # Stop as soon as the cap is reached instead of tagging the remaining sentences.
            if maxWords is not None and len(keptWords) == maxWords:
                return ' '.join(keptWords)
    return ' '.join(keptWords)

def removeSpacesAndPunctuation(textString):
    """Return only the ASCII digits and lower-case ASCII letters of textString.

    Everything else is dropped: spaces, punctuation, upper-case letters and any
    non-ASCII character.
    """
    def isKept(character):
        codePoint = ord(character)
        isDigit = 48 <= codePoint <= 57
        isLowerLetter = 97 <= codePoint <= 122
        return isDigit or isLowerLetter

    return "".join(filter(isKept, textString))

In [27]:
# Turn each ASCII-only article body into the space-joined tokens that pass the
# part-of-speech and stop-word filters. No word cap and no lemmatization are applied here.
article_df['input to vectorizer'] = article_df['content no nonascii'].map(
    lambda articleText: stringNLTKProcess(
        nl,
        articleText,
        partsOfSpeech,
        stop_words,
        maxWords=None,
        lemmatizer=None,
    )
)

In [29]:
def preprocessAndVectorize(articleDataFrame, args, pos_nlp_mapping, nlp,nl, wordnet_lemmatizer,stop_words):
    """Fit a TF-IDF vectorizer on the pre-processed article text.

    Parameters
    ----------
    articleDataFrame : DataFrame with an 'input to vectorizer' column of
        space-separated tokens.
    args : dict of run parameters. The keys read are 'ngram_max', 'tfidf_binary',
        'tfidf_maxdf', 'tfidf_mindf' and 'tfidf_norm'. Each value may be a bare scalar
        (as in the dicts the grid-search loop passes) or wrapped in a list whose first
        element is used (as in runParams).
    pos_nlp_mapping, nlp, nl, wordnet_lemmatizer, stop_words : accepted for interface
        compatibility; not used in this function.

    Returns
    -------
    (tfidVectors, terms)
        The matrix returned by fit_transform and the list of feature names, in
        column order.
    """
    def paramValue(key, default=None):
        # Unwrap list-wrapped settings; pass bare scalars through unchanged.
        value = args.get(key, default)
        if isinstance(value, (list, tuple)):
            return value[0] if value else default
        return value

    # Optional TfidfVectorizer settings are forwarded only when the run parameters supply
    # them, so scikit-learn's own defaults apply otherwise. The original referenced an
    # undefined optArgsForVectorizer here.
    # NOTE(review): the mapping below is inferred from the tfidf_* names in runParams -- confirm.
    optArgsForVectorizer = {}
    for argName, vectorizerName in (('tfidf_maxdf', 'max_df'),
                                    ('tfidf_mindf', 'min_df'),
                                    ('tfidf_norm', 'norm')):
        value = paramValue(argName)
        if value is not None:
            optArgsForVectorizer[vectorizerName] = value

    vectorizer = TfidfVectorizer(analyzer='word',
                                 ngram_range=(1,paramValue('ngram_max', 1)),
                                 lowercase=True,
                                 binary=paramValue('tfidf_binary', False),
                                 **optArgsForVectorizer)
    tfidVectors=vectorizer.fit_transform(articleDataFrame['input to vectorizer'])

    # get_feature_names was removed from recent scikit-learn releases in favour of
    # get_feature_names_out; support both and always return a plain list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms=list(vectorizer.get_feature_names_out())
    else:
        terms=vectorizer.get_feature_names()
    return tfidVectors, terms

In [31]:
def initialiseAllNonZeroCoords(tfidVectors):
    """List, for every row of a 2-D matrix, the column indices of its non-zero entries.

    This exists so the caller can compute the coordinates once, outside its loops,
    rather than once per row.

    Parameters
    ----------
    tfidVectors : 2-D matrix exposing .shape and .nonzero() (a scipy sparse matrix or
        a numpy array both qualify).

    Returns
    -------
    list of list
        Element r holds the column indices of the non-zero entries of row r, in the
        order nonzero() reports them. Rows without non-zero entries get an empty list.
    """
    values=[[] for _ in range(tfidVectors.shape[0])]
    # One pass over the coordinates; the previous version rescanned the full coordinate
    # list once per row.
    for row, column in zip(*tfidVectors.nonzero()):
        values[row].append(column)
    return values

In [32]:
def scoreCurrentParamGuess(tfidVectors,storyMap,articleDataFrame,threshold,printErrors=False):
    """Score how well a relatedness threshold reproduces the known story groupings.

    For every story, all articles are scored against the story's first article. An
    article that belongs to the story counts as good when its score is >= threshold,
    and an article outside the story counts as good when its score is <= threshold.
    Every good decision adds one to the total and every bad one subtracts one.

    Distances are taken relative to the first item of each story. That choice is
    arbitrary (the first item could be an outlier) and can be refined later if needed.

    Parameters
    ----------
    tfidVectors : TF-IDF matrix; assumed to hold one row per row of articleDataFrame,
        in the same order -- TODO confirm against productRelatednessScores.
    storyMap : dict mapping a story key to a sequence of article ids; the first id is
        the story's lead article.
    articleDataFrame : DataFrame with an 'id' column.
    threshold : score at or above which an article is treated as part of the story.
    printErrors : when True, print one line per misclassified article.

    Returns
    -------
    dict
        Keys 'score', 'inGood', 'inBad', 'outGood' and 'outBad', accumulated over all
        stories. All zero when storyMap is empty.
    """
    nonZeroCoords=initialiseAllNonZeroCoords(tfidVectors)
    score=0
    outGood=0
    outBad=0
    inGood=0
    inBad=0
    articleIds=articleDataFrame['id']
    for story, storyArticles in storyMap.items():
        # Row position of the story's lead article. Positions (not index labels) are used
        # throughout, because the argsort results below are positions as well; with a
        # default 0..n-1 index both are the same.
        leadArticleIndex=np.flatnonzero((articleIds==storyArticles[0]).values)[0]
        # Compute score of all articles in corpus relative to the first article in the story (dot product),
        # then count through the list relative to the threshold (add one for a good result, subtract one for a bad result).
        scores=productRelatednessScores(tfidVectors,nonZeroCoords,leadArticleIndex)
        rankedIndices=np.argsort(scores)
        # The sorting here is not strictly required, but it could be used so that once the threshold
        # is passed in the loop, the remaining results are inferred.
        for article in reversed(rankedIndices):
            thisArticleIndex=articleIds.iloc[article]
            if thisArticleIndex in storyArticles:
                if scores[article]>=threshold:
                    score+=1
                    inGood+=1
                else:
                    score-=1
                    inBad+=1
                    if printErrors:
                        print("ERROR:",thisArticleIndex,"should be in", story)
            else: # article not supposed to be in range
                if scores[article]<=threshold:
                    score+=1
                    outGood+=1
                else:
                    score-=1
                    outBad+=1
                    if printErrors:
                        print("ERROR:", thisArticleIndex,"should NOT be in", story)
    # Built after the loop so that every story contributes, not just the first one.
    scoreDict={'score':score, 'inGood': inGood, 'inBad': inBad, "outGood": outGood, 'outBad': outBad}
    return scoreDict

In [None]:
# Loop across all parameter combinations in grid to determine best set
# If not doing grid search, will just pass through the loop once
# NOTE(review): the following cell repeats this loop; keep only one of the two.
# NOTE(review): articleDataFrame, nlp and storyMap are not defined in the visible cells
# (the frame built above is called article_df) -- confirm where they come from.
bestParamScoreDict={'score':-1000000}
bestParams=parameterGrid[0]

for i,currentParams in enumerate(parameterGrid):
    if len(parameterGrid)>1:
        print("Combination:", i+1, "of", len(parameterGrid))
        print(currentParams)
        
    # Determine tf-idf vectors
    # terms is just used later on if analysis of final results is requested
    
    tfidVectors, terms=preprocessAndVectorize(articleDataFrame,
                                              currentParams,
                                              pos_nlp_mapping,
                                              nlp,
                                              nl,
                                              wordnet_lemmatizer,
                                              stop_words)
    
    # Computes scores if threshold provided (meaning as part of grid search)
    if 'story_threshold' in currentParams and currentParams['story_threshold']!=None:
        scoreDict=scoreCurrentParamGuess(tfidVectors,storyMap,articleDataFrame,currentParams['story_threshold'])
        print(scoreDict)
        
        # Update best so far ('>=' means a later combination wins a tie)
        if scoreDict['score']>=bestParamScoreDict['score']:
            if len(parameterGrid)>1:
                print(i+1,"is the best so far!")
            bestParams = currentParams
            bestParamScoreDict = scoreDict
    # End grid/parameter loop

In [None]:
# Grid search: evaluate every parameter combination and remember the best-scoring one.
# With a single combination the loop body simply runs once.

bestParamScoreDict = {'score': -1000000}
bestParams = parameterGrid[0]

for i, currentParams in enumerate(parameterGrid):
    if len(parameterGrid) > 1:
        print("Combination:", i+1, "of", len(parameterGrid))
        print(currentParams)

    # Build the tf-idf vectors for this combination.
    # terms is only needed later, if analysis of the final results is requested.
    tfidfVectors, terms = preprocessAndVectorize(
        articleDataFrame,
        currentParams,
        pos_nlp_mapping,
        nlp,
        nl,
        wordnet_lemmatizer,
        stop_words,
    )

    # Scoring only happens when a threshold is supplied (i.e. as part of a grid search).
    if currentParams.get('story_threshold') is None:
        continue

    scoreDict = scoreCurrentParamGuess(
        tfidfVectors, storyMap, articleDataFrame, currentParams['story_threshold']
    )
    print(scoreDict)

    # Record the best so far; on a tie the later combination replaces the earlier one.
    if scoreDict['score'] >= bestParamScoreDict['score']:
        if len(parameterGrid) > 1:
            print(i+1, "is the best so far!")
        bestParams = currentParams
        bestParamScoreDict = scoreDict
# End grid/parameter loop