Import newspaper, which is used to download and parse the article, and nltk, which is used to tokenize, clean, and lemmatize the article's text.

In [1]:
import nltk
from newspaper import Article
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

Use the newspaper library to parse the text from the article's URL:


In [2]:
def parseArticle(url):
    """Return the body text that newspaper extracts from the article at `url`.

    Args:
        url: Address of the article, passed straight to `Article`.

    Returns:
        The `text` attribute of the downloaded and parsed article.
    """
    page = Article(url)
    # newspaper requires an explicit download before parse() can run.
    page.download()
    page.parse()
    return page.text

Use NLTK to tokenize, remove stop words, and lemmatize the text:

In [3]:
def preprocessText(text):
    """Tokenize `text`, drop stop words and punctuation, and lemmatize the rest.

    Args:
        text: Raw article text.

    Returns:
        A list of lemmatized tokens, in their original order, with English
        stop words and the punctuation marks listed below removed.
    """
    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords.words('english'))
    punctuation = {'’', '"', '.', '!', '?', ','}
    lemmatizer = WordNetLemmatizer()

    # Build a new list instead of calling remove() on the list being iterated:
    # removing during iteration skips the element that follows each removal,
    # which let consecutive stop words / punctuation slip through.
    # NOTE(review): matching is case-sensitive, so capitalized stop words such
    # as "The" presumably pass through — confirm whether lowercasing is wanted.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words and token not in punctuation
    ]

`extractEmotion(url)` takes a URL string as its argument and calls the helper functions `parseArticle(url)` and `preprocessText(text)`, defined above. It tallies the words in the article that are associated with each emotion in the NRC Emotion Lexicon. To get each emotion score, the function divides that emotion's tally by the number of words in the article that were found in the lexicon. The scores are returned as a list in the order anger, anticipation, joy, trust, fear, surprise, sadness, disgust.

In [4]:
def _loadEmotionLexicon(path='NRC_emotion_lexicon_list.txt'):
    """Read the lexicon file into a nested dict: word -> {category: flag}.

    Each usable line is split on whitespace; the first three fields are taken
    as word, category and flag. Flags are kept as the strings read from the file.
    """
    lexicon = {}
    # `with` closes the file even if reading fails part-way through.
    with open(path, 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()
            # Skip blank or short lines instead of failing with IndexError.
            if len(fields) < 3:
                continue
            lexicon.setdefault(fields[0], {})[fields[1]] = fields[2]
    return lexicon


def extractEmotion(url):
    """Score the article at `url` against the NRC Emotion Lexicon.

    The article is fetched with parseArticle() and tokenized with
    preprocessText(). For every token present in the lexicon, the flag of each
    emotion is added to that emotion's tally; each tally is then divided by
    the number of tokens that were found in the lexicon.

    Args:
        url: Address of the article to analyse.

    Returns:
        A list of eight floats in the order
        [anger, anticipation, joy, trust, fear, surprise, sadness, disgust].

    Raises:
        ZeroDivisionError: If none of the article's tokens are in the lexicon
            (the same exception type the bare division produced before).
    """
    # Order of the returned scores; callers rely on this exact sequence.
    emotion_order = ('anger', 'anticipation', 'joy', 'trust',
                     'fear', 'surprise', 'sadness', 'disgust')

    emoDict = _loadEmotionLexicon()

    text = parseArticle(url)
    lemmatized = preprocessText(text)

    tallies = dict.fromkeys(emotion_order, 0)
    found = 0
    for token in lemmatized:
        entry = emoDict.get(token)
        if entry is None:
            continue
        found += 1
        for emotion in emotion_order:
            tallies[emotion] += int(entry[emotion])

    if found == 0:
        # Make the failure explicit rather than an anonymous "division by zero".
        raise ZeroDivisionError(
            "no words from the article were found in the emotion lexicon")

    return [tallies[emotion] / found for emotion in emotion_order]

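The code below runs `extractEmotion(url)` for each URL in `url_list.txt` (the list taken from the Excel file provided) and appends the resulting emotion scores to the CSV file `testoutput.csv`, one row per URL. If an article cannot be processed, its row contains only the URL.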

In [8]:
# Append one CSV row per URL: the URL followed by its emotion scores.
# `with` guarantees both files are closed even if the loop is interrupted.
with open("url_list.txt", "r") as url_list, open("testoutput.csv", "a") as output:
    for line in url_list:
        url = line.rstrip('\n')
        printline = "\n" + url + ","
        try:
            emotions = extractEmotion(url)
        except Exception as err:
            # Best-effort batch run: a failing article still gets a URL-only
            # row, but the reason is reported instead of silently swallowed.
            # Catching Exception (not a bare except) lets Ctrl-C stop the run.
            print("Could not score " + url + ": " + str(err))
        else:
            printline += "".join(str(score) + "," for score in emotions)
        output.write(printline)



You must `download()` an article first!
