In [None]:
import string
import re
import nltk
from nltk.corpus import stopwords
from pycontractions import Contractions
from textblob import TextBlob
from autocorrect import Speller 
from spellchecker import SpellChecker
from pycontractions import Contractions
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize # import word_tokenize
import joblib

classifier = joblib.load('resources/classifier.pkl')
tfidfVectorizer = joblib.load('resources/tfidfVectorizer.pkl')
cont = Contractions('resources/GoogleNews-vectors-negative300.bin')

In [206]:
def removePunctuation(text):
    # need to do this as would rather have two words than a single
    # word that is supposed to be hyphenated
    # this is kinda slow
    #text = text.replace('-',' ')
    #return "".join([x for x in text if x not in string.punctuation])
    # new method below is faster ?
    return text.translate(str.maketrans('', '', string.punctuation))

def removeMentionsAndURLs(text):
    mentions = r'@[A-Za-z0-9_]+'
    url1 = r'https?://[^ ]+'
    url2 = r'www.[^ ]+' 
    comb = r'|'.join((mentions, url1))
    comb = r'|'.join((comb, url2))
    return re.sub(comb,'',text)

def makeLowercase(text):
    return text.lower()

def removeStopwords(text): 
    return  ' '.join([x for x in text.split() if x not in stopwords.words('english')])


def expandContractions(text): 
    global cont
    theReturn = list(cont.expand_texts([text], precise=True))[0]
    return theReturn

#nltk.download('youll need to put the lemmatizer here')
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer() 

def lemmatizeString(text):
    theReturn = ''
    words = word_tokenize(text)
    for w in words:
        theReturn += lemmatizer.lemmatize(w) + ' '

    return theReturn

def isNaN(string):
    return string != string

def prepText(text):
    text = expandContractions(text)
    text = removeMentionsAndURLs(text)
    text = removeStopwords(text) #works
    text = removePunctuation(text) #works
    text = makeLowercase(text) #works
    text = lemmatizeString(text)
    return text

def normalizeSentiment(score):
    if (score[0] == 4):
        return 1
    else: 
        return 0
    
# feed this function an individual text string and it will output either a 0 or 4
def sentiScore(text):
    corpus = []
    corpus.append(text)
    x_tfid = tfidfVectorizer.transform(corpus).toarray()
    score = normalizeSentiment(classifier.predict(x_tfid))
    return score

def scoreRawText(text):
    if isNaN(text):
        return 0.5
    else:  
        return sentiScore(prepText(text))

In [207]:
import pandas as pd

In [208]:
nyt_gen = pd.read_csv('nyt_gen.csv',parse_dates=['date'],index_col=['date'])
nyt_covid = pd.read_csv('nyt_covid.csv',parse_dates=['date'],index_col=['date'])

In [209]:
def dailyAverage(df):
    df = df.resample('D').mean()
    return df

In [210]:
nyt_gen = nyt_gen.drop(columns=["Unnamed: 0","id","url"])
nyt_covid = nyt_covid.drop(columns=["Unnamed: 0","id","url"])

In [211]:
nyt_gen.head()

Unnamed: 0_level_0,abstract,headline,lead_paragraph,word_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-21,Mr. Greenwald is accused of being part of a “c...,Glenn Greenwald Charged With Cybercrimes in Br...,RIO DE JANEIRO — Federal prosecutors in Brazil...,1027
2020-01-21,David Sirota’s job is to remind the candidate’...,The Former Journalist Who Is Bernie Sanders’s ...,When the first in-house newsletter of Bernie S...,2036
2020-01-22,"The editor, Philip Jacobson, who works for the...",American Journalist Is Arrested in Indonesia O...,BANGKOK — An American journalist whose work fo...,553
2020-01-22,Pictures of the former Meghan Markle carrying ...,Lawyers for Harry and Meghan Warn British Medi...,"OTTAWA — Prince Harry and his wife, Meghan, ma...",793
2020-01-22,The White House hoped the president’s trip to ...,"Trump, in Davos to Talk Trade, Lashes Out at E...","DAVOS, Switzerland — President Trump said on W...",728


In [212]:
nyt_gen["scA"] = nyt_gen['abstract'].apply(lambda x : scoreRawText(x))
nyt_gen["scH"] = nyt_gen['headline'].apply(lambda x : scoreRawText(x))
nyt_gen["scL"] = nyt_gen['lead_paragraph'].apply(lambda x : scoreRawText(x))

nyt_covid["scA"] = nyt_covid['abstract'].apply(lambda x : scoreRawText(x))
nyt_covid["scH"] = nyt_covid['headline'].apply(lambda x : scoreRawText(x))
nyt_covid["scL"] = nyt_covid['lead_paragraph'].apply(lambda x : scoreRawText(x))

In [213]:
nyt_gen.head()

Unnamed: 0_level_0,abstract,headline,lead_paragraph,word_count,scA,scH,scL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-21,Mr. Greenwald is accused of being part of a “c...,Glenn Greenwald Charged With Cybercrimes in Br...,RIO DE JANEIRO — Federal prosecutors in Brazil...,1027,0.0,1,1.0
2020-01-21,David Sirota’s job is to remind the candidate’...,The Former Journalist Who Is Bernie Sanders’s ...,When the first in-house newsletter of Bernie S...,2036,0.0,1,1.0
2020-01-22,"The editor, Philip Jacobson, who works for the...",American Journalist Is Arrested in Indonesia O...,BANGKOK — An American journalist whose work fo...,553,0.0,0,0.0
2020-01-22,Pictures of the former Meghan Markle carrying ...,Lawyers for Harry and Meghan Warn British Medi...,"OTTAWA — Prince Harry and his wife, Meghan, ma...",793,1.0,1,1.0
2020-01-22,The White House hoped the president’s trip to ...,"Trump, in Davos to Talk Trade, Lashes Out at E...","DAVOS, Switzerland — President Trump said on W...",728,0.0,0,1.0


In [214]:
nyt_gen["scComb"] = nyt_gen["scA"]+nyt_gen["scH"]+nyt_gen["scL"]
nyt_covid["scComb"] = nyt_covid["scA"]+nyt_covid["scH"]+nyt_covid["scL"]

In [215]:
nyt_gen[0:50]

Unnamed: 0_level_0,abstract,headline,lead_paragraph,word_count,scA,scH,scL,scComb
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-21,Mr. Greenwald is accused of being part of a “c...,Glenn Greenwald Charged With Cybercrimes in Br...,RIO DE JANEIRO — Federal prosecutors in Brazil...,1027,0.0,1,1.0,2.0
2020-01-21,David Sirota’s job is to remind the candidate’...,The Former Journalist Who Is Bernie Sanders’s ...,When the first in-house newsletter of Bernie S...,2036,0.0,1,1.0,2.0
2020-01-22,"The editor, Philip Jacobson, who works for the...",American Journalist Is Arrested in Indonesia O...,BANGKOK — An American journalist whose work fo...,553,0.0,0,0.0,0.0
2020-01-22,Pictures of the former Meghan Markle carrying ...,Lawyers for Harry and Meghan Warn British Medi...,"OTTAWA — Prince Harry and his wife, Meghan, ma...",793,1.0,1,1.0,3.0
2020-01-22,The White House hoped the president’s trip to ...,"Trump, in Davos to Talk Trade, Lashes Out at E...","DAVOS, Switzerland — President Trump said on W...",728,0.0,0,1.0,1.0
2020-01-23,The president’s advisers have urged him not to...,"Seeing a Bloomberg Ad on Fox News, Trump Takes...","For weeks, President Trump’s advisers have urg...",1110,1.0,1,0.0,2.0
2020-01-24,The former associate of Rudolph W. Giuliani sa...,Lev Parnas Says He Has Recording of Trump Call...,WASHINGTON — A former associate of Rudolph W. ...,1366,1.0,1,1.0,3.0
2020-01-25,“He shouted at me for about the same amount of...,Pompeo Lashes Out at Reporter and Challenges H...,"Update: On Saturday, Mike Pompeo, the top Amer...",908,1.0,1,1.0,3.0
2020-01-25,"In an extraordinary statement, Secretary of St...","Pompeo Denounces News Media, Undermining U.S. ...",WASHINGTON — Secretary of State Mike Pompeo es...,1112,1.0,1,1.0,3.0
2020-01-25,The authorities had threatened a yearslong pri...,Indonesia Releases U.S. Journalist Detained Ov...,"JAKARTA, Indonesia — An American environmental...",534,0.0,0,1.0,1.0


In [216]:
import numpy as np
nyt_gen['equal'] = np.where(nyt_gen['scA']==nyt_gen['scH'],abs(nyt_gen['scA']),0)
nyt_gen['equal'] = np.where(nyt_gen['equal']==nyt_gen['scL'],1,0)

print('result similarity for senti vs sentiSC %')
print((nyt_gen['equal'].sum())/len(nyt_gen))

result similarity for senti vs sentiSC %
0.565


In [217]:
# and a rolling sum column, which we will also weight
nyt_gen['rolling_sum'] = nyt_gen['scComb'].rolling(3).sum()
nyt_covid['rolling_sum'] = nyt_covid['scComb'].rolling(3).sum()

nytDaily_gen = dailyAverage(nyt_gen)
nytDaily_covid = dailyAverage(nyt_covid)

In [227]:
nytDaily_gen.head()

Unnamed: 0_level_0,scA,scH,scL,scComb,rolling_sum
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-21,0.0,1.0,1.0,2.0,
2020-01-22,0.333333,0.333333,0.666667,1.333333,4.333333
2020-01-23,1.0,1.0,0.0,2.0,6.0
2020-01-24,1.0,1.0,1.0,3.0,6.0
2020-01-25,0.75,0.75,1.0,2.5,7.75


In [228]:
nytDaily_gen.to_csv('nytDaily_gen.csv')
nytDaily_covid.to_csv('nytDaily_covid.csv')