In [1]:
import nltk

In [2]:
text = """Monticello wasn't designated as UNESCO World Heritage Site until 1987"""

In [4]:
import regex
regex.split("[\s\.\,]", text)

['Monticello',
 "wasn't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987']

# Creating better tokens

In [5]:
nltk.word_tokenize(text)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987']

## Better Stemmers 

In [9]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

plurals = ['caress', "flies", "dies", "mules", "denied", "died" , "agreed" , "owned", "humbled", "sized", "itemization", " siezing", "sensational" , "traditional" , "reference" , "colonizer" , "plotted" ]

for word in plurals:
    print(f"{word} >>> {stemmer.stem(word)}")

caress >>> caress
flies >>> fli
dies >>> die
mules >>> mule
denied >>> deni
died >>> die
agreed >>> agre
owned >>> own
humbled >>> humbl
sized >>> size
itemization >>> item
 siezing >>>  siez
sensational >>> sensat
traditional >>> tradit
reference >>> refer
colonizer >>> colon
plotted >>> plot


In [10]:
from nltk.stem.snowball import SnowballStemmer

SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [11]:
sn_stemmer = SnowballStemmer("english")

In [12]:
sn_stemmer.stem("generously")

'generous'

In [13]:
stemmer.stem("generously")

'gener'

# Lemmatization

In [134]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [138]:
for word in plurals:
    print(f"{word} >>> {lemmatizer.lemmatize(word, pos= 'v') }")
    print(f"{word} >>> {lemmatizer.lemmatize(word)}")

caress >>> caress
caress >>> caress
flies >>> fly
flies >>> fly
dies >>> die
dies >>> dy
mules >>> mules
mules >>> mule
denied >>> deny
denied >>> denied
died >>> die
died >>> died
agreed >>> agree
agreed >>> agreed
owned >>> own
owned >>> owned
humbled >>> humble
humbled >>> humbled
sized >>> size
sized >>> sized
itemization >>> itemization
itemization >>> itemization
 siezing >>>  siezing
 siezing >>>  siezing
sensational >>> sensational
sensational >>> sensational
traditional >>> traditional
traditional >>> traditional
reference >>> reference
reference >>> reference
colonizer >>> colonizer
colonizer >>> colonizer
plotted >>> plot
plotted >>> plotted


# Removing stop words 

In [16]:
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goldm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords

In [23]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'isn', 'ma', 'he', 'so', 'theirs', "weren't", 'herself', 'mustn', 'nor', 'of', 'had', 'or', 're', 'down', "couldn't", 'which', 'itself', 'here', 'his', 'now', 'at', 'are', 'above', 'then', 'll', "won't", 's', 'this', 'off', 'while', 'few', 'my', 'mightn', 'against', 'doing', 'further', "didn't", 'when', "needn't", 'each', 'himself', "you'd", 'm', 'been', 'those', "doesn't", 'about', 'is', 'the', 'some', 'why', "hadn't", 'y', 'do', 'until', 'myself', 'themselves', 'can', 't', 'by', 'aren', "aren't", 'wasn', 'have', 'don', 'should', 'yourself', 'very', 'won', 'am', 'having', 'an', 'they', 'such', 'weren', 'but', "you'll", "should've", 'were', 'haven', 'has', 'be', 'what', 'couldn', 'didn', 'other', 'yourselves', 'after', 'only', 'd', 'not', "mustn't", "wasn't", 'same', "wouldn't", 'below', 'ours', 'does', 'from', 'i', 'hers', 'shouldn', 'into', 'there', 'more', 'out', "hasn't", 'me', 'up', 'and', 'that', 'both', "don't", "you're", 'through', "it's", "you've", 'a', 'than', 'again', 'she'

In [19]:
text = """Welcome you to programming knowledge. Lets start with our first tutorial on NLTK. We shall learn the basics of NLTK here."""

demoWords = ["playing", "happiness" , "going" , "doing" , "yes" , " no" , "I" , "having", "had" , "haved"]

In [29]:
from nltk.tokenize import word_tokenize, sent_tokenize
tokenize_words = word_tokenize(text)
print(words)

['Welcome', 'you', 'to', 'programming', 'knowledge', '.', 'Lets', 'start', 'with', 'our', 'first', 'tutorial', 'on', 'NLTK', '.', 'We', 'shall', 'learn', 'the', 'basics', 'of', 'NLTK', 'here', '.']


In [67]:
from nltk.tokenize import word_tokenize, sent_tokenize
tokenize_words = sent_tokenize(text)
print(words)

['Welcome', 'you', 'to', 'programming', 'knowledge', '.', 'Lets', 'start', 'with', 'our', 'first', 'tutorial', 'on', 'NLTK', '.', 'We', 'shall', 'learn', 'the', 'basics', 'of', 'NLTK', 'here', '.']


In [32]:
tokenize_words_without_stop_words = []
for word in tokenize_words:
    if word not in stop_words:
        tokenize_words_without_stop_words.append(word)
print(set(tokenize_words_without_stop_words))

removed_words = set(tokenize_words) - set(tokenize_words_without_stop_words)
print(removed_words)

{'learn', 'NLTK', 'knowledge', 'shall', 'programming', 'Welcome', 'Lets', 'start', 'tutorial', 'basics', '.', 'We', 'first'}
{'the', 'you', 'here', 'with', 'of', 'on', 'to', 'our'}


In [37]:
##testing lemmatization of words
nltk.pos_tag(tokenize_words_without_stop_words)

[('Welcome', 'JJ'),
 ('programming', 'NN'),
 ('knowledge', 'NN'),
 ('.', '.'),
 ('Lets', 'NNP'),
 ('start', 'VBP'),
 ('first', 'JJ'),
 ('tutorial', 'JJ'),
 ('NLTK', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('shall', 'MD'),
 ('learn', 'VB'),
 ('basics', 'NNS'),
 ('NLTK', 'NNP'),
 ('.', '.')]

In [26]:
import csv
import pandas as pd

In [27]:
##########################################################################################
processDate = "2016-09-01"

def removeNonASCIICharacters(textString): 
    return "".join(i for i in textString if ord(i)<128)

df = pd.read_csv(r'C:\Users\goldm\Capstone\data\articles.csv')
df.drop_duplicates('content')
df = df[~df['content'].isnull()]
df=df[df['content'].str.len()>=200]

targetString="(Want to get this briefing by email?"
df['NYT summary']=df['content'].map(lambda d: d[:len(targetString)]==targetString)
df=df[df['NYT summary']==False]

# The following removes a warning that appears in many of the Atlantic articles.
# Since it is commonly at the beginning, it brings a lot of noise to the search for similar articles
# And subsequently to the assessment of sentiment
targetString="For us to continue writing great stories, we need to display ads.             Please select the extension that is blocking ads.     Please follow the steps below"
df['content']=df['content'].str.replace(targetString,'')

# This is also for some Atlantic articles for the same reasons as above
targetString="This article is part of a feature we also send out via email as The Atlantic Daily, a newsletter with stories, ideas, and images from The Atlantic, written specially for subscribers. To sign up, please enter your email address in the field provided here."
df=df[df['content'].str.contains(targetString)==False]

# This is also for some Atlantic articles for the same reasons as above
targetString="This article is part of a feature we also send out via email as Politics  Policy Daily, a daily roundup of events and ideas in American politics written specially for newsletter subscribers. To sign up, please enter your email address in the field provided here."
df=df[df['content'].str.contains(targetString)==False]

# More Atlantic-specific removals (for daily summaries with multiple stories contained)
df=df[df['content'].str.contains("To sign up, please enter your email address in the field")==False]

# Remove daily CNN summary
targetString="CNN Student News"
df=df[df['content'].str.contains(targetString)==False]

print("\nArticle counts by publisher:")
print(df['publication'].value_counts())

print("\nArticle counts by date:")
print(df['date'].value_counts())

# Restrict to articles on the provided input date.
# This date is considered mandatory for topic clustering but is not required for sentiment
# since sentiment only processes a specified list of articles.
# For topic clustering it is essential to have the date as it is
# enormously significant in article matching.
# if processDate!=None:
#     df=df[df['date']==processDate]
# df.reset_index(inplace=True, drop=True)

# Remove non-ASCII characters
df['content no nonascii']=df['content'].map(lambda x: removeNonASCIICharacters(x))

print("\nFinal dataset:\n\nDate:",processDate,"\n")
print(df['publication'].value_counts())






Article counts by publisher:
Breitbart           104
NY Post              61
CNN                  57
Reuters              56
NPR                  54
NY Times             50
Washington Post      50
Buzzfeed News        48
Atlantic             48
Business Insider     41
Guardian             35
National Review      32
Fox News             28
Name: publication, dtype: int64

Article counts by date:
2016-12-02    362
2016-09-01    302
Name: date, dtype: int64

Final dataset:

Date: 2016-09-01 

Breitbart           104
NY Post              61
CNN                  57
Reuters              56
NPR                  54
NY Times             50
Washington Post      50
Buzzfeed News        48
Atlantic             48
Business Insider     41
Guardian             35
National Review      32
Fox News             28
Name: publication, dtype: int64


In [28]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,NYT summary,content no nonascii
0,3079,20694,"Review: In ‘Independence Day: Resurgence,' the...",NY Times,Manohla Dargis,2016-12-02,2016.0,12.0,,"If you've seen one movie apocalypse, you have ...",False,"If you've seen one movie apocalypse, you have ..."
1,3241,20875,The Agony of the Digital Tease - The NY Times,NY Times,Jessica Bennett,2016-12-02,2016.0,12.0,,There was the breadcrumb dropped on Valentine'...,False,There was the breadcrumb dropped on Valentine'...
2,3336,20986,Fox News's Convention Moment Overshadowed by S...,NY Times,Jim Rutenberg,2016-12-02,2016.0,12.0,,CLEVELAND — This was supposed to be Fox New...,False,CLEVELAND This was supposed to be Fox News...
3,3722,21413,"One Star Over, a Planet That Might Be Another ...",NY Times,Kenneth Chang,2016-09-01,2016.0,9.0,,Another Earth could be circling the star right...,False,Another Earth could be circling the star right...
4,3748,21448,University of Chicago Strikes Back Against Cam...,NY Times,"Richard Pérez-Peña, Mitch Smith and Stephanie ...",2016-09-01,2016.0,9.0,,The anodyne welcome letter to incoming freshme...,False,The anodyne welcome letter to incoming freshme...


In [29]:
df2 = pd.DataFrame([[1,2,3],[5,6,7],[8,9,10]])
df2.head()

Unnamed: 0,0,1,2
0,1,2,3
1,5,6,7
2,8,9,10


In [30]:
#testing the map function
df2[1] = df2[1].map(lambda x: 200*x)
df2.head()

Unnamed: 0,0,1,2
0,1,400,3
1,5,1200,7
2,8,1800,10


In [31]:
def loadStopWords(stopWordsFileName):
	stop_words=[]
	f=open(stopWordsFileName, 'r')
	for l in f.readlines():
		stop_words.append(l.replace('\n', ''))
	return stop_words

In [32]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [33]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer=WordNetLemmatizer()

In [34]:
runParams={'tfidf_maxdf':      [0.5],
           'input_file':       ['./data/articles.csv'],
           'story_threshold':  [0.26],
           'process_date':     ['2016-09-01'],
           'parts_of_speech':  [['PROPER', 'VERB']],
           'lemma_conversion': [False],
           'ngram_max':        [3],
           'tfidf_binary':     [False],
           'tfidf_norm':       ['l2'],
           'nlp_library':      ['nltk'],
           'max_length':       [50],
           'stop_words_file':  ['./data/stopWords.txt'],
           'tfidf_mindf':      [2],
           'display_graph':    [True],
           'article_stats':    [False]}

partsOfSpeech=[]

pos_nlp_mapping = {}
pos_nlp_mapping['nltk']={'VERB':['VB','VBD','VBG','VBN','VBP','VBZ'],'PROPER':['NNP','NNPS'],'COMMON':['NN','NNS']}

for pos in runParams['parts_of_speech'][0]:
    partsOfSpeech.append(pos_nlp_mapping['nltk'][pos])
partsOfSpeech=[item for sublist in partsOfSpeech for item in sublist]
print(partsOfSpeech)

['NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


In [35]:
import nltk as nl
stringToConvert = article_df['content']
partsOfSpeech = partsOfSpeech
stop_words = stop_words
lemmatizer = wordnet_lemmatizer

In [36]:
sentence = 'Hello, I am from England. What is your name?'
import nltk
from nltk import word_tokenize, sent_tokenize
sentences = nltk.sent_tokenize(sentence)
print(sentences)
words = nltk.word_tokenize(sentence)
print(words)

['Hello, I am from England.', 'What is your name?']
['Hello', ',', 'I', 'am', 'from', 'England', '.', 'What', 'is', 'your', 'name', '?']


In [37]:
article_df = df
article_df.head()
# article_df['content'][0]

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,NYT summary,content no nonascii
0,3079,20694,"Review: In ‘Independence Day: Resurgence,' the...",NY Times,Manohla Dargis,2016-12-02,2016.0,12.0,,"If you've seen one movie apocalypse, you have ...",False,"If you've seen one movie apocalypse, you have ..."
1,3241,20875,The Agony of the Digital Tease - The NY Times,NY Times,Jessica Bennett,2016-12-02,2016.0,12.0,,There was the breadcrumb dropped on Valentine'...,False,There was the breadcrumb dropped on Valentine'...
2,3336,20986,Fox News's Convention Moment Overshadowed by S...,NY Times,Jim Rutenberg,2016-12-02,2016.0,12.0,,CLEVELAND — This was supposed to be Fox New...,False,CLEVELAND This was supposed to be Fox News...
3,3722,21413,"One Star Over, a Planet That Might Be Another ...",NY Times,Kenneth Chang,2016-09-01,2016.0,9.0,,Another Earth could be circling the star right...,False,Another Earth could be circling the star right...
4,3748,21448,University of Chicago Strikes Back Against Cam...,NY Times,"Richard Pérez-Peña, Mitch Smith and Stephanie ...",2016-09-01,2016.0,9.0,,The anodyne welcome letter to incoming freshme...,False,The anodyne welcome letter to incoming freshme...


In [44]:
def stringNLTKProcess(nl, stringToConvert,partsOfSpeech,stop_words,maxWords=None,lemmatizer=None):
    #parses the paragraph into sentences
    sentences = nl.sent_tokenize(stringToConvert)
    str = []
    for sentence in sentences:
        wordString=[]
        for word, pos in nl.pos_tag(nl.word_tokenize(sentence)):
            # The following condition avoids any POS which corresponds to punctuation (and takes all others
            if partsOfSpeech == None:
                if pos[0]>='A' and pos[0]<='Z':
                    wordString.append(word)
            elif pos in partsOfSpeech:
                wordString.append(word)
        for wrd in wordString:
            #converts all string characters into lowercase elements
            wrdlower=wrd.lower()
            if wrdlower not in stop_words and wrdlower!="'s'":
                if maxWords==None or len(str)<maxWords:
                    if lemmatizer==None:
                        str.append(wrdlower)
                    else:
                        str.append(lemmatizer.lemmatize(wrd.lower(), pos='v'))
            if maxWords!=None and len(str)==maxWords:
                return ' '.join(str)
    return ' '.join(str)

def removeSpacesAndPunctuation(textString): 
    return "".join(i for i in textString if (ord(i)>=48 and ord(i)<=57) or (ord(i)>=97 and ord(i)<=122))

In [39]:
article_df['input to vectorizer'] = article_df['content no nonascii'].map(lambda x: stringNLTKProcess(nl,x,partsOfSpeech,stop_words,maxWords=None,lemmatizer=wordnet_lemmatizer))

In [40]:
article_df['input to vectorizer'][0]

"'ve see see direct roland emmerich mr. emmerich carve mayan melt scale narrow white house mr. emmerich go go independence day independence day amuse remember white house blow funny contemplate say think mr. emmerich encourage engineer generate boom activate 's resurgence spur need know take cook hinge erupt put include mr. emmerich dean devlin sound much resurgence pop feature lift smith mr. smith decline appear leave independence day bill pullman jeff goldblum give judd hirsch brent spiner wink 're strand crowd give right mr. emmerich manage personalize sprinkle use clean seem try summon dredge feel glance independence day liam hemsworth punch mr. smith earth mr. hemsworth mr. smith independence day sell cut independence day rat parent caution bloodless run"

In [41]:
#testing out TfidVectorizer on sample content
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = 'word', ngram_range = (1,5), lowercase="true", binary="fales", norm="l2")

corpus = ['This is the first document.','This is the second second document.','And the third one.','Is this the first document?'] 
X = vectorizer.fit_transform(corpus)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = 'word', ngram_range = (1,5), lowercase="true", binary="fales", norm="l2")
tfidVectors = vectorizer.fit_transform(article_df['input to vectorizer'])
terms = vectorizer.get_feature_names()

In [43]:
tfidVectors

<664x442537 sparse matrix of type '<class 'numpy.float64'>'
	with 529960 stored elements in Compressed Sparse Row format>

In [49]:
tfidVectors_dense = tfidVectors.todense()
df_tfidfVectors = pd.DataFrame(tfidVectors_dense)
df_tfidfVectors.head()

df.to_csv(r'C:\Users\goldm\Capstone\tracking files\testing_tfidfVectors.csv')