#### Split articles into a dataframe

In [1]:
# Create a list of articles, each beginning with its "URL: ..." line
import re # regular expression 

# Split the raw dump into articles. An article starts at a "URL: http..." line
# and runs until the next such line, or until the end of the file.
# The encoding is passed explicitly so the curly quotes and dashes in the text
# decode the same way on every platform (assumes the dump is UTF-8 - confirm).
with open("nytimes_news_articles.txt", encoding="utf-8") as file:
    condition = "start"  # switches to "change" once the first URL line is seen
    count = 0            # number of lines read from the file
    articles = []        # finished articles, each a list of raw lines
    lines = []           # lines of the article currently being collected
    for line in file:
        # A URL line closes the article collected so far (if any) and opens
        # a new one. re.match only matches at the start of the line.
        if re.match(r"URL:\s+https?", line) is not None:
            if condition == "change":
                articles.append(lines)
            lines = []
            condition = "change"

        # Anything before the first URL line belongs to no article and is skipped.
        if condition == "change":
            lines.append(line)

        # Echo the first 50 lines as a quick look at the raw format.
        if count < 50:
            print(line)
        count += 1

    # The loop only stores an article when the *next* URL line shows up, so the
    # last article in the file has to be stored here or it would be lost.
    if condition == "change":
        articles.append(lines)

    # insert the totals into the "{}" placeholders
    print("lines {}".format(count))
    print("articles {}".format(len(articles)))

URL: http://www.nytimes.com/2016/06/30/sports/baseball/washington-nationals-max-scherzer-baffles-mets-completing-a-sweep.html



WASHINGTON — Stellar pitching kept the Mets afloat in the first half of last season despite their offensive woes. But they cannot produce an encore of their pennant-winning season if their lineup keeps floundering while their pitching is nicked, bruised and stretched thin.

“We were going to ride our pitching,” Manager Terry Collins said before Wednesday’s game. “But we’re not riding it right now. We’ve got as many problems with our pitching as we do anything.”

Wednesday’s 4-2 loss to the Washington Nationals was cruel for the already-limping Mets. Pitching in Steven Matz’s place, the spot starter Logan Verrett allowed two runs over five innings. But even that was too large a deficit for the Mets’ lineup to overcome against Max Scherzer, the Nationals’ starter.

“We’re not even giving ourselves chances,” Collins said, adding later, “We just can’t give our pi

In [2]:
# Peek at the first three articles; each one is a list of raw lines that starts with its "URL: ..." line.
articles[:3]

[['URL: http://www.nytimes.com/2016/06/30/sports/baseball/washington-nationals-max-scherzer-baffles-mets-completing-a-sweep.html\n',
  '\n',
  'WASHINGTON — Stellar pitching kept the Mets afloat in the first half of last season despite their offensive woes. But they cannot produce an encore of their pennant-winning season if their lineup keeps floundering while their pitching is nicked, bruised and stretched thin.\n',
  '“We were going to ride our pitching,” Manager Terry Collins said before Wednesday’s game. “But we’re not riding it right now. We’ve got as many problems with our pitching as we do anything.”\n',
  'Wednesday’s 4-2 loss to the Washington Nationals was cruel for the already-limping Mets. Pitching in Steven Matz’s place, the spot starter Logan Verrett allowed two runs over five innings. But even that was too large a deficit for the Mets’ lineup to overcome against Max Scherzer, the Nationals’ starter.\n',
  '“We’re not even giving ourselves chances,” Collins said, adding 

In [15]:
def clean_text(text):
    """Normalise raw article text into lowercase, space-separated tokens.

    The text is lowercased, newlines become spaces, every character that is
    not a lowercase ASCII letter, a digit or whitespace is removed (so
    punctuation is deleted rather than replaced by a space), runs of
    whitespace collapse to a single space, and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"\n", " "),
        (r"[^a-z0-9\s]", ""),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def make_article(lines):
    """Turn the raw lines of one article into a record of metadata plus cleaned text.

    The first line must be the ``URL: <address>`` header; every following line
    is body text. All metadata is derived from the address, which is expected
    to look like
    ``http(s)://<host>/<year>/<month>/<day>/<category>/[<sub>/...]<title>.html``.

    Parameters
    ----------
    lines : list of str
        Raw lines of a single article, header line first.

    Returns
    -------
    dict
        Keys, in this order: ``url``, ``title``, ``date`` (``"<year>/<month>/<day>"``
        as written in the URL), ``year``, ``month``, ``day`` (ints), ``category``,
        ``subcategories`` (list of path segments between the category and the
        file name, possibly empty), ``tags`` (the title split on ``-``) and
        ``text`` (the body passed through ``clean_text``).

    Raises
    ------
    ValueError
        If ``lines`` is empty, the first line is not a ``URL:`` header, or the
        URL does not follow the layout described above.
    """
    if not lines:
        raise ValueError("make_article() needs at least the 'URL: ...' header line")

    header = re.search(r"URL:\s+(\S+)", lines[0])
    if header is None:
        raise ValueError("first line is not a 'URL: ...' header: {!r}".format(lines[0]))
    url = header.group(1)

    # A single pass over the URL yields the date parts, the category and the
    # (possibly empty) run of "<segment>/" pieces that follow the category.
    parts = re.search(
        r"https?://[^/]+/((\d+)/(\d+)/(\d+))/([^/]+)/((?:[^/]+/)*)",
        url,
    )
    # The dot is escaped so only a real ".html" suffix ends the title; with a
    # bare "." a path segment such as "a-html-primer" would be taken as the title.
    name = re.search(r"/([^/]+)\.html", url)
    if parts is None or name is None:
        raise ValueError("unexpected article URL layout: {!r}".format(url))

    date, year, month, day, category, sub_path = parts.groups()
    title = name.group(1)

    return {
        "url": url,
        "title": title,
        "date": date,
        "year": int(year),
        "month": int(month),
        "day": int(day),
        "category": category,
        # "a/b/".split("/") ends with an empty string, hence the [:-1].
        "subcategories": sub_path.split("/")[:-1],
        "tags": title.split("-"),
        "text": clean_text("".join(lines[1:])),
    }

# Build one metadata/text record per raw article.
articles_proc = [make_article(article_lines) for article_lines in articles]

In [16]:
# Inspect the first three processed records (one dict per article, as returned by make_article).
articles_proc[:3]

[{'url': 'http://www.nytimes.com/2016/06/30/sports/baseball/washington-nationals-max-scherzer-baffles-mets-completing-a-sweep.html',
  'title': 'washington-nationals-max-scherzer-baffles-mets-completing-a-sweep',
  'date': '2016/06/30',
  'year': 2016,
  'month': 6,
  'day': 30,
  'category': 'sports',
  'subcategories': ['baseball'],
  'tags': ['washington',
   'nationals',
   'max',
   'scherzer',
   'baffles',
   'mets',
   'completing',
   'a',
   'sweep'],
  'text': 'washington stellar pitching kept the mets afloat in the first half of last season despite their offensive woes but they cannot produce an encore of their pennantwinning season if their lineup keeps floundering while their pitching is nicked bruised and stretched thin we were going to ride our pitching manager terry collins said before wednesdays game but were not riding it right now weve got as many problems with our pitching as we do anything wednesdays 42 loss to the washington nationals was cruel for the alreadylim

In [17]:
# import relevant packages
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [18]:
# One row per article; the columns come from the keys of the dicts in articles_proc.
df = pd.DataFrame(articles_proc)

In [19]:
# Preview the first five rows of the article table.
df.head()

Unnamed: 0,url,title,date,year,month,day,category,subcategories,tags,text
0,http://www.nytimes.com/2016/06/30/sports/baseb...,washington-nationals-max-scherzer-baffles-mets...,2016/06/30,2016,6,30,sports,[baseball],"[washington, nationals, max, scherzer, baffles...",washington stellar pitching kept the mets aflo...
1,http://www.nytimes.com/2016/06/30/nyregion/may...,mayor-de-blasios-counsel-to-leave-next-month-t...,2016/06/30,2016,6,30,nyregion,[],"[mayor, de, blasios, counsel, to, leave, next,...",mayor bill de blasios counsel and chief legal ...
2,http://www.nytimes.com/2016/06/30/nyregion/thr...,three-men-charged-in-killing-of-cuomo-administ...,2016/06/30,2016,6,30,nyregion,[],"[three, men, charged, in, killing, of, cuomo, ...",in the early morning hours of labor day last y...
3,http://www.nytimes.com/2016/06/30/nyregion/tek...,tekserve-precursor-to-the-apple-store-to-close...,2016/06/30,2016,6,30,nyregion,[],"[tekserve, precursor, to, the, apple, store, t...",it was the apple store in new york city before...
4,http://www.nytimes.com/2016/06/30/sports/olymp...,once-at-michael-phelpss-feet-and-still-chasing...,2016/06/30,2016,6,30,sports,[olympics],"[once, at, michael, phelpss, feet, and, still,...",omaha the united states olympic swimming trial...


In [34]:
# Convert the date strings (e.g. "2016/06/30") into pandas timestamps so they
# can be compared chronologically.
df['date'] = pd.to_datetime(df['date'])

# Report the time span covered by the articles.
newest_date = df["date"].max()
oldest_date = df["date"].min()
print("The latest news was released on ", newest_date)
print("The oldest news was released on ", oldest_date)

The latest news was released on  2016-06-30 00:00:00
The oldest news was released on  2016-02-24 00:00:00
