In [5]:
# Written By: Pratyush Singh

"""
This jupyter notebook scrapes the New York Times reviews, 
utilizes Extractive Text Summarization to summarize the text and return to the user 
"""


'\nThis jupyter notebook scrapes the New York Times reviews, \nutilizes Extractive Text Summarization to summarize the text and return to the user \n'

In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
'''
When scraping the links to the article (code for scraping in scrape.py), 
if a link was found then the text was placed with a dummy value of 'testing'.
'''

movies = pd.read_csv('nytimes.csv')
movies.head()

Unnamed: 0.1,Unnamed: 0,movie_title,article_link,text
0,0,Escape Room,http://www.nytimes.com/2019/01/03/movies/escap...,testing
1,1,Rust Creek,http://www.nytimes.com/2019/01/03/movies/rust-...,testing
2,2,American Hangman,-1,
3,3,A Dogs Way Home,-1,
4,4,The Upside,http://www.nytimes.com/2019/01/09/movies/the-u...,testing


In [8]:
movies.drop(columns=['Unnamed: 0'], inplace=True)

In [9]:
movies_links = movies[movies['article_link'] != "-1"]

In [10]:
movies_links.head()

Unnamed: 0,movie_title,article_link,text
0,Escape Room,http://www.nytimes.com/2019/01/03/movies/escap...,testing
1,Rust Creek,http://www.nytimes.com/2019/01/03/movies/rust-...,testing
4,The Upside,http://www.nytimes.com/2019/01/09/movies/the-u...,testing
10,The Kid Who Would Be King,http://www.nytimes.com/2019/01/24/movies/the-k...,testing
11,Serenity,http://www.nytimes.com/2019/01/24/movies/seren...,testing


In [27]:
# iterate through the links and scrape the text
def scrape_text(link):
    print(link)
    review = requests.get(link).text
    soup = BeautifulSoup(review, 'html.parser')

    body = soup.find("section", {"name": "articleBody"})
    p_tags = body.find_all("p")

    text = [tag.text for tag in p_tags]
    s = ' '
    text = s.join(text)
    
    return text

In [28]:
movies_links['text'] = movies_links['article_link'].apply(lambda x: scrape_text(x))

http://www.nytimes.com/2019/01/03/movies/escape-room-review.html
http://www.nytimes.com/2019/01/03/movies/rust-creek-review.html
http://www.nytimes.com/2019/01/09/movies/the-upside-review.html
http://www.nytimes.com/2019/01/24/movies/the-kid-who-would-be-king-review.html
http://www.nytimes.com/2019/01/24/movies/serenity-review.html
http://www.nytimes.com/2019/01/31/movies/miss-bala-review.html
http://www.nytimes.com/2019/01/30/movies/velvet-buzzsaw-review.html
http://www.nytimes.com/2019/01/31/movies/piercing-review.html
http://www.nytimes.com/2019/02/06/movies/the-lego-movie-two-the-second-part-review.html
http://www.nytimes.com/2019/02/07/movies/what-men-want-review.html
http://www.nytimes.com/2019/02/05/movies/cold-pursuit-review.html
http://www.nytimes.com/2019/02/07/movies/high-flying-bird-review.html
http://www.nytimes.com/2019/02/07/movies/the-prodigy-review.html
http://www.nytimes.com/2019/02/14/movies/happy-death-day-2u-review.html
http://www.nytimes.com/2019/02/12/movies/alit

http://www.nytimes.com/2019/09/19/movies/rambo-last-blood-review.html
http://www.nytimes.com/2019/09/26/movies/abominable-review.html
http://www.nytimes.com/2019/09/25/movies/the-laundromat-review.html
http://www.nytimes.com/2019/11/07/movies/playing-with-fire-review.html
http://www.nytimes.com/2019/10/03/movies/joker-review.html
http://www.nytimes.com/2019/10/03/movies/lucy-in-the-sky-review.html
http://www.nytimes.com/2019/10/02/movies/dolemite-is-my-name-review.html
http://www.nytimes.com/2019/10/03/movies/in-the-tall-grass-review.html
http://www.nytimes.com/2019/10/10/movies/little-monsters-review.html
http://www.nytimes.com/2019/10/09/movies/gemini-man-review.html
http://www.nytimes.com/2019/10/10/movies/the-addams-family-review.html
http://www.nytimes.com/2019/11/06/movies/the-kingmaker-review.html
http://www.nytimes.com/2019/10/11/movies/jexi-review.html
http://www.nytimes.com/2019/10/17/movies/maleficent-mistress-of-evil-review.html
http://www.nytimes.com/2019/10/16/movies/zomb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
movies_links.head()

Unnamed: 0,movie_title,article_link,text
0,Escape Room,http://www.nytimes.com/2019/01/03/movies/escap...,I tend to esteem motion pictures more for thei...
1,Rust Creek,http://www.nytimes.com/2019/01/03/movies/rust-...,"When horror movies head for the woods, their t..."
4,The Upside,http://www.nytimes.com/2019/01/09/movies/the-u...,What a difference a cast makes. If the directo...
10,The Kid Who Would Be King,http://www.nytimes.com/2019/01/24/movies/the-k...,"In 2011, the British writer-comedian Joe Corni..."
11,Serenity,http://www.nytimes.com/2019/01/24/movies/seren...,"I’m no actor, but I’d like to think if a scrip..."


In [30]:
movies_links.to_csv('movies_text.csv')

## Begin the Text Summarization Process

In [32]:
movies = pd.read_csv('movies_text.csv')
reviews = movies['text']

In [47]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [128]:
# 1) remove the stopwords
# 2) stem the words
# 3) make a frequency table 

def preprocess(review):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(review)    
    # stemmer = PorterStemmer()
    
    word_frequency = {}
    
    for word in words:
        word = word.lower()
        # word_stemmed = stemmer.stem(word)
        
        if word not in stop_words and word not in string.punctuation:
            if word in word_frequency.keys():
                word_frequency[word] = word_frequency[word] + 1
            else:
                word_frequency[word] = 1
    
    word_frequency = _normalize_table(word_frequency)
    
    return word_frequency


def _normalize_table(word_frequency):
    max_frequency = max(word_frequency.values())
    
    for word, freq in word_frequency.items():
        normalized_freq = round(freq / max_frequency, 2)
        word_frequency[word] = normalized_freq
    
    return word_frequency

In [117]:
def sentence_weight(review, word_frequency):
    sentence_scores = {}
    for sentence in nltk.sent_tokenize(review):
        for word in nltk.word_tokenize(sentence):
            if word in word_frequency.keys():
                if sentence in sentence_scores.keys():
                    sentence_scores[sentence] += word_frequency[word]
                else:
                    sentence_scores[sentence] = word_frequency[word]
    
    return sentence_scores

In [120]:
# sort the sentence weights, and I only want the top two sentences.
import heapq

def summarizer(review):
    word_frequency = preprocess(review)
    sentence_scores = sentence_weight(review, word_frequency)
    top_two_sentences = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)
    
    s = ' '
    s = s.join(top_two_sentences)
    
    return s

In [130]:
words_freq = preprocess(movies[movies['movie_title'] == 'Uncut Gems']['text'].values[0])

In [133]:
import heapq

sorted_word_weights = heapq.nlargest(50, words_freq, key=words_freq.get)

sorted_word_weights

['’',
 'howard',
 '—',
 'safdies',
 '“',
 '”',
 'like',
 'uncut',
 'gems',
 'sandler',
 'man',
 'one',
 'stop',
 'two',
 'working',
 'much',
 'life',
 'movie',
 'men',
 'lots',
 'though',
 'opal',
 'garnett',
 'world',
 'feeling',
 'bad',
 'gambler',
 'makes',
 'also',
 'seem',
 'enough',
 'love',
 'stuff',
 'amid',
 'happens',
 'bedlam',
 'shooting',
 'almost',
 'ethiopia',
 'around',
 'pointless',
 'opener',
 'echoes',
 'exorcist',
 'dig',
 'relic',
 'possession',
 'plans',
 'short',
 'minutes']

In [100]:
movies['review_summary'] = movies['text'].apply(lambda review: summarizer(review)) 

In [105]:
movies.to_csv('movies_with_reviews.csv')

In [106]:
movies['review_summary']

0      There are intimations of “Tales From the Crypt...
1      Well-acted and technically sound (Michelle Law...
2      If the director Neil Burger’s decision to have...
3      It’s taken eight years for Cornish to release ...
4      It’s also possible that he read the script — w...
5      Directed by Catherine Hardwicke (“Thirteen,” “...
6      Gyllenhaal and Russo also starred in Gilroy’s ...
7      A grisly comedic thriller written and directed...
8      In this respect, this sales pitch fits in with...
9      Part of the message, of course, is that it doe...
10     After members of one of these outfits inject N...
11     On a basketball court, “give me the rock” mean...
12     If this isn’t evidence enough of Miles’ bad-se...
13     But the film is overstuffed with unfunny self-...
14     Some of this crash-boom stuff takes place duri...
15     As the movie opens, Berk is, in Hiccup’s estim...
16     Even when the jokes are as blunt as the bowlin...
17     “Christmas won’t be Chri

In [69]:
word_frequency = preprocess(movies_links.iloc[0]['text'])
sentence_scores = sentence_weight(movies_links.iloc[0]['text'], word_frequency)

In [70]:
sentence_scores

{'I tend to esteem motion pictures more for their aesthetic value than for their use value but sometimes there are exceptions.': 1.7000000000000002,
 'Through scrupulous and heightened simulations of terrifying reality, last year’s “First Man” reminded me why I never even entertained the notion of becoming an astronaut.': 5.249999999999998,
 'Taking the opposite tack with an irrational but not altogether implausible conceit, “Escape Room” reminds me why I’ll never engage in the newfangled form of entertainment in which you allow yourself to be “trapped” in a room and puzzle-solve your way out of it.': 7.829999999999998,
 'The conceit is that this movie’s game masters absolutely intend to kill the six invitees who at first find themselves in a waiting area that turns into a people-cooking oven.': 3.6900000000000013,
 'The players, mostly adult but still “Breakfast Club”-ish, include a female war veteran, an overachieving but friendless collegian, a dirtbag grocery stock boy and a too-pr

In [73]:
import heapq

sorted_ = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)

In [74]:
sorted_

['There are intimations of “Tales From the Crypt,” “Final Destination,” “The Game,” and other older, better films here; this movie never catches a fire like any of those did, and even its twist coda feels dreary and pro forma.',
 'Taking the opposite tack with an irrational but not altogether implausible conceit, “Escape Room” reminds me why I’ll never engage in the newfangled form of entertainment in which you allow yourself to be “trapped” in a room and puzzle-solve your way out of it.']

In [76]:
movies_links.iloc[0]['article_link']

'http://www.nytimes.com/2019/01/03/movies/escape-room-review.html'

#### Formatting the Movie CSV

In [110]:
movies_with_review = pd.read_csv('movies_with_reviews.csv')
movies_with_review.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,movie_title,article_link,text,review_summary
0,0,0,Escape Room,http://www.nytimes.com/2019/01/03/movies/escap...,I tend to esteem motion pictures more for thei...,There are intimations of “Tales From the Crypt...
1,1,1,Rust Creek,http://www.nytimes.com/2019/01/03/movies/rust-...,"When horror movies head for the woods, their t...",Well-acted and technically sound (Michelle Law...
2,2,4,The Upside,http://www.nytimes.com/2019/01/09/movies/the-u...,What a difference a cast makes. If the directo...,If the director Neil Burger’s decision to have...
3,3,10,The Kid Who Would Be King,http://www.nytimes.com/2019/01/24/movies/the-k...,"In 2011, the British writer-comedian Joe Corni...",It’s taken eight years for Cornish to release ...
4,4,11,Serenity,http://www.nytimes.com/2019/01/24/movies/seren...,"I’m no actor, but I’d like to think if a scrip...",It’s also possible that he read the script — w...


In [111]:
movies_with_review.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'movie_title', 'article_link', 'text',
       'review_summary'],
      dtype='object')

In [113]:
movies_with_review.drop(['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, axis=1)

In [115]:
movies_with_review.to_csv('movies_final.csv', index=False)