In [39]:
##importing
import pandas as pd
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

In [40]:
# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url, timeout=30):
    '''Return the paragraph texts of one scrapsfromtheloft.com post.

    Parameters
    ----------
    url : str
        Address of the post to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the cell forever.

    Returns
    -------
    list of str
        Text of every <p> element found inside the element whose class is
        "post-content", in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no element with class "post-content".
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    content = soup.find(class_="post-content")
    if content is None:
        raise ValueError("no 'post-content' element found at " + url)
    text = [p.text for p in content.find_all('p')]
    # Echo the URL so progress is visible while several pages are fetched.
    print(url)
    return text

In [41]:
# Pages to download, one per movie (four Pauline Kael reviews)
urls = [
    "https://scrapsfromtheloft.com/2020/02/07/pretty-poison-review-pauline-kael/",
    "https://scrapsfromtheloft.com/2017/06/30/planet-apes-1968-review-pauline-kael/",
    "https://scrapsfromtheloft.com/2018/02/17/back-to-the-future-pauline-kael/",
    "https://scrapsfromtheloft.com/2018/01/12/quest-for-fire-pauline-kael/",
]

In [42]:
# Short key for each movie, in the same order as `urls`.
# NOTE: 'plant' is the key used for the Planet of the Apes review.
movie = ["pretty", "plant", "backto", "quest"]

In [43]:
# Download every page in `urls` (one HTTP request per URL, so this is network-bound)
transcripts = list(map(url_to_transcript, urls))

https://scrapsfromtheloft.com/2020/02/07/pretty-poison-review-pauline-kael/
https://scrapsfromtheloft.com/2017/06/30/planet-apes-1968-review-pauline-kael/
https://scrapsfromtheloft.com/2018/02/17/back-to-the-future-pauline-kael/
https://scrapsfromtheloft.com/2018/01/12/quest-for-fire-pauline-kael/


In [44]:
# # Pickle files for later use
# # Make a new directory to hold the text files
#!mkdir transcripts


In [45]:
import os

# Nothing else in the notebook creates the "transcripts" directory (the
# `!mkdir` line is commented out), so make sure it exists before writing;
# otherwise open() fails with FileNotFoundError on a fresh checkout.
os.makedirs("transcripts", exist_ok=True)

# Pickle each movie's list of paragraphs to transcripts/<key>.txt.
# NOTE: despite the .txt extension these files hold binary pickle data.
for name, paragraphs in zip(movie, transcripts):
    with open("transcripts/" + name + ".txt", "wb") as file:
        pickle.dump(paragraphs, file)

In [46]:
# Load pickled files into a dict: movie key -> list of paragraph strings.
# NOTE(security): pickle.load can execute arbitrary code from a crafted file;
# only use it on files this notebook wrote itself, never on downloaded ones.
data = {}
for name in movie:
    with open("transcripts/" + name + ".txt", "rb") as file:
        data[name] = pickle.load(file)

In [47]:
# Double check that the data loaded properly: there should be one entry per key in `movie`
data.keys()

dict_keys(['pretty', 'plant', 'backto', 'quest'])

In [48]:
# More checks: peek at the first two paragraphs stored under the 'pretty' key
data['pretty'][:2]

['by Pauline Kael',
 'When I discovered that Pretty Poison had opened without advance publicity or screenings, I rushed to see it, because a movie that makes the movie companies so nervous they’re afraid to show it to the critics stands an awfully good chance of being an interesting movie. Mediocrity and stupidity certainly don’t scare them; talent does. This is a remarkable first feature film by a gifted young American, Noel Black — a movie that should have opened in an art house — and it was playing in a vast and empty theatre, from which, no doubt, it will depart upon the week. And the losses will be so heavy that the movie companies will use this picture as another argument against backing young American directors. The television ads for Pretty Poison are a pitiful attempt to make it seem strident and coarse and brutal — to attract teen-agers by passing it off as a cross between Psycho and Bonnie and Clyde. Those attracted this way are likely to hate the film. Pretty Poison simply 

In [49]:
# Show the first key of the dict (iterating a dict yields its keys)
next(iter(data))

'pretty'

In [50]:
# Next step: reshape the data to key: movie, value: one single string
def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

In [51]:
# Collapse each movie's paragraphs into a one-element list holding the full text
data_combined = {key: [combine_text(data[key])] for key in data}

In [52]:
# Move from the dictionary into a pandas dataframe.
# Let displayed cells show up to 150 characters before being truncated.
pd.set_option('max_colwidth', 150)

# One row per movie key, a single 'transcript' column, rows sorted by key.
data_df = (
    pd.DataFrame.from_dict(data_combined)
    .transpose()
    .rename(columns={0: 'transcript'})
    .sort_index()
)
data_df

Unnamed: 0,transcript
backto,"by Pauline Kael Back to the Future is a piece of Pop Art Americana, featuring Christopher Lloyd as a small-town crackpot inventor who putters in h..."
plant,"by Pauline Kael Planet of the Apes is a very entertaining movie, and you’d better go see it quickly, before your friends take the edge off it by t..."
pretty,"by Pauline Kael When I discovered that Pretty Poison had opened without advance publicity or screenings, I rushed to see it, because a movie that ..."
quest,"by Pauline Kael Eighty thousand years ago, on broad primeval plains, Naoh (Everett McGill), the bravest warrior of the spear-carrying Ulam tribe, ..."


In [53]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over one document.

    Lowercases the text, removes anything enclosed in square brackets, deletes
    every character listed in ``string.punctuation`` and removes words that
    contain a digit.

    Punctuation is deleted rather than replaced by a space, so a hyphenated
    word such as "small-town" becomes "smalltown". Typographic quotes are not
    part of ``string.punctuation`` and are left untouched by this pass.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    # Raw strings keep regex escapes such as \[ and \d away from Python's own
    # string-escape handling, which warns about unrecognised sequences.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias kept so that `.apply(round1)` continues to work.
round1 = clean_text_round1

In [54]:
# Run the first cleaning pass over every transcript and display the result
data_clean = data_df['transcript'].apply(round1).to_frame()
data_clean

Unnamed: 0,transcript
backto,by pauline kael back to the future is a piece of pop art americana featuring christopher lloyd as a smalltown crackpot inventor who putters in his...
plant,by pauline kael planet of the apes is a very entertaining movie and you’d better go see it quickly before your friends take the edge off it by tel...
pretty,by pauline kael when i discovered that pretty poison had opened without advance publicity or screenings i rushed to see it because a movie that ma...
quest,by pauline kael eighty thousand years ago on broad primeval plains naoh everett mcgill the bravest warrior of the spearcarrying ulam tribe and two...


In [None]:
# Second cleaning pass
def clean_text_round2(text):
    '''Delete curly quotes, the ellipsis character and newline characters from text.'''
    # Each pattern is removed in turn, in the listed order.
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text


def round2(x):
    '''Thin wrapper so the cleaner can be passed to ``Series.apply``.'''
    return clean_text_round2(x)

In [None]:
# Run the second cleaning pass over the already-cleaned transcripts and display the result
data_clean = data_clean['transcript'].apply(round2).to_frame()
data_clean

In [None]:
# Display title for each short key. Titles are looked up by key rather than
# taken from a hand-ordered list, so each title stays attached to the right
# row even if the row order of data_df changes. An unknown key raises KeyError.
movie_titles = {
    'backto': 'BACK TO THE FUTURE',
    'plant': 'PLANET OF THE APES',
    'pretty': 'Pretty Poison',
    'quest': 'QUEST FOR FIRE',
}
movie_names = [movie_titles[key] for key in data_df.index]

data_df['movie_names'] = movie_names
data_df

In [None]:
# Let's pickle the corpus dataframe (data_df) to corpus.pkl for later use
data_df.to_pickle("corpus.pkl")

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Rows are documents (same index as data_clean), columns are vocabulary terms.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)
data_dtm

In [None]:
# Let's pickle the document-term matrix (data_dtm) to dtm.pkl for later use
data_dtm.to_pickle("dtm.pkl")

In [None]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
# The with-block closes the file (and flushes the pickle to disk) even if
# dumping fails; a bare open() inside the call leaves the handle dangling.
with open("cv.pkl", "wb") as file:
    pickle.dump(cv, file)

In [None]:
# Reload the document-term matrix and transpose it so that terms become rows
# and documents become columns.
# NOTE: this rebinds `data`, which earlier in the notebook held the dict of raw paragraphs.
data = pd.read_pickle('dtm.pkl').transpose()
data.head()

In [None]:
# For every movie column keep its 30 highest counts as (word, count) pairs
top_dict = {}
for column_name, counts in data.items():
    ranked = counts.sort_values(ascending=False).iloc[:30]
    top_dict[column_name] = list(zip(ranked.index, ranked.values))

top_dict

In [None]:
# Print the top 15 words said by each movie
# The loop variable is deliberately not called `movie`, so the `movie` list
# of keys defined earlier in the notebook is not overwritten.
for movie_key, top_words in top_dict.items():
    print(movie_key)
    # A slice end is exclusive, so [:15] is what yields fifteen words.
    print(', '.join(word for word, count in top_words[:15]))
    print('---')

In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Flatten the per-movie top-30 lists into one list of words, in column order.
# A comprehension keeps its loop variables local, so no notebook-level name
# (such as the `movie` list of keys) is overwritten as a side effect.
words = [word for column in data.columns for (word, count) in top_dict[column]]

words

In [None]:
# Let's aggregate this list and identify the most common words along with how many movies' top-30 lists they occur in
Counter(words).most_common()

In [None]:
# If more than half of the movies have it as a top word, exclude it from the list.
# The cutoff is derived from the number of document columns: each word occurs
# at most once per movie in `words`, so a fixed cutoff of 6 can never be
# exceeded with fewer than seven movies and the list would always be empty.
n_movies = len(data.columns)
add_stop_words = [word for word, count in Counter(words).most_common() if count > n_movies / 2]
add_stop_words

In [None]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Add new stop words (kept as a frozenset so later cells can reuse it as a set)
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix.
# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list or None, so the frozenset is passed as a sorted list.
cv = CountVectorizer(stop_words=sorted(stop_words))
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that do not yet provide get_feature_names_out().
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)

# Pickle it for later use
import pickle
# The with-block guarantees the file is closed once the vectorizer is written.
with open("cv_stop.pkl", "wb") as file:
    pickle.dump(cv, file)
data_stop.to_pickle("dtm_stop.pkl")

In [None]:
# Let's make some word clouds!
# Terminal / Anaconda Prompt: conda install -c conda-forge wordcloud
from wordcloud import WordCloud

# One WordCloud generator, configured once and reused for every movie
wc = WordCloud(
    stopwords=stop_words,      # words to leave out of the cloud
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,           # fixed seed so the layout is repeatable
)

In [None]:
#data.columns

In [None]:
# Reset the output dimensions
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 6]

# Titles are matched to the document columns by position, so this list must
# follow the column order of `data` (the sorted movie keys).
movie_names = ['BACK TO THE FUTURE', 'PLANET OF THE APES', 'Pretty Poison', 'QUEST FOR FIRE']

document_keys = list(data.columns)
if len(movie_names) != len(document_keys):
    raise ValueError("movie_names must contain exactly one title per document column")

# Create one subplot per movie. The grid is sized from the number of documents
# so the clouds fill a single row instead of the first row of a fixed 3x4 grid.
# squeeze=False keeps `axes` two-dimensional even when there is one document.
fig, axes = plt.subplots(1, len(document_keys), squeeze=False)
for ax, key, title in zip(axes[0], document_keys, movie_names):
    wc.generate(data_clean.transcript[key])

    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(title)

plt.show()