In [1]:
import requests
from bs4 import BeautifulSoup
import bs4
import json
from collections import defaultdict
import pandas as pd
from datetime import datetime
import codecs
from sklearn.preprocessing import MultiLabelBinarizer
import wget
import os

In [2]:
dataframe_columns = ['type','dateCreated', 'datePublished','reviewBody','url','author_type','author_name','url','content_type','movie_name','review_type','ratingValue','tomatometer']

#fields extracted from html page
review_fields = ['@type','dateCreated', 'datePublished','reviewBody','url']
author_fields = ['@type','name','url']
content_fields = ['@type','name']
rating_fields = ['@type','ratingValue','tomatometer']

#movie features
features = ['Rated','Runtime','Genre','Director','Writer','Actor','Plot','Language','Country','Poster','Production','Response']
feature_index = [2,4,5,6,7,8,9,10,11,13,22,24]
feature_dict = dict(zip(feature_index,features))



In [6]:
reader = codecs.getreader('utf-8')

In [7]:
#get page html and parse using beautiful soup
def get_page(page_num):
    URL = "https://www.rottentomatoes.com/source-337?page={}".format(page_num)
    page = requests.get(URL)

    soup = BeautifulSoup(page.content,"html.parser")
    return soup

In [8]:
#convert page data to pandas dataframe
def page_to_df(soup_data):
    data = []
    
    reviews = soup_data.find("script",{"type":"application/ld+json"}).contents[0] #extract unlcean review data
    reviews = reviews.strip() #remove new line characters
    reviews = json.loads(reviews) #convert to dictionary
    reviews = reviews['@graph'][1]['itemListElement'] #extract clean review data
    
    for review in reviews:
        rows = []
        for re_f in review_fields:
            rows.append(review['item'][re_f])
        for au_f in author_fields:
            rows.append(review['item']['author'][au_f])
        for co_f in content_fields:
            rows.append(review['item']['itemReviewed'][co_f])
        for ra_f in rating_fields:
            rows.append(review['item']['reviewRating'][ra_f])
        data.append(rows)
    
    return pd.DataFrame(data,columns = dataframe_columns)

In [9]:
def get_movie_id(movie):
    res = ia.search_move(movie)
    for r in res:
        if r['title'] == Movie:
            return r['']

In [10]:
#loop through rotten tomatoes pages and get data
all_reviews = []
total_review_pages = 378
for page in range(1,total_review_pages+1):
    try:
        all_reviews.append(page_to_df(get_page(page)))
    except:
        pass

In [11]:
def call_api(movie,year):
    url = "http://www.omdbapi.com/?apikey=69c24e49&t={}&y={}&type=movie".format(movie,year)
    response = requests.get(url).json()
    return response

In [12]:
def call_api_image(movie,year):
    url = "http://img.omdbapi.com/?apikey=&t={}&y={}".format(movie,year)
    

In [13]:
#account for writer vs writers, actor vs actors in api responses etc.
def data_reformat(res):
    curr = []
    for f in features:
        if(f in res.keys()):
            curr.append(res[f])
        elif(f+"s" in res.keys()):
            curr.append(res[f+"s"])
        else:
            curr.append("N/A")
    return dict(zip(features,curr))

In [14]:
#compile scraped dataframes into a single frame and export as csv
total_reviews = pd.concat(all_reviews)
total_reviews.to_csv("total_reviews.csv",index = False)

In [15]:
data = pd.read_csv('total_reviews.csv')

In [16]:
#get the year the movie came out using the strptime function
data['year'] = data['dateCreated'].apply(lambda date: datetime.strptime(date,'%b %d, %Y').year)

In [17]:
#get additional features from omdb api 
extra_data = []
for n in range(len(data)):
    extra_data.append(
        call_api(
            data['movie_name'][n],
            data['year'][n]
        )
    )
    print(n,end='')

0123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369

2326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575

4377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626

6427642864296430643164326433643464356436643764386439644064416442644364446445644664476448644964506451645264536454645564566457645864596460646164626463646464656466646764686469647064716472647364746475647664776478647964806481648264836484648564866487648864896490649164926493649464956496649764986499650065016502650365046505650665076508650965106511651265136514651565166517651865196520652165226523652465256526652765286529653065316532653365346535653665376538653965406541654265436544654565466547654865496550655165526553655465556556655765586559656065616562656365646565656665676568656965706571657265736574657565766577657865796580658165826583658465856586658765886589659065916592659365946595659665976598659966006601660266036604660566066607660866096610661166126613661466156616661766186619662066216622662366246625662666276628662966306631663266336634663566366637663866396640664166426643664466456646664766486649665066516652665366546655665666576658665966606661666266636664666566666667666866696670667166726673667466756676

8477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726

1042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620

1206112062120631206412065120661206712068120691207012071120721207312074120751207612077120781207912080120811208212083120841208512086120871208812089120901209112092120931209412095120961209712098120991210012101121021210312104121051210612107121081210912110121111211212113121141211512116121171211812119121201212112122121231212412125121261212712128121291213012131121321213312134121351213612137121381213912140121411214212143121441214512146121471214812149121501215112152121531215412155121561215712158121591216012161121621216312164121651216612167121681216912170121711217212173121741217512176121771217812179121801218112182121831218412185121861218712188121891219012191121921219312194121951219612197121981219912200122011220212203122041220512206122071220812209122101221112212122131221412215122161221712218122191222012221122221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260

In [18]:
reformat_data = []
for ed in extra_data:
    reformat_data.append(data_reformat(ed))
    

In [19]:
#make list of dictionaries into a pandas dataframe
feature_df = pd.DataFrame(reformat_data)

In [20]:
#combine total data and feature data
data_total = pd.concat([data,feature_df],axis=1)

In [21]:
data_total

Unnamed: 0,type,dateCreated,datePublished,reviewBody,url,author_type,author_name,url.1,content_type,movie_name,...,Director,Writer,Actor,Plot,Language,Country,Poster,Production,Response,imdbID
0,Review,"Jul 23, 2021","Jul 23, 2021",Are we meant to recoil from sharks or care for...,https://www.nytimes.com/2021/07/23/movies/play...,Person,Natalia Winkelman,https://www.rottentomatoes.com/critic/natalia-...,Movie,Playing With Sharks,...,Sally Aitken,Sally Aitken,Valerie Taylor,"Pioneering scuba diver Valerie Taylor, who has...",English,United States,https://m.media-amazon.com/images/M/MV5BNzFhZG...,,True,tt11226258
1,Review,"Jul 22, 2021","Jul 22, 2021",The film excels when it harnesses the wistful ...,https://www.nytimes.com/2021/07/22/movies/all-...,Person,Isabelia Herrera,https://www.rottentomatoes.com/critic/isabelia...,Movie,All the Streets Are Silent: The Convergence of...,...,Jeremy Elkin,"Dana Brown, Jeremy Elkin","Rosario Dawson, Leo Fitzpatrick, Moby",Trailer 1,English,United States,https://m.media-amazon.com/images/M/MV5BMTExMm...,,True,tt12825680
2,Review,"Jul 22, 2021","Jul 22, 2021","Shyamalan's fluid filmmaking style, outstandin...",https://www.nytimes.com/2021/07/22/movies/old-...,Person,Glenn Kenny,https://www.rottentomatoes.com/critic/glenn-kenny,Movie,Old,...,M. Night Shyamalan,"M. Night Shyamalan, Pierre-Oscar Lévy, Frederi...","Gael García Bernal, Vicky Krieps, Rufus Sewell",A thriller about a family on a tropical holida...,English,United States,https://m.media-amazon.com/images/M/MV5BZGMxYm...,"Universal Pictures, Blinding Edge Pictures",True,tt10954652
3,Review,"Jul 22, 2021","Jul 22, 2021","Wignot layers images, video and - most importa...",https://www.nytimes.com/2021/07/22/movies/aile...,Person,Gia Kourlas,https://www.rottentomatoes.com/critic/gia-kourlas,Movie,Ailey,...,Jamila Wignot,,"Judith Jamison, Alvin Ailey, Bill T. Jones",An immersive portrait of dance pioneer Alvin A...,English,United States,https://m.media-amazon.com/images/M/MV5BZDAzOW...,,True,tt13622084
4,Review,"Jul 22, 2021","Jul 22, 2021",Often as thorny as its subject but also oddly ...,https://www.nytimes.com/2021/07/22/movies/char...,Person,Lena Wilson,https://www.rottentomatoes.com/critic/lena-wilson,Movie,Charlatan,...,Colin Bowles,Colin Bowles (Writer),"Larissa White, Haley Evans, Courtney Rikki Gre...",A bright law student hosts a blissful weekend ...,English,USA,,,True,tt14361772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13545,Review,"Jan 1, 2000","Jan 1, 2000",This would-be spicy film has been made blandly...,,Person,Elvis Mitchell,https://www.rottentomatoes.com/critic/elvis-mi...,Movie,Woman on Top,...,Fina Torres,Vera Blasi,"Penélope Cruz, Murilo Benício, Harold Perrinea...",Brazilian chef Isabella Oliveira moves to San ...,"English, Portuguese",USA,https://m.media-amazon.com/images/M/MV5BODc2ND...,Fox Searchlight,True,tt0206420
13546,Review,"Jan 1, 2000","Jan 1, 2000",Don't bother to hang around for the outtakes. ...,http://movies.nytimes.com/movie/review?res=940...,Person,Lawrence Van Gelder,https://www.rottentomatoes.com/critic/lawrence...,Movie,Dirty Work,...,,,,,,,,,False,
13547,Review,"Jan 1, 2000","Jan 1, 2000","For all its clever notions, Book of Shadows of...",,Person,Stephen Holden,https://www.rottentomatoes.com/critic/stephen-...,Movie,Book of Shadows: Blair Witch 2,...,Joe Berlinger,"Daniel Myrick, Eduardo Sánchez, Dick Beebe","Jeffrey Donovan, Stephen Barker Turner, Erica ...","A group of tourists arrives in Burkittsville, ...","English, German",United States,https://m.media-amazon.com/images/M/MV5BY2YzNz...,"Artisan Entertainment, Haxan Entertainment",True,tt0229260
13548,Review,"Jan 1, 2000","Jan 1, 2000",Chill Factor has no aspirations to be anything...,,Person,Stephen Holden,https://www.rottentomatoes.com/critic/stephen-...,Movie,Chill Factor,...,,,,,,,,,False,


In [24]:
#filter only english movies to keep the data uniform
data_total_english = data_total[data_total['Language'].str.contains('English')]

In [25]:
#filter to only studio movies to further enforce uniformity and simplify the problem 
data_total_english_studio = data_total_english[data_total_english['Production'] != 'N/A']

In [28]:
data_total_english_studio = data_total_english_studio.reset_index(drop = True)

In [29]:
data_total_english_studio.to_csv("data_total_english_studio.csv",index = False)

In [2]:
data_total_english_studio = pd.read_csv("data_total_english_studio.csv")

In [11]:
#download all posters, remove ? and / from name to stop errors
for n in range(len(data_total_english_studio)):
    if not data_total_english_studio['Poster'][n] == "N/A":
        try:
            wget.download(data_total_english_studio['Poster'][n],out = "poster/{}.jpg".format(data_total_english_studio['movie_name'][n].replace("?","").replace("/","")))
        except:
            pass

100% [..............................................................................] 32395 / 32395

In [12]:
#remove empty poster images
for file in os.listdir("poster"):
    if(not ".jpg" in file):
        os.remove("poster/{}".format(file))

In [15]:
#remove any rows where there is a N/A
for col in ['Rated','Runtime','Genre','Director','Writer','Actor','Plot','Country']:
    data_total_english_studio = data_total_english_studio[data_total_english_studio[col] != "N/A"]

In [32]:
#check which movies do not have a poster
poster = []
posters = [x.replace(".jpg","") for x in os.listdir("C:/Users/16789/Documents/MovieProject/poster")]
for m in data_total_english_studio['movie_name']:
    if(not m.replace("?","").replace("/","") in posters):
        poster.append(False)
    else:
        poster.append(True)
data_total_english_studio['poster'] = poster

In [33]:
#only keep movies with poster
data_total_english_studio = data_total_english_studio[data_total_english_studio['poster'] == True]

In [34]:
#save as csv
data_total_english_studio.to_csv("data_total_english_studio_final.csv",index = False)