In [1]:
from pathlib import Path
import pandas as pd



In [2]:
# Load the raw movie and review CSV exports (file names are relative to the working directory)
massive_movies, massive_reviews = (
    pd.read_csv(csv_name)
    for csv_name in ('massive_rotten_tomatoes_movies.csv',
                     'massive_rotten_tomatoes_movie_reviews.csv')
)

In [3]:
# Combine into one DataFrame, begin cleaning data.
# The join key is spelled out rather than left to pandas' default (every column name the
# two frames share), so an accidental extra shared column cannot silently change the join.
# NOTE(review): assumes `id` is the only column common to both files -- confirm.
massive = pd.merge(massive_movies, massive_reviews, how='inner', on='id')

In [4]:
# Critic score minus audience score per row: positive when the critic score (tomatoMeter)
# is the higher of the two, negative when audienceScore is higher
massive['delta'] = massive['tomatoMeter'].sub(massive['audienceScore'])

In [5]:
# Drop unnecessary columns; the raw critic and audience scores go too, since `delta`
# already holds their difference
massive = massive.drop(
    labels=[
        'originalScore', 'rating', 'ratingContents',
        'releaseDateTheaters', 'releaseDateStreaming', 'runtimeMinutes',
        'genre', 'originalLanguage', 'director', 'writer',
        'boxOffice', 'distributor', 'soundMix', 'reviewUrl',
        'id', 'reviewId', 'creationDate', 'isTopCritic', 'reviewState',
        'tomatoMeter', 'audienceScore',
    ],
    axis='columns',
)

In [6]:
# Count missing values in each remaining column
massive.isna().sum()

title               5470
criticName             0
publicatioName         0
reviewText         70284
scoreSentiment         0
delta             109854
dtype: int64

In [7]:
# A null in any remaining column either indicates low quality data or will interfere with
# score calculations, so discard every row that has one
massive = massive.dropna(axis='index', how='any')

In [8]:
# Remove POSITIVE reviews on rows whose delta (critic score minus audience score) is 20 or more.
# NOTE(review): the previous comment spoke of "isolating" such reviews, but these rows are
# discarded, not kept -- confirm this is the intended direction.
condition = (massive['scoreSentiment'] == 'POSITIVE') & (massive['delta'] >= 20)

# Keeping the complement of the mask is a no-op when nothing matches, so no guard is needed
massive = massive[~condition]

In [9]:
# Remove NEGATIVE reviews on rows whose delta (critic score minus audience score) is -20 or less
condition = (massive['scoreSentiment'] == 'NEGATIVE') & (massive['delta'] <= -20)

# Keeping the complement of the mask is a no-op when nothing matches, so no guard is needed
massive = massive[~condition]

In [10]:
# Keep only the first occurrence of each distinct review text; duplicates are matched on
# `reviewText` alone, whichever title or critic they are attached to
massive = massive.drop_duplicates(subset=['reviewText'], keep='first')

In [11]:
# Write the cleaned frame to massive.csv for analysis and processing, leaving out the row index
massive.to_csv(path_or_buf="massive.csv", index=False)

In [12]:
# Show output dataframe: as the last expression in the cell, the notebook renders it as the cell's output
massive

Unnamed: 0,title,criticName,publicatioName,reviewText,scoreSentiment,delta
2,Adrift,Josh Parham,Next Best Picture,This is nowhere near the level of other great ...,POSITIVE,4.0
3,Adrift,Cory Woodroof,615 Film,"Adrift somehow survives the late reveal, and t...",POSITIVE,4.0
4,Adrift,Kip Mooney,College Movie Review,"For almost the entire runtime, the only people...",NEGATIVE,4.0
5,Adrift,Robin Holabird,Robin Holabird,Shailene Woodley shows no fear diving on and o...,POSITIVE,4.0
6,Adrift,Richard Crouse,Richard Crouse,What is meant to be a voyage of self-discovery...,NEGATIVE,4.0
...,...,...,...,...,...,...
1469538,The Human Body,Bruce Feld,Film Journal International,"From beginning to end, The Human Body is a rar...",POSITIVE,18.0
1469539,The Human Body,Bridget Byrne,Boxoffice Magazine,The impression left is that of a series of sli...,NEGATIVE,18.0
1469540,The Human Body,Ross Anthony,Hollywood Report Card,"Though plenty of room for improvement exists, ...",POSITIVE,18.0
1469541,The Human Body,John Petrakis,Chicago Tribune,My guess is that The Human Body will inspire m...,POSITIVE,18.0
