# ROTTEN_TOMATOES_REVIEWS.CSV CLEANING

In [33]:
import numpy as np
import pandas as pd

In [34]:
# Folder that holds the raw CSV files.
data_directory = 'data_csv/'

# Read the raw critic reviews; the bare name on the last line displays the frame.
reviews_csv_path = data_directory + 'rotten_tomatoes_reviews.csv'
reviews_df = pd.read_csv(reviews_csv_path)
reviews_df

Unnamed: 0,rotten_tomatoes_link,movie_title,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,Percy Jackson & the Olympians: The Lightning T...,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...
...,...,...,...,...,...,...,...,...,...
1129882,m/zulu_dawn,Zulu Dawn,Chuck O'Leary,False,Fantastica Daily,Rotten,2/5,2005-11-02,
1129883,m/zulu_dawn,Zulu Dawn,Ken Hanke,False,"Mountain Xpress (Asheville, NC)",Fresh,3.5/5,2007-03-07,"Seen today, it's not only a startling indictme..."
1129884,m/zulu_dawn,Zulu Dawn,Dennis Schwartz,False,Dennis Schwartz Movie Reviews,Fresh,B+,2010-09-16,A rousing visual spectacle that's a prequel of...
1129885,m/zulu_dawn,Zulu Dawn,Christopher Lloyd,False,Sarasota Herald-Tribune,Rotten,3.5/5,2011-02-28,"A simple two-act story: Prelude to war, and th..."


In [35]:
# Dimensions of the raw frame as (rows, columns).
reviews_df.shape

(1129887, 9)

In [36]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
reviews_df.dtypes

rotten_tomatoes_link    object
movie_title             object
critic_name             object
top_critic                bool
publisher_name          object
review_type             object
review_score            object
review_date             object
review_content          object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [37]:
# Cast the text columns from the generic object dtype to pandas' string dtype.
for text_column in ('movie_title', 'critic_name', 'publisher_name',
                    'review_type', 'review_score', 'review_content'):
    reviews_df[text_column] = reviews_df[text_column].astype('string')

# Parse the review dates (formatted as YYYY-MM-DD) into datetimes.
reviews_df['review_date'] = pd.to_datetime(reviews_df['review_date'], format="%Y-%m-%d")

# Show the resulting dtypes.
reviews_df.dtypes

rotten_tomatoes_link            object
movie_title             string[python]
critic_name             string[python]
top_critic                        bool
publisher_name          string[python]
review_type             string[python]
review_score            string[python]
review_date             datetime64[ns]
review_content          string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND COLUMNS VALUES

In [38]:
# Collect the column names into a plain Python list and show them.
columns_values = reviews_df.columns.values.tolist()
print(columns_values)

['rotten_tomatoes_link', 'movie_title', 'critic_name', 'top_critic', 'publisher_name', 'review_type', 'review_score', 'review_date', 'review_content']


In [39]:
# List the distinct values found in the review_type column.
review_types = list(reviews_df['review_type'].unique())
print(review_types)

['Fresh', 'Rotten']


Fresh: 60% and above – Indicates generally favorable reviews from critics.

Rotten: Below 60% – Indicates generally unfavorable reviews from critics.

In [40]:
# is there a way to normalise these values?
# Gather every distinct non-missing score, sorted, to see which formats occur.
types_of_score = sorted(reviews_df['review_score'].dropna().unique())
print(types_of_score)

['0', '0.02/5', '0.05/4', '0.1/5', '0.13/1', '0.13/5', '0.2/5', '0.21/10', '0.25/4', '0.3/5', '0.4/5', '0.5/10', '0.5/4', '0.5/5', '0.5/5.5', '0.52/1', '0.58/1', '0.59/1', '0.6/1', '0.6/10', '0.65/10', '0.66/10', '0.664363/10', '0.666/4', '0.7221/10', '0.75/10', '0.75/5', '0.7765823/10', '0.8/10', '0.8/5', '0.81/1', '0.82/10', '0.85/10', '0.9/10', '0.96/10', '0/10', '0/100', '0/1000', '0/4', '0/5', '0/6', '1', '1.1/10', '1.1/2', '1.1/5', '1.2/10', '1.2/4', '1.2/5', '1.21/10', '1.24/5', '1.25/4', '1.25/5', '1.3/5', '1.35/5', '1.4/10', '1.4/4', '1.4/5', '1.5/10', '1.5/2', '1.5/3', '1.5/4', '1.5/5', '1.5/6', '1.5242228/10', '1.6/5', '1.62/5', '1.7/5', '1.75/10', '1.75/4', '1.75/5', '1.8/10', '1.8/4', '1.8/5', '1.82/10', '1.85/10', '1.88/10', '1.9/10', '1.9/5', '1.9528/10', '1/1', '1/10', '1/2', '1/3', '1/4', '1/5', '1/54', '1/6', '10', '10.50/20', '10/10', '10/100', '10/20', '10/90', '100/100', '11.5/20', '11/100', '11/20', '12.5/20', '12.50/20', '12/100', '12/20', '13.5/20', '13/100', '1

These scores are confusing. Rotten Tomatoes aggregates review scores from a variety of sources, including professional critics, publications, and audience ratings, which means each critic may use a different rating scheme, such as:
- letters: A, B, C etc.
- numbers: 2, 10, 77 etc.
- fractions: 4/5, 2/3, 10/90 etc.

In [41]:
# Remove every space character from the raw score strings (literal match, no regex).
reviews_df['review_score'] = reviews_df['review_score'].str.replace(' ', '', regex=False)

In [42]:
# Numeric equivalents for letter grades; built once instead of on every call.
_LETTER_GRADES = {
    'A+': 100, 'A': 95, 'A-': 90,
    'B+': 85, 'B': 80, 'B-': 75,
    'C+': 70, 'C': 65, 'C-': 60,
    'D+': 55, 'D': 50, 'D-': 45,
    'F': 30
}


def _to_finite_float(text):
    """Return ``text`` parsed as a finite float, or ``None`` if it is not one."""
    try:
        value = float(text)
    except ValueError:
        return None
    # float() also accepts 'nan' and 'inf', which are not usable scores.
    if value != value or value in (float('inf'), float('-inf')):
        return None
    return value


def normalize_score(score):
    """Convert a raw review score string into a number.

    Supported formats:

    * fractions such as ``'3.5/5'``: returned as a percentage,
      ``numerator / denominator * 100``;
    * plain numbers such as ``'10'`` or ``'7.5'``: returned unchanged as a
      float (the scale is unknown, so no rescaling is attempted);
    * letter grades such as ``'B+'``: mapped through a fixed table
      (case-insensitive).

    Parameters
    ----------
    score : str or any
        Raw score. Anything that is not a ``str`` (e.g. a missing value)
        is treated as "no score".

    Returns
    -------
    float, int or None
        The normalized score, or ``None`` when the input is missing or is
        not in a recognised format (malformed fraction, unknown grade, ...).
    """
    if not isinstance(score, str):
        return None

    score = score.strip()

    # Case when there's a denominator.
    if '/' in score:
        parts = score.split('/')
        if len(parts) != 2:
            # e.g. '1/2/3' -- not a fraction we can interpret.
            return None
        num, den = (_to_finite_float(part) for part in parts)
        if num is None or den is None:
            return None
        if den == 0:
            # A zero denominator cannot be scaled; the numerator is returned as is.
            # NOTE(review): this value is not on the 0-100 scale -- confirm it is wanted.
            return num
        return (num / den) * 100

    # Plain number without a denominator: return it as a float, never as a string.
    number = _to_finite_float(score)
    if number is not None:
        return number

    # Otherwise try to read it as a letter grade; unknown values give None.
    return _LETTER_GRADES.get(score.upper())

In [43]:
# Normalize every raw score and store the result as a float column
# (scores that could not be normalized end up as NaN).
reviews_df['normalized_review_score'] = (
    reviews_df['review_score']
    .apply(normalize_score)
    .astype(float)
)

## ARE THERE NA VALUES IN THE DATASET?

In [44]:
# Count the missing values of every column in a single pass, then pull out
# the per-column figures used by the report below.
na_counts = reviews_df.isna().sum()

not_available_links = na_counts['rotten_tomatoes_link']
not_available_movie_titles = na_counts['movie_title']
not_available_critic_name = na_counts['critic_name']
not_available_top_critic = na_counts['top_critic']
not_available_publisher_name = na_counts['publisher_name']
not_available_review_type = na_counts['review_type']
not_available_review_score = na_counts['review_score']
not_available_normalized_view_score = na_counts['normalized_review_score']
not_available_review_date = na_counts['review_date']
not_available_review_content = na_counts['review_content']

In [45]:
# (label, count) pairs in the order they are reported.
na_report = [
    ('Not available links: ', not_available_links),
    ('\nNot available movie titles: ', not_available_movie_titles),
    ('\nNot available critic name: ', not_available_critic_name),
    ('\nNot available top critic: ', not_available_top_critic),
    ('\nNot available publisher: ', not_available_publisher_name),
    ('\nNot available review type: ', not_available_review_type),
    ('\nNot available review score: ', not_available_review_score),
    ('\nNot available normalized review score: ', not_available_normalized_view_score),
    ('\nNot available review date: ', not_available_review_date),
    ('\nNot available review content: ', not_available_review_content),
]

# Flatten the pairs so print receives label, count, label, count, ...
print(*(item for pair in na_report for item in pair))

Not available links:  0 
Not available movie titles:  0 
Not available critic name:  18521 
Not available top critic:  0 
Not available publisher:  0 
Not available review type:  0 
Not available review score:  305902 
Not available normalized review score:  305902 
Not available review date:  0 
Not available review content:  65778


## ARE THERE ANY DUPLICATES?

In [46]:
# Flag rows that repeat an earlier row in every column, then count them.
duplicate_mask = reviews_df.duplicated()
num_duplicated_reviews = duplicate_mask.sum()
print('There are a total of {} duplicates'.format(num_duplicated_reviews))

There are a total of 119471 duplicates


In [47]:
# Show every row involved in a duplication (keep=False also flags the first occurrence).
reviews_df[reviews_df.duplicated(keep=False)]

Unnamed: 0,rotten_tomatoes_link,movie_title,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,normalized_review_score
35513,m/1069696-screamers,Screamers,Dave White,False,Movies.com,Fresh,B-,1996-01-26,,75.0
35514,m/1069696-screamers,Screamers,Dave White,False,Movies.com,Fresh,B-,1996-01-26,,75.0
35576,m/1069707-othello,Othello,Fred Topel,False,About.com,Fresh,4/5,2003-11-25,Fine Shakespeare adaptation,80.0
35577,m/1069707-othello,Othello,Fred Topel,False,About.com,Fresh,4/5,2003-11-25,Fine Shakespeare adaptation,80.0
41315,m/1087935-buena_vista_social_club,Buena Vista Social Club,,False,Film Threat,Fresh,4/5,2002-12-08,,80.0
...,...,...,...,...,...,...,...,...,...,...
955042,m/the_fog_of_war_eleven_lessons_from_the_life_...,The Fog of War: Eleven Lessons from the Life o...,,False,Film Threat,Fresh,4/5,2005-12-06,,80.0
959360,m/the_girl_with_the_dragon_tattoo_2009,The Girl with the Dragon Tattoo,,False,National Post,Fresh,3.5/4,2009-02-27,,87.5
959361,m/the_girl_with_the_dragon_tattoo_2009,The Girl with the Dragon Tattoo,,False,National Post,Fresh,3.5/4,2009-02-27,,87.5
1044907,m/together_2001,Together,,False,Film Threat,Fresh,4/5,2002-12-08,,80.0


In [48]:
# Keep the first occurrence of each fully identical row (index labels are preserved).
# The explicit copy makes the result an independent frame, so later .loc assignments
# do not act on a slice of the original frame (which raises SettingWithCopyWarning).
reviews_df = reviews_df.drop_duplicates().copy()

# Sanity check: no duplicates should remain, so this displays an empty frame.
reviews_df.loc[reviews_df.duplicated()]

Unnamed: 0,rotten_tomatoes_link,movie_title,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,normalized_review_score


In [49]:
# Spot check: the row labelled 35513 was listed among the duplicates above;
# look it up by label to confirm one copy is still present after de-duplication.
reviews_df.loc[35513]

rotten_tomatoes_link       m/1069696-screamers
movie_title                          Screamers
critic_name                         Dave White
top_critic                               False
publisher_name                      Movies.com
review_type                              Fresh
review_score                                B-
review_date                1996-01-26 00:00:00
review_content                            <NA>
normalized_review_score                   75.0
Name: 35513, dtype: object

## ARE THERE ANY UNREASONABLE VALUES?

In [50]:
# Select the rows whose normalized score is above 100.
above_100_mask = reviews_df['normalized_review_score'].gt(100)
bigger_than_100 = reviews_df.loc[above_100_mask]
bigger_than_100

Unnamed: 0,rotten_tomatoes_link,movie_title,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,normalized_review_score
44843,m/1098652-in_crowd,The In Crowd,Scott A. Mantz,False,Scott Mantz' Movie Reviews,Rotten,7/5,2000-01-01,If you're looking for an engaging movie with c...,140.0
111377,m/adoration,Adoration,Pam Grady,False,Boxoffice Magazine,Fresh,4.5/4,2009-04-23,Egoyan's finest in years.,112.5
112814,m/affliction,Affliction,Russell Smith,False,Austin Chronicle,Fresh,5.5/5,2000-01-01,,110.0
126588,m/american_beauty,American Beauty,Scott A. Mantz,False,Scott Mantz' Movie Reviews,Fresh,9/5,2000-01-01,You can't help but wonder how people find them...,180.0
152047,m/avengers_age_of_ultron,Avengers: Age of Ultron,Kevin A. Ranson,False,MovieCrypt.com,Fresh,3.5/3,2015-05-03,"... tops the original ... yet, somehow it felt...",116.666667
190365,m/bless_the_child,Bless the Child,Scott A. Mantz,False,Scott Mantz' Movie Reviews,Rotten,8/5,2000-01-01,Considering how jaded moviegoers are these day...,160.0
201589,m/boy_erased,Boy Erased,Tiffany Tchobanian,False,Film Threat,Fresh,9.5/5,2018-10-16,Boy Erased is a poignant family drama that exp...,190.0
202512,m/boys_dont_cry,Boys Don't Cry,Marjorie Baumgarten,False,Austin Chronicle,Fresh,5.5/5,2000-01-01,,110.0
238191,m/children_of_men,Children of Men,Lawrence Toppman,False,Charlotte Observer,Fresh,35/4,2007-01-04,It's as if Cuaron sees a future so hellish tha...,875.0
238833,m/chill_factor,Chill Factor,Scott A. Mantz,False,Scott Mantz' Movie Reviews,Rotten,9/5,2000-01-01,"Cuba, got a minute? We've gotta talk! Your car...",180.0


There are some mistyped fractions (for example `35/4`, presumably meant as `3.5/4`) which lead to a normalized score larger than it should be: we normalize every score to a 0–100 scale, so nothing should exceed 100.

In [51]:
# should we delete those values or round it to 100?
# Compare the number of out-of-range scores with the number of non-missing scores.
too_big_count = bigger_than_100['normalized_review_score'].count()
total_count = reviews_df['normalized_review_score'].count()
print('There are {} values which are bigger than 100.'.format(too_big_count))
print('On the other hand, there are {} values in total in the column.'.format(total_count))

There are 40 values which are bigger than 100.
On the other hand, there are 736845 values in total in the column.


From the perspective of data analysis, losing 40 values is negligible, since there are many more values left to analyse; hence they are not significant for our purpose. We can remove them and leave NaN in their place, since inserting a default value could lead to unexpected results.

In [52]:
# Replace normalized scores above 100 with NaN rather than guessing a value.
out_of_range = reviews_df['normalized_review_score'] > 100
reviews_df.loc[out_of_range, 'normalized_review_score'] = np.nan

# Sanity check: no score above 100 should remain, so this displays an empty frame.
reviews_df.loc[reviews_df['normalized_review_score'] > 100]

Unnamed: 0,rotten_tomatoes_link,movie_title,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,normalized_review_score


In [53]:
# Display the frame after the cleaning steps above.
reviews_df

Unnamed: 0,rotten_tomatoes_link,movie_title,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,normalized_review_score
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...,
1,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...",
2,m/0814255,Percy Jackson & the Olympians: The Lightning T...,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...,
3,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...,70.0
4,m/0814255,Percy Jackson & the Olympians: The Lightning T...,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...,
...,...,...,...,...,...,...,...,...,...,...
1129882,m/zulu_dawn,Zulu Dawn,Chuck O'Leary,False,Fantastica Daily,Rotten,2/5,2005-11-02,,40.0
1129883,m/zulu_dawn,Zulu Dawn,Ken Hanke,False,"Mountain Xpress (Asheville, NC)",Fresh,3.5/5,2007-03-07,"Seen today, it's not only a startling indictme...",70.0
1129884,m/zulu_dawn,Zulu Dawn,Dennis Schwartz,False,Dennis Schwartz Movie Reviews,Fresh,B+,2010-09-16,A rousing visual spectacle that's a prequel of...,85.0
1129885,m/zulu_dawn,Zulu Dawn,Christopher Lloyd,False,Sarasota Herald-Tribune,Rotten,3.5/5,2011-02-28,"A simple two-act story: Prelude to war, and th...",70.0


In [55]:
# DataFrame.info() prints its summary itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line to the output).
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1010416 entries, 0 to 1129886
Data columns (total 10 columns):
 #   Column                   Non-Null Count    Dtype         
---  ------                   --------------    -----         
 0   rotten_tomatoes_link     1010416 non-null  object        
 1   movie_title              1010416 non-null  string        
 2   critic_name              993970 non-null   string        
 3   top_critic               1010416 non-null  bool          
 4   publisher_name           1010416 non-null  string        
 5   review_type              1010416 non-null  string        
 6   review_score             736845 non-null   string        
 7   review_date              1010416 non-null  datetime64[ns]
 8   review_content           951918 non-null   string        
 9   normalized_review_score  736805 non-null   float64       
dtypes: bool(1), datetime64[ns](1), float64(1), object(1), string(6)
memory usage: 78.1+ MB
None


## SAVING THE CLEANED DATASET

In [56]:
# Write the cleaned reviews to disk without the index column.
cleaned_csv_path = 'data_cleaned/reviews_cleaned.csv'
reviews_df.to_csv(cleaned_csv_path, index=False)

In [61]:
# Rebind the name to None so this notebook no longer references the frame
# (presumably to let its memory be reclaimed -- confirm intent).
reviews_df = None