In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer

# to track time
from time import time

from sklearn.model_selection import train_test_split

from collections import Counter

# Get data from CSV

In [5]:
df=pd.read_csv('../data/raw/wiki_movie_plots_deduped.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
Release Year        34886 non-null int64
Title               34886 non-null object
Origin/Ethnicity    34886 non-null object
Director            34886 non-null object
Cast                33464 non-null object
Genre               34886 non-null object
Wiki Page           34886 non-null object
Plot                34886 non-null object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


# Clean genre

Number of original entries: 34886

In [7]:
# how many with genre identified
df2=df[df['Genre']!='unknown']
df2=df2[df2['Genre']!='']

In [8]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28803 entries, 6 to 34885
Data columns (total 8 columns):
Release Year        28803 non-null int64
Title               28803 non-null object
Origin/Ethnicity    28803 non-null object
Director            28803 non-null object
Cast                28064 non-null object
Genre               28803 non-null object
Wiki Page           28803 non-null object
Plot                28803 non-null object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


Number of entries after removing unknown and empty genres: 28803

In [9]:
print('Number of genres before cleaning: '+str(len(df2['Genre'].unique())))

Number of genres before cleaning: 2264


In [10]:
def clean_genre(genre):
    genre.strip()
    
    #only take first genre in list
    genre=genre.split(',')[0].strip()
    genre=genre.split('/')[0].strip()
    genre=genre.split('-')[0].strip()
    
    genre_word_set=set(genre.split())

    if 'romance comedy' in genre:
        genre='romantic comedy'
    elif 'romantic comedy' in genre:
        genre='romantic comedy'
    elif set(['sci ', 'sci', 'science']).intersection(genre_word_set):
        genre='science fiction'
    elif set(['drama', 'dystopia', 'revenge','tragedy', 'fiction', 'exploitation', 'biodrama','lgbt','politics', 'docudrama', 'biographical', 'biography','dramedy','biopic','melodrama', 'social', 'political', 'mythology', 'folklore']).intersection(genre_word_set):
        genre='drama'
    elif set(['anim', 'anime','animation', 'animated']).intersection(genre_word_set):
        genre='animated'
    elif set(['martial', 'superheroes', 'jidaigeki','chambara','tokusatsu', 'sword','samurai', 'kung', 'wuxia', 'superhero', 'action', 'p.o.w.', 'war', 'wwi', 'wwii']).intersection(genre_word_set):
        genre='action'
    elif set(['rom', 'romcom']).intersection(genre_word_set):
        genre='romantic comedy'
    elif set(['comedy', 'comedey', 'spoof', 'standup', 'slapstick', 'parody', 'satire']).intersection(genre_word_set):
        genre='comedy'
    elif set(['horror', 'fore', 'vampire', 'slasher', 'slahser']).intersection(genre_word_set):
        genre='horror'        
    elif set(['fantasy', 'fairy','mythological']).intersection(genre_word_set):
        genre='fantasy'        
    elif set(['adventure', 'adventures', 'serial','disaster', 'kaiju', 'swashbuckler']).intersection(genre_word_set):
        genre='adventure'  
    elif set(['romance', 'love','romantic']).intersection(genre_word_set):
        genre='romance'          
    elif set(['suspense', 'supernatural','espionage','thriller', 'spy']).intersection(genre_word_set):
        genre='thriller' 
    elif set(['mystery', 'mob','triad','heist', 'crime', 'cop','yakuza','gangster', 'noir', 'detective']).intersection(genre_word_set):
        genre='crime'
    elif set(['bio', 'costume','noir', 'period', 'shakespearean', 'histo', 'epic', 'history', 'historical']).intersection(genre_word_set):
        genre='epic/historical'
    elif set(['docu', 'footage','mockumentary', 'documentary']).intersection(genre_word_set):
        genre='documentary'  
    elif set(['music', 'opera', 'concert','operetta', 'musical','dance']).intersection(genre_word_set):
        genre='musicals/dance'
    elif set(['short']).intersection(genre_word_set):
        genre='short'
    elif set(['family', 'teen','children', 'children\'s']).intersection(genre_word_set):
        genre='family'
    elif set(['erotica','sexual', 'sexploitation','porno', 'adult', 'erotic', 'ero']).intersection(genre_word_set):
        genre='adult'
    elif set(['ball', 'boxing', 'sport', 'sports']).intersection(genre_word_set):
        genre='sports'
    elif set(['devotional','religious', 'christian','biblical']).intersection(genre_word_set):
        genre='drama'
    elif set(['cowboy', 'western']).intersection(genre_word_set):
        genre='western'  

    return genre 

In [11]:
# clean list of genre to reasonable list
df2['Genre_c']=df['Genre'].apply(clean_genre)

In [12]:
print('Number of genres after cleaning: '+str(len(df2['Genre_c'].unique())))

Number of genres after cleaning: 179


In [13]:
genre_cum_sum=df2['Genre_c'].value_counts().cumsum()

In [14]:
df2['Genre_c'].value_counts()[1:20]

comedy             5996
action             2704
crime              1425
horror             1378
thriller           1369
romance            1339
western             905
animated            859
science fiction     792
adventure           744
musicals/dance      563
romantic comedy     503
family              341
fantasy             322
epic/historical     196
documentary         120
short                61
adult                51
sports               42
Name: Genre_c, dtype: int64

In [15]:
threshold=df2['Title'].count()*0.95

In [16]:
top_genres=genre_cum_sum[genre_cum_sum<threshold].index.values

In [17]:
print('Num of low_genre entries: '+str(df2[~df2['Genre_c'].isin(top_genres)]['Title'].count()))

Num of low_genre entries: 1923


In [18]:
# drop categories that have low numbers (~95 percentile)
df3=df2[df2['Genre_c'].isin(top_genres)]

In [19]:
df3=df3[df3['Genre_c']!='']

In [20]:
print('Num of genre entries: '+str(df3['Genre_c'].unique().size))

Num of genre entries: 12


In [21]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26880 entries, 6 to 34885
Data columns (total 9 columns):
Release Year        26880 non-null int64
Title               26880 non-null object
Origin/Ethnicity    26880 non-null object
Director            26880 non-null object
Cast                26248 non-null object
Genre               26880 non-null object
Wiki Page           26880 non-null object
Plot                26880 non-null object
Genre_c             26880 non-null object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


# Feature: Sentiment per sentence matrix

In [22]:
vader_analyzer = SentimentIntensityAnalyzer()

In [23]:
df3['sentiment']=df3['Plot'].apply(lambda x: [vader_analyzer.polarity_scores(sentence)['compound'] 
                                            for sentence in x.split('.')])

In [24]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26880 entries, 6 to 34885
Data columns (total 10 columns):
Release Year        26880 non-null int64
Title               26880 non-null object
Origin/Ethnicity    26880 non-null object
Director            26880 non-null object
Cast                26248 non-null object
Genre               26880 non-null object
Wiki Page           26880 non-null object
Plot                26880 non-null object
Genre_c             26880 non-null object
sentiment           26880 non-null object
dtypes: int64(1), object(9)
memory usage: 2.3+ MB


In [25]:
df3.to_pickle('../data/interim/cleaned_data.pkl')

In [26]:
df3.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre_c,sentiment
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,western,"[-0.2263, 0.0, -0.1531, -0.765, 0.0, 0.09, 0.0..."
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,comedy,"[0.4215, -0.8555, 0.0]"
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...,action,"[0.25, -0.2732, 0.0]"
13,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...,drama,"[0.0, -0.6597, 0.0516, 0.0644, -0.0772, 0.0, -..."
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...,comedy,"[0.3182, 0.0, 0.6486, 0.0, 0.4767, -0.6808, 0.0]"
