Obtaining data with one-hot-encoded values

In [7]:
import pandas as pd
import numpy as np

In [12]:
# Read the full dataset from disk. The CSV's leading unnamed column is parsed
# by pandas as "Unnamed: 0", so it is given the explicit label "index" here.
full_data_csv = "fulldataset_updated.csv"
full_data_df = pd.read_csv(full_data_csv).rename(columns={"Unnamed: 0": "index"})
full_data_df.head()

Unnamed: 0,index,imdbid,title,metascore,script department,production companies,writers,directors,casting directors,cast,...,Biography,Comedy,Crime,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,0,118661,The Avengers,12.0,"Sharon Mansfield, Anna Worley","Warner Bros., Jerry Weintraub Productions","Sydney Newman, Don MacPherson",Jeremiah S. Chechik,Susie Figgis,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,215545,Bamboozled,54.0,"Shari L. Carpenter, Carolyn De Sousa","New Line Cinema, 40 Acres & A Mule Filmworks",Spike Lee,Spike Lee,Aisha Coley,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,118715,The Big Lebowski,71.0,T. Kukovinski,"Polygram Filmed Entertainment, Working Title F...","Ethan Coen, Joel Coen","Joel Coen, Ethan Coen",John S. Lyons,"Jeff Bridges, John Goodman, Julianne Moore, St...",...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,112573,Braveheart,68.0,"Sally Jones, Kate Pakenham, Anna Worley","Icon Entertainment International, The Ladd Com...",Randall Wallace,Mel Gibson,Patsy Pollock,"James Robinson, Sean Lawlor, Sandy Nelson, Jam...",...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,88847,The Breakfast Club,66.0,Bob Forrest,"Universal Pictures, A&M Films, Channel Product...",John Hughes,John Hughes,Jackie Burch,"Emilio Estevez, Paul Gleason, Anthony Michael ...",...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# List the column names of the loaded frame to see which features are
# available before any encoding is applied.
full_data_df.columns

Index(['index', 'imdbid', 'title', 'metascore', 'script department',
       'production companies', 'writers', 'directors', 'casting directors',
       'cast', 'scenes', 'characters', 'percent dialogue', 'locations',
       'Positive', 'Anger', 'Disgust', 'Fear', 'Negative', 'Sadness',
       'Anticipation', 'Joy', 'Surprise', 'Trust', 'Top 3', 'Action',
       'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy',
       'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller'],
      dtype='object')

In [14]:
# Data preprocessing
# One-hot-encoding categorical features: Values that appear at least 50 times

# A value must occur in at least this many rows to get its own indicator column.
MIN_OCCURRENCES = 50

# Text columns holding several ", "-separated values per row, in encoding order.
MULTI_VALUE_FEATURES = [
    "script department",
    "production companies",
    "writers",
    "directors",
    "casting directors",
    "cast",
]


def one_hot_encode_frequent(df, column, min_count=MIN_OCCURRENCES, sep=", "):
    """Replace a delimited multi-value text column with indicator columns.

    The column is split on ``sep`` and one-hot-encoded; only values that occur
    in at least ``min_count`` rows are kept. The original text column is
    removed and the surviving indicator columns (float 0.0/1.0) are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the text column to encode.
    min_count : int, default MIN_OCCURRENCES
        Minimum number of rows a value must appear in to be kept.
    sep : str, default ", "
        Delimiter separating the values inside each cell.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the indicator columns added.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df`` (e.g. it was already encoded).
    """
    dummies = df[column].str.get_dummies(sep=sep)
    # Every cell is 0 or 1, so a column's sum is the number of rows containing
    # that value. Cast to float to keep the 0.0/1.0 representation that the
    # rest of the dataset (and the exported CSV) uses.
    frequent = dummies.loc[:, dummies.sum(axis=0) >= min_count].astype(float)
    # A plain join raises ValueError when an indicator name already exists in
    # the frame (e.g. the same person kept as both writer and director).
    # rsuffix is applied only to such clashing names; all others are unchanged.
    return df.drop(columns=[column]).join(frequent, rsuffix=f" ({column})")


for feature in MULTI_VALUE_FEATURES:
    print(f"One-hot-encoding: {feature.capitalize()}")
    full_data_df = one_hot_encode_frequent(full_data_df, feature)

print("Done one-hot-encoding")

One-hot-encoding: Script department
One-hot-encoding: Production companies
One-hot-encoding: Writers
One-hot-encoding: Directors
One-hot-encoding: Casting directors
One-hot-encoding: Cast
Done one-hot-encoding


In [15]:
# List the column names again to inspect which one-hot-encoded columns were
# added and to confirm the original multi-value text columns are gone.
full_data_df.columns

Index(['index', 'imdbid', 'title', 'metascore', 'scenes', 'characters',
       'percent dialogue', 'locations', 'Positive', 'Anger', 'Disgust', 'Fear',
       'Negative', 'Sadness', 'Anticipation', 'Joy', 'Surprise', 'Trust',
       'Top 3', 'Action', 'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama',
       'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'Columbia Pictures', 'New Line Cinema', 'Paramount Pictures',
       'Twentieth Century Fox', 'Universal Pictures', 'Warner Bros.',
       'Francine Maisler', 'Mary Vernieu', 'Arnold Montey', 'Noah Segura'],
      dtype='object')

In [18]:
# Discard the "index", "imdbid" and "title" columns, which are not needed
# in the exported dataset.
full_data_df = full_data_df.drop(columns=["index", "imdbid", "title"])

In [19]:
# Write the encoded dataframe to a CSV file.
# NOTE: the row index is written too (index=True is the pandas default), so the
# file gets an unnamed first column that is read back as "Unnamed: 0".
full_data_df.to_csv("data_w_ohe.csv", index=True)