In [1]:
from sklearn.svm import SVC # I will be using Support Vector Machines as one method to evaluate the dataset and make predictions.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import numpy as np
from seaborn import load_dataset, pairplot
from sklearn.ensemble import RandomForestRegressor # My second method of evaluation is random forest trees.
from sklearn.tree import export_graphviz
import pydot
import warnings
warnings.filterwarnings('ignore')

In [8]:
#def main():

basic_info = pd.read_csv('basics.tsv', sep = '\t')
rating_info = pd.read_csv('ratings.tsv', sep = '\t')

print(basic_info.shape)
print(rating_info.shape)

master_df = pd.merge(rating_info, basic_info, on = ["tconst"]) # We must join the basics and ratings tables on the unique movie ID ('tconst').
print(master_df.shape)

#if __name__ == '__main__':
#	main()

(9813254, 9)
(1307201, 3)
(1307199, 11)
<bound method NDFrame.head of             tconst  averageRating  numVotes  titleType  \
0        tt0000001            5.7      1966      short   
1        tt0000002            5.8       264      short   
2        tt0000003            6.5      1809      short   
3        tt0000004            5.6       178      short   
4        tt0000005            6.2      2608      short   
...            ...            ...       ...        ...   
1307194  tt9916730            8.3        10      movie   
1307195  tt9916766            7.0        21  tvEpisode   
1307196  tt9916778            7.2        36  tvEpisode   
1307197  tt9916840            8.8         6  tvEpisode   
1307198  tt9916880            8.2         6  tvEpisode   

                       primaryTitle               originalTitle isAdult  \
0                        Carmencita                  Carmencita       0   
1            Le clown et ses chiens      Le clown et ses chiens       0   
2       

In [24]:
 # Let's take a look at some of our data.

print(master_df.head)

# endYear seems to be exclusively newline values. We will remove that column.
working_df = master_df.copy()
working_df = working_df.drop("endYear", axis = 1)

# Other columns, mainly startYear, runtimeMinutes, and genres seem to have some newline values. Let's remove the rows that have these values.
remove_newlines = working_df[working_df["startYear"] == '\\N'].index
working_df.drop(remove_newlines, inplace = True)

remove_newlines = working_df[working_df["runtimeMinutes"] == '\\N'].index
working_df.drop(remove_newlines, inplace = True)

remove_newlines = working_df[working_df["genres"] == '\\N'].index
working_df.drop(remove_newlines, inplace = True)

print(working_df.shape)


<bound method NDFrame.head of             tconst  averageRating  numVotes  titleType  \
0        tt0000001            5.7      1966      short   
1        tt0000002            5.8       264      short   
2        tt0000003            6.5      1809      short   
3        tt0000004            5.6       178      short   
4        tt0000005            6.2      2608      short   
...            ...            ...       ...        ...   
1307194  tt9916730            8.3        10      movie   
1307195  tt9916766            7.0        21  tvEpisode   
1307196  tt9916778            7.2        36  tvEpisode   
1307197  tt9916840            8.8         6  tvEpisode   
1307198  tt9916880            8.2         6  tvEpisode   

                       primaryTitle               originalTitle isAdult  \
0                        Carmencita                  Carmencita       0   
1            Le clown et ses chiens      Le clown et ses chiens       0   
2                    Pauvre Pierrot             

In [25]:
# Now we must convert some of the columns to numerical values. The first column to change is titleType.
print(working_df.nunique()) # There are 10 unique titleTypes.

title_types = set(working_df["titleType"])
print(title_types) # {'movie', 'tvMovie', 'tvSpecial', 'videoGame', 'tvShort', 'tvEpisode', 'short', 'tvSeries', 'video', 'tvMiniSeries'}

title_dictionary = {'movie' : 0, 'tvMovie' : 1, 'tvSpecial' : 2, 'videoGame' : 3, 'tvShort' : 4, 'tvEpisode' : 5, 
                    'short' : 6, 'tvSeries' : 7, 'video' : 8, 'tvMiniSeries' : 9}
working_df['titleType'] = working_df['titleType'].apply(lambda x: title_dictionary[x])

title_types = set(working_df["titleType"])
print(title_types)


tconst            925526
averageRating         91
numVotes           21111
titleType             10
primaryTitle      717510
originalTitle     731461
isAdult                4
startYear            144
runtimeMinutes       712
genres              1949
dtype: int64
{'movie', 'tvMovie', 'tvSpecial', 'videoGame', 'tvShort', 'tvEpisode', 'short', 'tvSeries', 'video', 'tvMiniSeries'}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [27]:
# isAdult is supposed to be a boolean field yet it has 4 different values. Let's see what is happening here.

isAdult_types = set(working_df["isAdult"])
print(isAdult_types) # {0, 1, '0', '1'}

isAdult_dictionary = {0 : 0, '0': 0, 1 : 1, '1' : 1}
working_df['isAdult'] = working_df['isAdult'].apply(lambda x: isAdult_dictionary[x])

isAdult_types = set(working_df["isAdult"])
print(isAdult_types)


{0, 1, '0', '1'}
{0, 1}


In [58]:
# The genres feature has 1,949 unique values. It is very impractical to go through all 1,949 different values and create a dictionary.
# However, we can still use the feature. Let us determine how many genres a work has and change the field to represent that.
# We can do this by counting the number of commas in each entry. The max number of genres is 3.
#i = 0
print(working_df.shape)
for work, row in working_df.iterrows():
  #print(row["genres"])
  genre_list = row["genres"].split(",")
  #print(genre_list)
  print(len(genre_list))

  if len(genre_list) == 1:
    row["genres"] = 1
    print("in here")

  elif len(genre_list) == 2:
    row["genres"] = 2

  else:
    row["genres"] = 3
  #i += 1
genre_types = set(working_df["genres"])
print(genre_types)

(925526, 10)
2
2
3
2
2
1
in here
2
2
1
in here
2
2
2
2
2
2
2
2
1
in here
3
2
2
2
2
2
2
2
2
1
in here
3
2
2
2
1
in here
3
1
in here
2
1
in here
1
in here
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
2
3
1
in here
2
2
2
3
2
3
2
2
2
2
1
in here
2
2
1
in here
2
3
2
2
2
2
2
2
1
in here
2
1
in here
1
in here
3
2
1
in here
3
1
in here
3
2
1
in here
2
3
2
2
3
2
2
3
2
2
3
2
3
3
2
3
2
1
in here
2
3
2
2
1
in here
3
3
1
in here
2
2
1
in here
2
2
2
2
1
in here
3
2
3
3
2
3
2
2
2
3
3
3
3
3
3
2
3
3
3
2
2
2
2
2
2
2
2
3
2
2
3
3
2
2
2
2
3
3
2
2
2
2
2
3
3
2
2
3
3
2
2
2
2
3
2
2
2
2
2
3
3
2
2
2
2
3
2
3
3
3
3
2
2
2
3
3
2
2
2
3
2
2
3
3
1
in here
3
2
3
3
2
3
3
3
3
2
3
3
1
in here
2
2
1
in here
1
in here
2
3
2
3
2
2
3
2
2
1
in here
3
2
2
2
2
2
2
2
2
3
2
2
2
2
2
3
2
2
3
3
3
2
1
in here
2
2
3
2
2
3
2
2
3
2
3
3
2
2
3
3
2
2
3
2
2
2
2
3
2
2
2
2
2
3
2
2
3
2
2
3
1
in here
2
2
2
2
1
in here
3
2
2
2
3
2
2
2
2
3
2
2
2
2
3
2
2
2
1
in here
2
2
1
in here
2
2
2
2
2
2
2
3
3
2
1
in here
1
in here
2
2
2
2
2
1
in here
2
2
2
3
1
in here
2
2
2

KeyboardInterrupt: ignored