# Content Based Recommendation Model

By: Anat Jacobson and Stephanie Ciaccia

---

## Overview

Building a content based recommendation system

# Business Problem

Add business problem here

# Data Understanding

In [34]:
#importing packages

In [778]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import datetime

import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
%matplotlib inline
import plotly
import plotly.express as px
from matplotlib.ticker import StrMethodFormatter

from collections import Counter
from nltk.corpus import stopwords

from IPython.display import display

Function for printing long lists

In [36]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

Setting display option to view all columns

In [512]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [517]:
pd.reset_option("display.max_colwidth")

### Data Source : Grouplens

The data used in our analysis was obtained from [Movie Lens]([http://movielens.org]), which consists of user ratings on movies using a 5-star scale. The dataset was collected between March 29, 1996, and September 24, 2018, and includes ratings from 610 users. In total, the dataset contains 100836 ratings and 3683 tags across 9742 movies.

Files from the dataset inclue `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`.

### Import CSVs, Merge, and Clean all Data Needed
#### 4 Datasets imported:

- Movies:
- Links:
- Ratings:
- Tags:

In [355]:
#importing datasets
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv")
links_df = pd.read_csv("data/links.csv")
tags_df = pd.read_csv("data/tags.csv")

In [356]:
#checking the shape of all dfs
print('movies_df: ', movies_df.shape)
print('links_df: ',links_df.shape)
print('ratings_df: ',ratings_df.shape)
print('tags_df: ',tags_df.shape)

movies_df:  (9742, 3)
links_df:  (9742, 3)
ratings_df:  (100836, 4)
tags_df:  (3683, 4)


In [357]:
tag_group_df = tags_df.groupby(by="movieId").agg({"tag":" ".join})

In [358]:
#we would want to run a vectorizer and clean data
tag_group_df.value_counts()

tag                                                               
In Netflix queue                                                      109
Disney                                                                 15
aliens                                                                 10
Shakespeare                                                            10
Stephen King                                                            9
                                                                     ... 
death penalty Nun                                                       1
death penalty John Grisham                                              1
deafness                                                                1
darth vader luke skywalker space opera                                  1
"artsy" atmospheric gritty hallucinatory surreal visually stunning      1
Length: 1038, dtype: int64

In [359]:
#splitting genre into multiple columns
genre_split = pd.DataFrame(movies_df.genres.str.split('|',6).tolist(),
                                 columns = ['genre_1','genre_2', 'genre_3',
                                            'genre_4', 'genre_5', 'genre_6', 'genre_7'])

#merging dataframes
movies_df = pd.merge(pd.DataFrame(movies_df), pd.DataFrame(genre_split), left_index=True, right_index=True)

#dropping genre columns
movies_df.drop(columns="genres", inplace=True)

In [360]:
#looking at unique genre names
genre_split['genre_2'].unique()

array(['Animation', 'Children', 'Romance', 'Drama', None, 'Crime',
       'Adventure', 'Horror', 'Comedy', 'Sci-Fi', 'War', 'Thriller',
       'Mystery', 'Film-Noir', 'Fantasy', 'Musical', 'Western', 'IMAX',
       'Documentary'], dtype=object)

In [361]:
#finding unique genres and saving to list to one hot encode genres
genre_list = genre_split['genre_2'].unique().tolist()

#appending additional genres not in this column
genre_list.append('Fantasy')
genre_list.append('Action')

#removing none value in list
genre_list.remove(None)

In [362]:
genre_list

['Animation',
 'Children',
 'Romance',
 'Drama',
 'Crime',
 'Adventure',
 'Horror',
 'Comedy',
 'Sci-Fi',
 'War',
 'Thriller',
 'Mystery',
 'Film-Noir',
 'Fantasy',
 'Musical',
 'Western',
 'IMAX',
 'Documentary',
 'Fantasy',
 'Action']

In [363]:
# looping over genres in genre_list
for genre in genre_list:
    # create new columns with genre names in genre_list
    new_col = genre
    
    # make new column with 0 for all values (row will be made up of bool values)
    movies_df[new_col] = 0
    
    # looping over columns and rows and check if value is equal to genre in list - if it's equal we add value to 1
    for col in movies_df.columns:
        for i in range(len(movies_df)):
            if movies_df[col][i] == genre:
                movies_df[new_col][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df[new_col][i] = 1


In [364]:
#sanity check of for loop to make sure it worked correctly
movies_df.head(10)

Unnamed: 0,movieId,title,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,Animation,Children,Romance,Drama,Crime,Adventure,Horror,Comedy,Sci-Fi,War,Thriller,Mystery,Film-Noir,Fantasy,Musical,Western,IMAX,Documentary,Action
0,1,Toy Story (1995),Adventure,Animation,Children,Comedy,Fantasy,,,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
1,2,Jumanji (1995),Adventure,Children,Fantasy,,,,,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy,Romance,,,,,,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy,Drama,Romance,,,,,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,,,,,,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,6,Heat (1995),Action,Crime,Thriller,,,,,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
6,7,Sabrina (1995),Comedy,Romance,,,,,,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,8,Tom and Huck (1995),Adventure,Children,,,,,,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death (1995),Action,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,10,GoldenEye (1995),Action,Adventure,Thriller,,,,,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1


In [368]:
# dropping original genre columns
movies_df = movies_df.drop(columns=['genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6', 'genre_7'])

In [371]:
#making year and title columns separate

#making year column
movies_df['year'] = movies_df['title'].str[-6:]

#making real_title column
movies_df['real_title'] = movies_df['title'].str[:-6]

#dropping original title column
movies_df = movies_df.drop(columns="title")

#removing parethensis from year
movies_df['year'] = movies_df['year'].str.replace('[^a-zA-Z0-9 ]', '', regex=True)

#removing whitespaces
movies_df['year'] = movies_df['year'].str.strip()

In [370]:
#checking year values
movies_df['year'].unique()

array(['1995', '1994', '1996', '1976', '1992', '1967', '1993', '1964',
       '1977', '1965', '1982', '1990', '1991', '1989', '1937', '1940',
       '1969', '1981', '1973', '1970', '1955', '1959', '1968', '1988',
       '1997', '1972', '1943', '1952', '1951', '1957', '1961', '1958',
       '1954', '1934', '1944', '1960', '1963', '1942', '1941', '1953',
       '1939', '1950', '1946', '1945', '1938', '1947', '1935', '1936',
       '1956', '1949', '1932', '1975', '1974', '1971', '1979', '1987',
       '1986', '1980', '1978', '1985', '1966', '1962', '1983', '1984',
       '1948', '1933', '1931', '1922', '1998', '1929', '1930', '1927',
       '1928', '1999', '2000', '1926', '1919', '1921', '1925', '1923',
       '2001', '2002', '2003', '1920', '1915', '1924', '2004', '1916',
       '1917', '2005', '2006', '1902', 'ylon 5', '1903', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '1908', 'er One', 'a Road', 'Watson', 'nimals',
    

In [372]:
year_vals = ['er One', 'a Road', 'Watson', 'nimals','terson', 'nlight',
             'The OA', 'Cosmos', 'd Baby', 'Iron 2','Mirror', 'ylon 5']

movies_df = movies_df[~movies_df['year'].isin(year_vals)]

Inspecting links - The additional ids will be helpful so we can pull additional movie information from TMBD's api for the content based model.

In [373]:
links_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


Inspecting tags

In [374]:
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [713]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [710]:
tags_df['movieId'].value_counts()

296     181
2959     54
924      41
293      35
7361     34
       ... 
6107      1
5878      1
5876      1
3192      1
8190      1
Name: movieId, Length: 1572, dtype: int64

In [765]:
#lambda function to aggregate tags
tags_df_group = tags_df.groupby(['movieId']).agg(lambda x: ', '.join(x))

#splitting tage based on ,
tags_df_group = tags_df_group['tag'].str.split(",", expand=True)

#saving only first four tags
tags_df_group = tags_df_group.iloc[:, 0:4]

#renaming columns
tags_df_group = tags_df_group.rename(columns={0:"tag_1", 1:"tag_2", 2:"tag_3", 3:"tag_4"})

#resetting index
tags_df_group.reset_index()

Unnamed: 0,movieId,tag_1,tag_2,tag_3,tag_4
0,1,pixar,pixar,fun,
1,2,fantasy,magic board game,Robin Williams,game
2,3,moldy,old,,
3,5,pregnancy,remake,,
4,7,remake,,,
...,...,...,...,...,...
1567,183611,Comedy,funny,Rachel McAdams,
1568,184471,adventure,Alicia Vikander,video game adaptation,
1569,187593,Josh Brolin,Ryan Reynolds,sarcasm,
1570,187595,Emilia Clarke,star wars,,


### Merging files into one dataframe for cleaning

In [711]:
#merging ratings, movies, and links
final_movies = pd.merge(ratings_df, movies_df, on="movieId", how="left")
final_movies = pd.merge(final_movies, links_df, on="movieId", how="left")
final_movies = pd.merge(final_movies, tags_df_group, on="movieId", how="left")

In [712]:
final_movies

Unnamed: 0,userId,movieId,rating,timestamp,Animation,Children,Romance,Drama,Crime,Adventure,Horror,Comedy,Sci-Fi,War,Thriller,Mystery,Film-Noir,Fantasy,Musical,Western,IMAX,Documentary,Action,year,real_title,imdbId,tmdbId,tag_1,tag_2,tag_3,tag_3.1
0,1,1,4.0,964982703,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1995,Toy Story,114709,862.0,pixar,pixar,fun,
1,1,3,4.0,964981247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,Grumpier Old Men,113228,15602.0,moldy,old,,
2,1,6,4.0,964982224,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1995,Heat,113277,949.0,,,,
3,1,47,5.0,964983815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,Seven (a.k.a. Se7en),114369,807.0,mystery,twist ending,serial killer,
4,1,50,5.0,964982931,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,"Usual Suspects, The",114814,629.0,mindfuck,suspense,thriller,tricky
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,Split,4972582,381288.0,,,,
100832,610,168248,5.0,1493850091,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017,John Wick: Chapter Two,4425200,324552.0,action,dark hero,gun tactics,hitman
100833,610,168250,5.0,1494273047,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,Get Out,5052448,419430.0,,,,
100834,610,168252,5.0,1493846352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017,Logan,3315342,263115.0,dark,emotional,gritty,heartbreaking


In [716]:
#making column names lowercase
final_movies.columns = final_movies.columns.str.lower()

# Data Preparation

To being, we will need to remove null values and split the genre column so this can be One Hot Encoded.

In [717]:
#saving as new dataframe for cleaning
movie_df = final_movies

In [718]:
#checking for null values
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 31 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userid       100836 non-null  int64  
 1   movieid      100836 non-null  int64  
 2   rating       100836 non-null  float64
 3   timestamp    100836 non-null  int64  
 4   animation    100819 non-null  float64
 5   children     100819 non-null  float64
 6   romance      100819 non-null  float64
 7   drama        100819 non-null  float64
 8   crime        100819 non-null  float64
 9   adventure    100819 non-null  float64
 10  horror       100819 non-null  float64
 11  comedy       100819 non-null  float64
 12  sci-fi       100819 non-null  float64
 13  war          100819 non-null  float64
 14  thriller     100819 non-null  float64
 15  mystery      100819 non-null  float64
 16  film-noir    100819 non-null  float64
 17  fantasy      100819 non-null  float64
 18  musical      100819 non-

In [719]:
#dropping genre columns that are mostly empty
movie_df = movie_df.drop(columns=["timestamp"])

In [720]:
#checking nulls
movie_df.isna().sum()

userid             0
movieid            0
rating             0
animation         17
children          17
romance           17
drama             17
crime             17
adventure         17
horror            17
comedy            17
sci-fi            17
war               17
thriller          17
mystery           17
film-noir         17
fantasy           17
musical           17
western           17
imax              17
documentary       17
action            17
year              17
real_title        17
imdbid             0
tmdbid            13
tag_1          52549
tag_2          76673
tag_3          83355
tag_3          87176
dtype: int64

In [721]:
#making year datetime
movie_df['year'].unique()

array(['1995', '1996', '1994', '1977', '1993', '1990', '1989', '1991',
       '1940', '1939', '1941', '1938', '1947', '1975', '1968', '1945',
       '1963', '1971', '1951', '1979', '1992', '1986', '1982', '1980',
       '1987', '1981', '1983', '1960', '1952', '1984', '1933', '1985',
       '1974', '1922', '1997', '1998', '1930', '1976', '1942', '1967',
       '1959', '1946', '1978', '1973', '1988', '1999', '1931', '1964',
       '1962', '1965', '1969', '2000', '1970', '2003', '2004', '2006',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '1955', '2002', '1961', '1954', '1957', '1943', '1956', '1966',
       '2001', '1937', '1972', '2005', '1950', '2007', '2016', '1926',
       '1944', '1949', '1936', '2017', '1958', '1935', '1927', '1953',
       '2018', '1923', '1902', '1920', '1948', '1928', nan, '1934',
       '1916', '1908', '1932', '1921', '1925', '1929', '1917', '1915',
       '1924', '1903', '1919'], dtype=object)

In [722]:
movie_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df.dropna(inplace=True)


In [723]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13660 entries, 4 to 100834
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   userid       13660 non-null  int64  
 1   movieid      13660 non-null  int64  
 2   rating       13660 non-null  float64
 3   animation    13660 non-null  float64
 4   children     13660 non-null  float64
 5   romance      13660 non-null  float64
 6   drama        13660 non-null  float64
 7   crime        13660 non-null  float64
 8   adventure    13660 non-null  float64
 9   horror       13660 non-null  float64
 10  comedy       13660 non-null  float64
 11  sci-fi       13660 non-null  float64
 12  war          13660 non-null  float64
 13  thriller     13660 non-null  float64
 14  mystery      13660 non-null  float64
 15  film-noir    13660 non-null  float64
 16  fantasy      13660 non-null  float64
 17  musical      13660 non-null  float64
 18  western      13660 non-null  float64
 19  ima

In [724]:
movie_df['year'] = movie_df['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['year'] = movie_df['year'].astype(int)


In [725]:
movie_df['year'].unique()

array([1995, 1994, 1977, 1993, 1991, 1996, 1992, 1975, 1980, 1981, 1960,
       1984, 1997, 1998, 1999, 1987, 2000, 2006, 2008, 2009, 2010, 2012,
       2013, 2014, 1954, 1957, 2001, 1964, 1968, 2002, 2003, 2004, 2005,
       1990, 2007, 1986, 2016, 1982, 1931, 2015, 2017, 1958, 2011, 2018,
       1967, 1959, 1950, 1988, 1952, 1976, 1985])

In [726]:
#making year datetime
movie_df['year'] = pd.to_datetime(movie_df['year'], format='%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['year'] = pd.to_datetime(movie_df['year'], format='%Y')


In [727]:
#confirming null values and datatypes
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13660 entries, 4 to 100834
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   userid       13660 non-null  int64         
 1   movieid      13660 non-null  int64         
 2   rating       13660 non-null  float64       
 3   animation    13660 non-null  float64       
 4   children     13660 non-null  float64       
 5   romance      13660 non-null  float64       
 6   drama        13660 non-null  float64       
 7   crime        13660 non-null  float64       
 8   adventure    13660 non-null  float64       
 9   horror       13660 non-null  float64       
 10  comedy       13660 non-null  float64       
 11  sci-fi       13660 non-null  float64       
 12  war          13660 non-null  float64       
 13  thriller     13660 non-null  float64       
 14  mystery      13660 non-null  float64       
 15  film-noir    13660 non-null  float64       
 16  fan

In [728]:
#renaming columns for modeling

name = ["userid", "movieid", "real_title", "tmdbid", "imdbid"]
new_name = ["user_id", "movie_id", "title","tmdb_id", "imdb_id"]

movie_df = movie_df.rename(columns=dict(zip(name, new_name)))

#### Final Cleaned Dataset (movie_df)
Features are:
- Genre
- Rating
- Tag

In [729]:
#reviewing final cleaned dataset
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13660 entries, 4 to 100834
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      13660 non-null  int64         
 1   movie_id     13660 non-null  int64         
 2   rating       13660 non-null  float64       
 3   animation    13660 non-null  float64       
 4   children     13660 non-null  float64       
 5   romance      13660 non-null  float64       
 6   drama        13660 non-null  float64       
 7   crime        13660 non-null  float64       
 8   adventure    13660 non-null  float64       
 9   horror       13660 non-null  float64       
 10  comedy       13660 non-null  float64       
 11  sci-fi       13660 non-null  float64       
 12  war          13660 non-null  float64       
 13  thriller     13660 non-null  float64       
 14  mystery      13660 non-null  float64       
 15  film-noir    13660 non-null  float64       
 16  fan

In [730]:
#exploring that dataset
movie_df.describe()

Unnamed: 0,user_id,movie_id,rating,animation,children,romance,drama,crime,adventure,horror,comedy,sci-fi,war,thriller,mystery,film-noir,fantasy,musical,western,imax,documentary,action,imdb_id,tmdb_id
count,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0,13660.0
mean,312.53836,21714.679502,3.918302,0.077892,0.081918,0.122035,0.47123,0.272328,0.274597,0.077526,0.259883,0.257906,0.086896,0.334407,0.137994,0.001098,0.11918,0.026354,0.011493,0.075842,0.001171,0.354246,371088.9,15397.337189
std,178.844401,35616.503573,0.951889,0.268011,0.27425,0.327338,0.49919,0.445174,0.446327,0.267433,0.438586,0.437498,0.281693,0.4718,0.344906,0.033121,0.324012,0.160193,0.106593,0.264755,0.034206,0.478302,547956.1,47475.896359
min,1.0,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22100.0,11.0
25%,159.0,593.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109830.0,274.0
50%,313.0,2762.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120903.0,680.0
75%,465.0,35836.0,4.5,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,364569.0,8587.0
max,610.0,193565.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4425200.0,374430.0


## Data Source #2 - TMDB API

The API pull includes credit (crew and cast) information for all movies.

Documentation - https://developers.themoviedb.org/3/movies/get-movie-credits

In [None]:
# original api pull
# import requests
# import pandas as pd

## this is a list of all movie id's from the original dataframe
# movie_list_final = []

# for id_num in tmbd_list:
    
#     response = requests.get(''f"https://api.themoviedb.org/3/movie/{id_num}/credits?api_key=a88e9a2c082e4caa0a1447a81f7e743b&language=en-US"'')

#     data = response.json()
    
#     #appending director name to list
#     movie_list_final.append(data)

In [739]:
import zipfile
 
# open zipped dataset
with zipfile.ZipFile("data/tmdb_pull_cast_crew.csv.zip") as z:
   # open the csv file in the dataset
   with z.open("tmdb_pull_cast_crew.csv") as f:
       
      # read the dataset
      api_df = pd.read_csv(f)

In [740]:
api_df

Unnamed: 0.1,Unnamed: 0,id,cast,crew,success,status_code,status_message
0,0,13.0,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know...",,,
1,1,278.0,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...",,,
2,2,680.0,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...","[{'adult': False, 'gender': 2, 'id': 138, 'kno...",,,
3,3,274.0,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...","[{'adult': False, 'gender': 2, 'id': 117, 'kno...",,,
4,4,603.0,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 123, 'kno...",,,
...,...,...,...,...,...,...,...
9710,9710,10846.0,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...","[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",,,
9711,9711,206296.0,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...","[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",,,
9712,9712,244688.0,"[{'adult': False, 'gender': 2, 'id': 1279972, ...","[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",,,
9713,9713,47533.0,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...","[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",,,


In [741]:
#inspecting keys from api pill
api_df.keys()

Index(['Unnamed: 0', 'id', 'cast', 'crew', 'success', 'status_code',
       'status_message'],
      dtype='object')

In [742]:
#looking at crew
api_df.iloc[0]['crew']

'[{\'adult\': False, \'gender\': 2, \'id\': 37, \'known_for_department\': \'Sound\', \'name\': \'Alan Silvestri\', \'original_name\': \'Alan Silvestri\', \'popularity\': 2.829, \'profile_path\': \'/pQOAsQDuYMR4cKaPAP0wkRlCSNo.jpg\', \'credit_id\': \'52fe420ec3a36847f800076b\', \'department\': \'Sound\', \'job\': \'Original Music Composer\'}, {\'adult\': False, \'gender\': 2, \'id\': 37, \'known_for_department\': \'Sound\', \'name\': \'Alan Silvestri\', \'original_name\': \'Alan Silvestri\', \'popularity\': 2.829, \'profile_path\': \'/pQOAsQDuYMR4cKaPAP0wkRlCSNo.jpg\', \'credit_id\': \'5cc623de9251410961f44e23\', \'department\': \'Sound\', \'job\': \'Conductor\'}, {\'adult\': False, \'gender\': 2, \'id\': 24, \'known_for_department\': \'Directing\', \'name\': \'Robert Zemeckis\', \'original_name\': \'Robert Zemeckis\', \'popularity\': 11.27, \'profile_path\': \'/lPYDQ5LYNJ12rJZENtyASmVZ1Ql.jpg\', \'credit_id\': \'52fe420ec3a36847f800072d\', \'department\': \'Directing\', \'job\': \'Dire

In [743]:
#confirming crew type
type(api_df.iloc[0]['crew'])

str

### Breaking out crew list to find director

In [744]:
#dropping null values from crew
api_df = api_df.dropna(subset=["crew"])

In [745]:
#defining function to turn crew from a list to a string

def list_dict(row):
    return list(eval(row))

In [746]:
#lambda function to make list of dictionaries column
api_df['list_dict'] = api_df['crew'].apply(lambda x: list_dict(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_df['list_dict'] = api_df['crew'].apply(lambda x: list_dict(x))


In [747]:
def find_director(row):

    for entry in row:
        if entry['job'] == 'Director':
            name = entry['name']
            return name

In [748]:
#making a lambda function
api_df['director'] = api_df['list_dict'].apply(lambda x: find_director(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_df['director'] = api_df['list_dict'].apply(lambda x: find_director(x))


In [749]:
api_df

Unnamed: 0.1,Unnamed: 0,id,cast,crew,success,status_code,status_message,list_dict,director
0,0,13.0,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know...",,,,"[{'adult': False, 'gender': 2, 'id': 37, 'know...",Robert Zemeckis
1,1,278.0,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 153, 'kno...",Frank Darabont
2,2,680.0,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...","[{'adult': False, 'gender': 2, 'id': 138, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 138, 'kno...",Quentin Tarantino
3,3,274.0,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...","[{'adult': False, 'gender': 2, 'id': 117, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 117, 'kno...",Jonathan Demme
4,4,603.0,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 123, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 123, 'kno...",Lilly Wachowski
...,...,...,...,...,...,...,...,...,...
9710,9710,10846.0,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...","[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",,,,"[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",Roberto Benigni
9711,9711,206296.0,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...","[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",Richard LaGravenese
9712,9712,244688.0,"[{'adult': False, 'gender': 2, 'id': 1279972, ...","[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",Josh Stolberg
9713,9713,47533.0,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...","[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",Brian Levant


In [750]:
#dropping null values
api_df = api_df.dropna(subset=['director'])

### Breaking out cast list to find actors

In [751]:
#dropping null values from crew
api_df = api_df.dropna(subset=["cast"])

In [752]:
#lambda function to make list of dictionaries column from cast
api_df['cast_dict'] = api_df['cast'].apply(lambda x: list_dict(x))

In [753]:
api_df

Unnamed: 0.1,Unnamed: 0,id,cast,crew,success,status_code,status_message,list_dict,director,cast_dict
0,0,13.0,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know...",,,,"[{'adult': False, 'gender': 2, 'id': 37, 'know...",Robert Zemeckis,"[{'adult': False, 'gender': 2, 'id': 31, 'know..."
1,1,278.0,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 153, 'kno...",Frank Darabont,"[{'adult': False, 'gender': 2, 'id': 504, 'kno..."
2,2,680.0,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...","[{'adult': False, 'gender': 2, 'id': 138, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 138, 'kno...",Quentin Tarantino,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn..."
3,3,274.0,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...","[{'adult': False, 'gender': 2, 'id': 117, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 117, 'kno...",Jonathan Demme,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn..."
4,4,603.0,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 123, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 123, 'kno...",Lilly Wachowski,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn..."
...,...,...,...,...,...,...,...,...,...,...
9710,9710,10846.0,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...","[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",,,,"[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",Roberto Benigni,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn..."
9711,9711,206296.0,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...","[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",Richard LaGravenese,"[{'adult': False, 'gender': 1, 'id': 84223, 'k..."
9712,9712,244688.0,"[{'adult': False, 'gender': 2, 'id': 1279972, ...","[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",Josh Stolberg,"[{'adult': False, 'gender': 2, 'id': 1279972, ..."
9713,9713,47533.0,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...","[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",Brian Levant,"[{'adult': False, 'gender': 1, 'id': 89110, 'k..."


In [754]:
api_df['cast_dict'][0]

[{'adult': False,
  'gender': 2,
  'id': 31,
  'known_for_department': 'Acting',
  'name': 'Tom Hanks',
  'original_name': 'Tom Hanks',
  'popularity': 71.995,
  'profile_path': '/xndWFsBlClOJFRdhSt4NBwiPq2o.jpg',
  'cast_id': 7,
  'character': 'Forrest Gump',
  'credit_id': '52fe420ec3a36847f800074f',
  'order': 0},
 {'adult': False,
  'gender': 1,
  'id': 32,
  'known_for_department': 'Acting',
  'name': 'Robin Wright',
  'original_name': 'Robin Wright',
  'popularity': 22.708,
  'profile_path': '/d3rIv0y2p0jMsQ7ViR7O1606NZa.jpg',
  'cast_id': 8,
  'character': 'Jenny Curran',
  'credit_id': '52fe420ec3a36847f8000753',
  'order': 1},
 {'adult': False,
  'gender': 2,
  'id': 33,
  'known_for_department': 'Acting',
  'name': 'Gary Sinise',
  'original_name': 'Gary Sinise',
  'popularity': 18.864,
  'profile_path': '/ngYV91xYfCu0JNcSxJ4yQ7tzOna.jpg',
  'cast_id': 9,
  'character': 'Lieutenant Dan Taylor',
  'credit_id': '52fe420ec3a36847f8000757',
  'order': 2},
 {'adult': False,
  'gen

In [605]:
def find_actor(row):

    for entry in row:
        if entry['known_for_department'] == 'Acting':
            name = entry['name']
            return name

In [755]:
#making a lambda function
api_df['main_actor'] = api_df['cast_dict'].apply(lambda x: find_actor(x))

In [756]:
api_df

Unnamed: 0.1,Unnamed: 0,id,cast,crew,success,status_code,status_message,list_dict,director,cast_dict,main_actor
0,0,13.0,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know...",,,,"[{'adult': False, 'gender': 2, 'id': 37, 'know...",Robert Zemeckis,"[{'adult': False, 'gender': 2, 'id': 31, 'know...",Tom Hanks
1,1,278.0,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 153, 'kno...",Frank Darabont,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...",Tim Robbins
2,2,680.0,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...","[{'adult': False, 'gender': 2, 'id': 138, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 138, 'kno...",Quentin Tarantino,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...",John Travolta
3,3,274.0,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...","[{'adult': False, 'gender': 2, 'id': 117, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 117, 'kno...",Jonathan Demme,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...",Jodie Foster
4,4,603.0,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 123, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 123, 'kno...",Lilly Wachowski,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...",Keanu Reeves
...,...,...,...,...,...,...,...,...,...,...,...
9710,9710,10846.0,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...","[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",,,,"[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",Roberto Benigni,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...",Roberto Benigni
9711,9711,206296.0,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...","[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",Richard LaGravenese,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...",Anna Kendrick
9712,9712,244688.0,"[{'adult': False, 'gender': 2, 'id': 1279972, ...","[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",Josh Stolberg,"[{'adult': False, 'gender': 2, 'id': 1279972, ...",Sam Pancake
9713,9713,47533.0,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...","[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",Brian Levant,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...",Kate Melton


## Modeling Dataframe

Creating the final dataframe for modeling **content_df**

In [768]:
content_df = api_df.copy()

In [769]:
content_df

Unnamed: 0.1,Unnamed: 0,id,cast,crew,success,status_code,status_message,list_dict,director,cast_dict,main_actor
0,0,13.0,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know...",,,,"[{'adult': False, 'gender': 2, 'id': 37, 'know...",Robert Zemeckis,"[{'adult': False, 'gender': 2, 'id': 31, 'know...",Tom Hanks
1,1,278.0,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 153, 'kno...",Frank Darabont,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...",Tim Robbins
2,2,680.0,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...","[{'adult': False, 'gender': 2, 'id': 138, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 138, 'kno...",Quentin Tarantino,"[{'adult': False, 'gender': 2, 'id': 8891, 'kn...",John Travolta
3,3,274.0,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...","[{'adult': False, 'gender': 2, 'id': 117, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 117, 'kno...",Jonathan Demme,"[{'adult': False, 'gender': 1, 'id': 1038, 'kn...",Jodie Foster
4,4,603.0,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...","[{'adult': False, 'gender': 2, 'id': 123, 'kno...",,,,"[{'adult': False, 'gender': 2, 'id': 123, 'kno...",Lilly Wachowski,"[{'adult': False, 'gender': 2, 'id': 6384, 'kn...",Keanu Reeves
...,...,...,...,...,...,...,...,...,...,...,...
9710,9710,10846.0,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...","[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",,,,"[{'adult': False, 'gender': 1, 'id': 2186, 'kn...",Roberto Benigni,"[{'adult': False, 'gender': 2, 'id': 4818, 'kn...",Roberto Benigni
9711,9711,206296.0,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...","[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 2163, 'kn...",Richard LaGravenese,"[{'adult': False, 'gender': 1, 'id': 84223, 'k...",Anna Kendrick
9712,9712,244688.0,"[{'adult': False, 'gender': 2, 'id': 1279972, ...","[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 6213, 'kn...",Josh Stolberg,"[{'adult': False, 'gender': 2, 'id': 1279972, ...",Sam Pancake
9713,9713,47533.0,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...","[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",,,,"[{'adult': False, 'gender': 2, 'id': 1060, 'kn...",Brian Levant,"[{'adult': False, 'gender': 1, 'id': 89110, 'k...",Kate Melton


In [770]:
#dropping unneeded columns
content_df = content_df.drop(columns=["success", "status_code", "status_message",
                         "list_dict", "cast_dict", "cast", "crew", "Unnamed: 0"])

In [771]:
#renaming column for merging
content_df = content_df.rename(columns={"id": "tmdb_id"})

In [772]:
#checking columns
content_df.head()

Unnamed: 0,tmdb_id,director,main_actor
0,13.0,Robert Zemeckis,Tom Hanks
1,278.0,Frank Darabont,Tim Robbins
2,680.0,Quentin Tarantino,John Travolta
3,274.0,Jonathan Demme,Jodie Foster
4,603.0,Lilly Wachowski,Keanu Reeves


In [773]:
#merging api df with the final cleaned movie_df dataframe for final modeling
content_df = pd.merge(movie_df, content_df, on="tmdb_id", how="left")

In [774]:
content_df

Unnamed: 0,user_id,movie_id,rating,animation,children,romance,drama,crime,adventure,horror,comedy,sci-fi,war,thriller,mystery,film-noir,fantasy,musical,western,imax,documentary,action,year,title,imdb_id,tmdb_id,tag_1,tag_2,tag_3,tag_3.1,director,main_actor
0,1,50,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995-01-01,"Usual Suspects, The",114814,629.0,mindfuck,suspense,thriller,tricky,Bryan Singer,Gabriel Byrne
1,1,110,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1995-01-01,Braveheart,112573,197.0,beautiful scenery,epic,historical,inspirational,Mel Gibson,Mel Gibson
2,1,223,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-01-01,Clerks,109445,2292.0,cynical,hilarious,independent film,quirky,Kevin Smith,Brian O'Halloran
3,1,260,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1977-01-01,Star Wars: Episode IV - A New Hope,76759,11.0,classic,space action,action,sci-fi,George Lucas,Mark Hamill
4,1,296,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-01-01,Pulp Fiction,110912,680.0,good dialogue,great soundtrack,non-linear,cult film,Quentin Tarantino,John Travolta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13655,610,139385,4.5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,The Revenant,1663202,281957.0,leonardo DiCarpio,survival,tom hardy,visually appealing,Alejandro González Iñárritu,Leonardo DiCaprio
13656,610,158872,3.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-01-01,Sausage Party,1700841,223702.0,Crude humor,mindfuck,sarcasm,satire,Conrad Vernon,Seth Rogen
13657,610,164179,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-01-01,Arrival,2543164,329865.0,beautiful visuals,Cerebral,cinematography,good cinematography,Denis Villeneuve,Amy Adams
13658,610,168248,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017-01-01,John Wick: Chapter Two,4425200,324552.0,action,dark hero,gun tactics,hitman,Chad Stahelski,Keanu Reeves


In [776]:
content_df = content_df.dropna()

In [777]:
content_df

Unnamed: 0,user_id,movie_id,rating,animation,children,romance,drama,crime,adventure,horror,comedy,sci-fi,war,thriller,mystery,film-noir,fantasy,musical,western,imax,documentary,action,year,title,imdb_id,tmdb_id,tag_1,tag_2,tag_3,tag_3.1,director,main_actor
0,1,50,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995-01-01,"Usual Suspects, The",114814,629.0,mindfuck,suspense,thriller,tricky,Bryan Singer,Gabriel Byrne
1,1,110,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1995-01-01,Braveheart,112573,197.0,beautiful scenery,epic,historical,inspirational,Mel Gibson,Mel Gibson
2,1,223,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-01-01,Clerks,109445,2292.0,cynical,hilarious,independent film,quirky,Kevin Smith,Brian O'Halloran
3,1,260,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1977-01-01,Star Wars: Episode IV - A New Hope,76759,11.0,classic,space action,action,sci-fi,George Lucas,Mark Hamill
4,1,296,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994-01-01,Pulp Fiction,110912,680.0,good dialogue,great soundtrack,non-linear,cult film,Quentin Tarantino,John Travolta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13655,610,139385,4.5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,The Revenant,1663202,281957.0,leonardo DiCarpio,survival,tom hardy,visually appealing,Alejandro González Iñárritu,Leonardo DiCaprio
13656,610,158872,3.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-01-01,Sausage Party,1700841,223702.0,Crude humor,mindfuck,sarcasm,satire,Conrad Vernon,Seth Rogen
13657,610,164179,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-01-01,Arrival,2543164,329865.0,beautiful visuals,Cerebral,cinematography,good cinematography,Denis Villeneuve,Amy Adams
13658,610,168248,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017-01-01,John Wick: Chapter Two,4425200,324552.0,action,dark hero,gun tactics,hitman,Chad Stahelski,Keanu Reeves
