# Content Based Recommendation Model

By: Anat Jacobson and Stephanie Ciaccia

---

## Overview

Building a content based recommendation system

# Business Problem

Add business problem here

# Data Understanding

In [34]:
#importing packages

In [35]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import datetime

import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
%matplotlib inline
import plotly.express as px
from matplotlib.ticker import StrMethodFormatter

from collections import Counter
from nltk.corpus import stopwords

from IPython.display import display

Function for printing long lists

In [36]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

Setting display option to view all columns

In [None]:
pd.set_option('display.max_columns', None)

### Data Source : Grouplens

The data used in our analysis was obtained from [Movie Lens]([http://movielens.org]), which consists of user ratings on movies using a 5-star scale. The dataset was collected between March 29, 1996, and September 24, 2018, and includes ratings from 610 users. In total, the dataset contains 100836 ratings and 3683 tags across 9742 movies.

Files from the dataset inclue `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`.

### Import CSVs, Merge, and Clean all Data Needed
#### 4 Datasets imported:

- Movies:
- Links:
- Ratings:
- Tags:

In [355]:
#importing datasets
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv")
links_df = pd.read_csv("data/links.csv")
tags_df = pd.read_csv("data/tags.csv")

In [356]:
#checking the shape of all dfs
print('movies_df: ', movies_df.shape)
print('links_df: ',links_df.shape)
print('ratings_df: ',ratings_df.shape)
print('tags_df: ',tags_df.shape)

movies_df:  (9742, 3)
links_df:  (9742, 3)
ratings_df:  (100836, 4)
tags_df:  (3683, 4)


In [357]:
tag_group_df = tags_df.groupby(by="movieId").agg({"tag":" ".join})

In [358]:
#we would want to run a vectorizer and clean data
tag_group_df.value_counts()

tag                                                               
In Netflix queue                                                      109
Disney                                                                 15
aliens                                                                 10
Shakespeare                                                            10
Stephen King                                                            9
                                                                     ... 
death penalty Nun                                                       1
death penalty John Grisham                                              1
deafness                                                                1
darth vader luke skywalker space opera                                  1
"artsy" atmospheric gritty hallucinatory surreal visually stunning      1
Length: 1038, dtype: int64

In [359]:
#splitting genre into multiple columns
genre_split = pd.DataFrame(movies_df.genres.str.split('|',6).tolist(),
                                 columns = ['genre_1','genre_2', 'genre_3',
                                            'genre_4', 'genre_5', 'genre_6', 'genre_7'])

#merging dataframes
movies_df = pd.merge(pd.DataFrame(movies_df), pd.DataFrame(genre_split), left_index=True, right_index=True)

#dropping genre columns
movies_df.drop(columns="genres", inplace=True)

In [360]:
#looking at unique genre names
genre_split['genre_2'].unique()

array(['Animation', 'Children', 'Romance', 'Drama', None, 'Crime',
       'Adventure', 'Horror', 'Comedy', 'Sci-Fi', 'War', 'Thriller',
       'Mystery', 'Film-Noir', 'Fantasy', 'Musical', 'Western', 'IMAX',
       'Documentary'], dtype=object)

In [361]:
#finding unique genres and saving to list to one hot encode genres
genre_list = genre_split['genre_2'].unique().tolist()

#appending additional genres not in this column
genre_list.append('Fantasy')
genre_list.append('Action')

#removing none value in list
genre_list.remove(None)

In [362]:
genre_list

['Animation',
 'Children',
 'Romance',
 'Drama',
 'Crime',
 'Adventure',
 'Horror',
 'Comedy',
 'Sci-Fi',
 'War',
 'Thriller',
 'Mystery',
 'Film-Noir',
 'Fantasy',
 'Musical',
 'Western',
 'IMAX',
 'Documentary',
 'Fantasy',
 'Action']

In [363]:
# looping over genres in genre_list
for genre in genre_list:
    # create new columns with genre names in genre_list
    new_col = genre
    
    # make new column with 0 for all values (row will be made up of bool values)
    movies_df[new_col] = 0
    
    # looping over columns and rows and check if value is equal to genre in list - if it's equal we add value to 1
    for col in movies_df.columns:
        for i in range(len(movies_df)):
            if movies_df[col][i] == genre:
                movies_df[new_col][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df[new_col][i] = 1


In [364]:
#sanity check of for loop to make sure it worked correctly
movies_df.head(10)

Unnamed: 0,movieId,title,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,Animation,Children,Romance,Drama,Crime,Adventure,Horror,Comedy,Sci-Fi,War,Thriller,Mystery,Film-Noir,Fantasy,Musical,Western,IMAX,Documentary,Action
0,1,Toy Story (1995),Adventure,Animation,Children,Comedy,Fantasy,,,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
1,2,Jumanji (1995),Adventure,Children,Fantasy,,,,,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy,Romance,,,,,,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy,Drama,Romance,,,,,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,,,,,,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,6,Heat (1995),Action,Crime,Thriller,,,,,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
6,7,Sabrina (1995),Comedy,Romance,,,,,,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,8,Tom and Huck (1995),Adventure,Children,,,,,,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death (1995),Action,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,10,GoldenEye (1995),Action,Adventure,Thriller,,,,,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1


In [368]:
# dropping original genre columns
movies_df = movies_df.drop(columns=['genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6', 'genre_7'])

In [371]:
#making year and title columns separate

#making year column
movies_df['year'] = movies_df['title'].str[-6:]

#making real_title column
movies_df['real_title'] = movies_df['title'].str[:-6]

#dropping original title column
movies_df = movies_df.drop(columns="title")

#removing parethensis from year
movies_df['year'] = movies_df['year'].str.replace('[^a-zA-Z0-9 ]', '', regex=True)

#removing whitespaces
movies_df['year'] = movies_df['year'].str.strip()

In [370]:
#checking year values
movies_df['year'].unique()

array(['1995', '1994', '1996', '1976', '1992', '1967', '1993', '1964',
       '1977', '1965', '1982', '1990', '1991', '1989', '1937', '1940',
       '1969', '1981', '1973', '1970', '1955', '1959', '1968', '1988',
       '1997', '1972', '1943', '1952', '1951', '1957', '1961', '1958',
       '1954', '1934', '1944', '1960', '1963', '1942', '1941', '1953',
       '1939', '1950', '1946', '1945', '1938', '1947', '1935', '1936',
       '1956', '1949', '1932', '1975', '1974', '1971', '1979', '1987',
       '1986', '1980', '1978', '1985', '1966', '1962', '1983', '1984',
       '1948', '1933', '1931', '1922', '1998', '1929', '1930', '1927',
       '1928', '1999', '2000', '1926', '1919', '1921', '1925', '1923',
       '2001', '2002', '2003', '1920', '1915', '1924', '2004', '1916',
       '1917', '2005', '2006', '1902', 'ylon 5', '1903', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '1908', 'er One', 'a Road', 'Watson', 'nimals',
    

In [372]:
year_vals = ['er One', 'a Road', 'Watson', 'nimals','terson', 'nlight',
             'The OA', 'Cosmos', 'd Baby', 'Iron 2','Mirror', 'ylon 5']

movies_df = movies_df[~movies_df['year'].isin(year_vals)]

Inspecting links - The additional ids will be helpful so we can pull additional movie information from TMBD's api for the content based model.

In [373]:
links_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


Inspecting tags

In [374]:
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


### Merging files into one dataframe for cleaning

In [375]:
#merging ratings, movies, and links
final_movies = pd.merge(ratings_df, movies_df, on="movieId", how="left")
final_movies = pd.merge(final_movies, links_df, on="movieId", how="left")

In [376]:
final_movies

Unnamed: 0,userId,movieId,rating,timestamp,Animation,Children,Romance,Drama,Crime,Adventure,Horror,Comedy,Sci-Fi,War,Thriller,Mystery,Film-Noir,Fantasy,Musical,Western,IMAX,Documentary,Action,year,real_title,imdbId,tmdbId
0,1,1,4.0,964982703,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1995,Toy Story,114709,862.0
1,1,3,4.0,964981247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,Grumpier Old Men,113228,15602.0
2,1,6,4.0,964982224,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1995,Heat,113277,949.0
3,1,47,5.0,964983815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,Seven (a.k.a. Se7en),114369,807.0
4,1,50,5.0,964982931,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995,"Usual Suspects, The",114814,629.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,Split,4972582,381288.0
100832,610,168248,5.0,1493850091,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017,John Wick: Chapter Two,4425200,324552.0
100833,610,168250,5.0,1494273047,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017,Get Out,5052448,419430.0
100834,610,168252,5.0,1493846352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2017,Logan,3315342,263115.0


In [377]:
#making column names lowercase
final_movies.columns = final_movies.columns.str.lower()

# Data Preparation

To being, we will need to remove null values and split the genre column so this can be One Hot Encoded.

In [378]:
#saving as new dataframe for cleaning
movie_df = final_movies

In [379]:
#checking for null values
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 27 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userid       100836 non-null  int64  
 1   movieid      100836 non-null  int64  
 2   rating       100836 non-null  float64
 3   timestamp    100836 non-null  int64  
 4   animation    100819 non-null  float64
 5   children     100819 non-null  float64
 6   romance      100819 non-null  float64
 7   drama        100819 non-null  float64
 8   crime        100819 non-null  float64
 9   adventure    100819 non-null  float64
 10  horror       100819 non-null  float64
 11  comedy       100819 non-null  float64
 12  sci-fi       100819 non-null  float64
 13  war          100819 non-null  float64
 14  thriller     100819 non-null  float64
 15  mystery      100819 non-null  float64
 16  film-noir    100819 non-null  float64
 17  fantasy      100819 non-null  float64
 18  musical      100819 non-

In [380]:
#dropping genre columns that are mostly empty
movie_df = movie_df.drop(columns=["timestamp"])

In [381]:
#checking nulls
movie_df.isna().sum()

userid          0
movieid         0
rating          0
animation      17
children       17
romance        17
drama          17
crime          17
adventure      17
horror         17
comedy         17
sci-fi         17
war            17
thriller       17
mystery        17
film-noir      17
fantasy        17
musical        17
western        17
imax           17
documentary    17
action         17
year           17
real_title     17
imdbid          0
tmdbid         13
dtype: int64

In [382]:
#making year datetime
movie_df['year'].unique()

array(['1995', '1996', '1994', '1977', '1993', '1990', '1989', '1991',
       '1940', '1939', '1941', '1938', '1947', '1975', '1968', '1945',
       '1963', '1971', '1951', '1979', '1992', '1986', '1982', '1980',
       '1987', '1981', '1983', '1960', '1952', '1984', '1933', '1985',
       '1974', '1922', '1997', '1998', '1930', '1976', '1942', '1967',
       '1959', '1946', '1978', '1973', '1988', '1999', '1931', '1964',
       '1962', '1965', '1969', '2000', '1970', '2003', '2004', '2006',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '1955', '2002', '1961', '1954', '1957', '1943', '1956', '1966',
       '2001', '1937', '1972', '2005', '1950', '2007', '2016', '1926',
       '1944', '1949', '1936', '2017', '1958', '1935', '1927', '1953',
       '2018', '1923', '1902', '1920', '1948', '1928', nan, '1934',
       '1916', '1908', '1932', '1921', '1925', '1929', '1917', '1915',
       '1924', '1903', '1919'], dtype=object)

In [383]:
movie_df.dropna(inplace=True)

In [384]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100808 entries, 0 to 100835
Data columns (total 26 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userid       100808 non-null  int64  
 1   movieid      100808 non-null  int64  
 2   rating       100808 non-null  float64
 3   animation    100808 non-null  float64
 4   children     100808 non-null  float64
 5   romance      100808 non-null  float64
 6   drama        100808 non-null  float64
 7   crime        100808 non-null  float64
 8   adventure    100808 non-null  float64
 9   horror       100808 non-null  float64
 10  comedy       100808 non-null  float64
 11  sci-fi       100808 non-null  float64
 12  war          100808 non-null  float64
 13  thriller     100808 non-null  float64
 14  mystery      100808 non-null  float64
 15  film-noir    100808 non-null  float64
 16  fantasy      100808 non-null  float64
 17  musical      100808 non-null  float64
 18  western      100808 non-

In [385]:
movie_df['year'] = movie_df['year'].astype(int)

In [386]:
movie_df['year'].unique()

array([1995, 1996, 1994, 1977, 1993, 1990, 1989, 1991, 1940, 1939, 1941,
       1938, 1947, 1975, 1968, 1945, 1963, 1971, 1951, 1979, 1992, 1986,
       1982, 1980, 1987, 1981, 1983, 1960, 1952, 1984, 1933, 1985, 1974,
       1922, 1997, 1998, 1930, 1976, 1942, 1967, 1959, 1946, 1978, 1973,
       1988, 1999, 1931, 1964, 1962, 1965, 1969, 2000, 1970, 2003, 2004,
       2006, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 1955, 2002,
       1961, 1954, 1957, 1943, 1956, 1966, 2001, 1937, 1972, 2005, 1950,
       2007, 2016, 1926, 1944, 1949, 1936, 2017, 1958, 1935, 1927, 1953,
       2018, 1923, 1902, 1920, 1948, 1928, 1934, 1916, 1908, 1932, 1921,
       1925, 1929, 1917, 1915, 1924, 1903, 1919])

In [387]:
#making year datetime
movie_df['year'] = pd.to_datetime(movie_df['year'], format='%Y')

In [388]:
#confirming null values and datatypes
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100808 entries, 0 to 100835
Data columns (total 26 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   userid       100808 non-null  int64         
 1   movieid      100808 non-null  int64         
 2   rating       100808 non-null  float64       
 3   animation    100808 non-null  float64       
 4   children     100808 non-null  float64       
 5   romance      100808 non-null  float64       
 6   drama        100808 non-null  float64       
 7   crime        100808 non-null  float64       
 8   adventure    100808 non-null  float64       
 9   horror       100808 non-null  float64       
 10  comedy       100808 non-null  float64       
 11  sci-fi       100808 non-null  float64       
 12  war          100808 non-null  float64       
 13  thriller     100808 non-null  float64       
 14  mystery      100808 non-null  float64       
 15  film-noir    100808 non-null  floa

In [391]:
#renaming columns for modeling

name = ["userid", "movieid", "real_title", "tmdbid", "imdbid"]
new_name = ["user_id", "movie_id", "title","tmdb_id", "imdb_id"]

movie_df = movie_df.rename(columns=dict(zip(name, new_name)))

#### Final Cleaned Dataset (movie_df)
Features are:
- Genre
- Rating
- Tag

In [393]:
#reviewing final cleaned dataset
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100808 entries, 0 to 100835
Data columns (total 26 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   user_id      100808 non-null  int64         
 1   movie_id     100808 non-null  int64         
 2   rating       100808 non-null  float64       
 3   animation    100808 non-null  float64       
 4   children     100808 non-null  float64       
 5   romance      100808 non-null  float64       
 6   drama        100808 non-null  float64       
 7   crime        100808 non-null  float64       
 8   adventure    100808 non-null  float64       
 9   horror       100808 non-null  float64       
 10  comedy       100808 non-null  float64       
 11  sci-fi       100808 non-null  float64       
 12  war          100808 non-null  float64       
 13  thriller     100808 non-null  float64       
 14  mystery      100808 non-null  float64       
 15  film-noir    100808 non-null  floa

In [394]:
#exploring that dataset
movie_df.describe()

Unnamed: 0,user_id,movie_id,rating,animation,children,romance,drama,crime,adventure,horror,comedy,sci-fi,war,thriller,mystery,film-noir,fantasy,musical,western,imax,documentary,action,imdb_id,tmdb_id
count,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0,100808.0
mean,326.130634,19414.983126,3.501602,0.06932,0.091342,0.179777,0.41588,0.165453,0.239634,0.072296,0.38737,0.170929,0.048201,0.262281,0.076095,0.00863,0.117391,0.041048,0.019125,0.041118,0.012082,0.303855,351170.7,20054.15716
std,182.619878,35494.958184,1.042409,0.253999,0.288096,0.384004,0.492875,0.371591,0.426862,0.258978,0.487152,0.376449,0.214191,0.439877,0.265152,0.092498,0.321888,0.198403,0.136967,0.198564,0.109254,0.459923,620728.9,53102.944594
min,1.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,417.0,2.0
25%,177.0,1199.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99685.0,712.0
50%,325.0,2991.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118763.5,6957.0
75%,477.0,8044.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,314979.0,11635.0
max,610.0,193609.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8391976.0,525662.0


## Data Source #2 - TMDB API

The API pull includes credit (crew and cast) information for all movies.

Documentation - https://developers.themoviedb.org/3/movies/get-movie-credits

In [None]:
# original api pull
# import requests
# import pandas as pd

## this is a list of all movie id's from the original dataframe
# movie_list_final = []

# for id_num in tmbd_list:
    
#     response = requests.get(''f"https://api.themoviedb.org/3/movie/{id_num}/credits?api_key=a88e9a2c082e4caa0a1447a81f7e743b&language=en-US"'')

#     data = response.json()
    
#     #appending director name to list
#     movie_list_final.append(data)

In [412]:
#importing api pull with additional data

api_df = pd.read_csv("data/tmdb_pull_cast_crew.csv")

In [420]:
#inspecting keys from api pill
api_df.keys()

Index(['Unnamed: 0', 'id', 'cast', 'crew', 'success', 'status_code',
       'status_message'],
      dtype='object')

In [427]:
data = api_df['crew']

In [431]:
test = pd.DataFrame.from_dict(data)

In [433]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9715 entries, 0 to 9714
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   crew    9609 non-null   object
dtypes: object(1)
memory usage: 76.0+ KB


In [436]:
# we need to figure out how to break this out based on key
test

Unnamed: 0,crew
0,"[{'adult': False, 'gender': 2, 'id': 37, 'know..."
1,"[{'adult': False, 'gender': 2, 'id': 153, 'kno..."
2,"[{'adult': False, 'gender': 2, 'id': 138, 'kno..."
3,"[{'adult': False, 'gender': 2, 'id': 117, 'kno..."
4,"[{'adult': False, 'gender': 2, 'id': 123, 'kno..."
...,...
9710,"[{'adult': False, 'gender': 1, 'id': 2186, 'kn..."
9711,"[{'adult': False, 'gender': 2, 'id': 2163, 'kn..."
9712,"[{'adult': False, 'gender': 2, 'id': 6213, 'kn..."
9713,"[{'adult': False, 'gender': 2, 'id': 1060, 'kn..."


In [435]:
#need to figure out how to break this out and then pull the direction values
#director_rows = test[test['known_for_department'] == 'Directing']