In [None]:
pip install tmdbsimple


In [1]:
import pandas as pd

In [2]:
import tmdbsimple as tmdb
import os
from dotenv import load_dotenv, find_dotenv


# Preparation

In [3]:
df = pd.read_json('/Users/beebo/code/rmelbardis/ObjectivelyFunny/raw_data/scraps_clean.json')
df.head()

Unnamed: 0,title,full_transcript,artist,show_name,year
0,Jim Gaffigan: Comedy Monster (2021) | Transcript,"Thank you! Thank you! Oh, my gosh. Thank you s...",Jim Gaffigan,Comedy Monster,2021.0
1,Louis C. K.: Sorry (2021) | Transcript,♪♪ [“Like a Rolling Stone” by Bob Dylan playin...,Louis C. K.,Sorry,2021.0
2,Drew Michael: Red Blue Green (2021) | Transcript,(EMOTIONAL MUSIC PLAYING)\n\n(MUSIC ENDS)\n\nD...,Drew Michael,Red Blue Green,2021.0
3,Mo Amer: Mohammed in Texas (2021) | Transcript,[quirky flute music playing]\n\n[single note p...,Mo Amer,Mohammed In Texas,2021.0
4,Dave Chappelle: The Closer (2021) | Transcript,[audience murmuring]\n\n[murmuring continues]\...,Dave Chappelle,The Closer,2021.0


In [4]:
df = df.drop(columns={'title', 'full_transcript'})
artist = df['artist']
df.head()

Unnamed: 0,artist,show_name,year
0,Jim Gaffigan,Comedy Monster,2021.0
1,Louis C. K.,Sorry,2021.0
2,Drew Michael,Red Blue Green,2021.0
3,Mo Amer,Mohammed In Texas,2021.0
4,Dave Chappelle,The Closer,2021.0


# Example of using tmdsimple

In [5]:
# point to api key in the .env file 
env_path = find_dotenv() 
load_dotenv(env_path)
tmdb.API_KEY = os.getenv('TMDB_KEY')

## skip this part just an illustration

### Step 1: searching for the actor's id

In [None]:
name = 'ADEL KARAM'

In [None]:
search = tmdb.Search()
response = search.person(query=name)
print([s['id'] for s in search.results])

### Step 2: using the id to search for info

In [None]:
identity = tmdb.People(53445)
response = identity.info()
print(identity.birthday, identity.gender, identity.place_of_birth)

handling multiple search results (usually the first one, but need to check)

In [None]:
response = search.person(query='ALI WONG')
[s['id'] for s in search.results]

In [None]:
identity = tmdb.People(591835)
response = identity.info()
print(identity.birthday, identity.gender, identity.place_of_birth)

## Putting things together

In [6]:
def extraction(names):
    '''
    Taking in a list of actor/actress' names
    Returning a dictionary containing their birthday, gender and birthplace
    '''
    
    name_dict = {'John Bishop': 216049, 'David Chappelle': 4169, 'Ryan Hamilton': 1595614
                , 'Emily Heller': 1475135, 'Brad Williams': 1231673, 'Neal': 543256
                , 'Overdoses – This Is Not Happening': 1238012}
    boys = ['Nate Bargatze', 'Doug Stanhope', 'Sam Jay', 'Andy Woodhull',
       'Brad Williams', 'Kavin Jay', 'Pablo Francisco', 'Michael Che']
    girls = ['Emily Heller', 'Gina Yashere']
    
    birthday = []
    gender = []
    birthplace = []
    
    check_list = []
    
    search = tmdb.Search()
    for name in names:
        if name in name_dict.keys():
            id_list = [name_dict[name]]
        else:
            response = search.person(query=name)
            id_list = [s['id'] for s in search.results]
        # if there is no id result
        if len(id_list) < 1:
            print(f'No {name} found. Please check the name.')
            birthday.append(None)
            gender.append(None)
            birthplace.append(None)
        # if there is more than one result
        else:
            if len(id_list) > 1 and name not in check_list:
                check_list.append(name)
#                 print(f'Multiple people found for {name}.')
            first_id = id_list[0]
            identity = tmdb.People(first_id)
            response = identity.info()
            
            birthday.append(identity.birthday)
            if name not in girls and name not in boys:
                gender.append(identity.gender)
            else:
                if name in girls:
                    gender.append(2)
                else:
                    gender.append(1)
            birthplace.append(identity.place_of_birth)
    return {'name': names, 'birthday': birthday, 'gender': gender, 'birthplace': birthplace}

In [7]:
new_dict = extraction([a for a in artist])

df['artist_birthday'] = pd.Series(new_dict['birthday'])
df['artist_gender'] = pd.Series(new_dict['gender'])

df.head()

Unnamed: 0,artist,show_name,year,artist_birthday,artist_gender
0,Jim Gaffigan,Comedy Monster,2021.0,1966-07-07,2
1,Louis C. K.,Sorry,2021.0,1967-09-12,2
2,Drew Michael,Red Blue Green,2021.0,,2
3,Mo Amer,Mohammed In Texas,2021.0,1981-07-24,2
4,Dave Chappelle,The Closer,2021.0,1973-08-24,2


In [8]:
df.isnull().sum()

artist              0
show_name           0
year                8
artist_birthday    28
artist_gender       0
dtype: int64

In [10]:
age_dict = {'Drew Michael': 1988,
 'Kathleen Madigan': 1965,
 'Phil Wang': 1990,
 'Joe List': 1982,
 'Nate Bargatze': 1979,
 'Sam Morril': 1986,
 'Sam Jay': 1982,
 'Roy Wood Jr.': 1978,
 'Chris Gethard': 1980,
 'Dave Allen': 1936,
 'Andy Woodhull': 1980,
 'Emily Heller': 1985,
 'Gina Yashere': 1974,
 'W. Kamau Bell': 1973,
 'Joe Mande': 1983,
 'Brad Williams': 1984,
 'Deray Davis': 1982,
 'Kavin Jay': 1980,
 'Ari Shaffir': 1974,
 'Fahim Anwar': 1984,
 'Jen Kirkman': 1974,
 'Neal Brennan': 1973,
 'Michael Che': 1983}

In [12]:
# filling in missing birthdays
for k,v in age_dict.items():
    df['artist_birthday'][df['artist'] == k] = str(v)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['artist_birthday'][df['artist'] == k] = str(v)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['artist_birthday'][df['artist'] == k] = str(v)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['artist_birthday'][df['artist'] == k] = str(v)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['artist_birthda

In [14]:
df['artist_birthday'] = df['artist_birthday'].str[:4]
df.head(20)

Unnamed: 0,artist,show_name,year,artist_birthday,artist_gender
0,Jim Gaffigan,Comedy Monster,2021.0,1966,2
1,Louis C. K.,Sorry,2021.0,1967,2
2,Drew Michael,Red Blue Green,2021.0,1988,2
3,Mo Amer,Mohammed In Texas,2021.0,1981,2
4,Dave Chappelle,The Closer,2021.0,1973,2
5,Kathleen Madigan,Bothering Jesus,2016.0,1965,1
6,Kathleen Madigan,Madigan Again,2013.0,1965,1
7,Phil Wang,Philly Philly Wang Wang,2021.0,1990,2
8,Dave Chappelle,8:46,2020.0,1973,2
9,Tom Papa,You’re Doing Great!,2020.0,1968,2


In [18]:
df['age_then'] = df.year - df.artist_birthday.astype(float)
df.head()

Unnamed: 0,artist,show_name,year,artist_birthday,artist_gender,age_then
0,Jim Gaffigan,Comedy Monster,2021.0,1966,2,55.0
1,Louis C. K.,Sorry,2021.0,1967,2,54.0
2,Drew Michael,Red Blue Green,2021.0,1988,2,33.0
3,Mo Amer,Mohammed In Texas,2021.0,1981,2,40.0
4,Dave Chappelle,The Closer,2021.0,1973,2,48.0


In [21]:
df[df.age_then.notnull()].sort_values('age_then')

Unnamed: 0,artist,show_name,year,artist_birthday,artist_gender,age_then
21,Bo Burnham,"Words, Words, Words",2010.0,1990,2,20.0
347,Eddie Murphy,Delirious,1983.0,1961,2,22.0
74,Pete Davidson,Smd,2016.0,1993,2,23.0
301,Bo Burnham,What,2013.0,1990,2,23.0
83,Kevin Bridges,The Story So Far… Live In Glasgow,2010.0,1986,2,24.0
...,...,...,...,...,...,...
96,George Carlin,Dumb Americans,2006.0,1937,2,69.0
355,George Carlin,Life Is Worth Losing,2006.0,1937,2,69.0
356,George Carlin,It’s Bad For Ya,2008.0,1937,2,71.0
249,Paul Mooney,A Piece Of My Mind Godbless America,2014.0,1941,2,73.0


In [22]:
pd.set_option("display.max_rows", None)
df.sort_values('artist')

Unnamed: 0,artist,show_name,year,artist_birthday,artist_gender,age_then
144,Larry The Cable Guy,We’ve Been Thinking,2016.0,1963,2,53.0
109,Adam Devine,Best Time Of Our Lives,2019.0,1983,2,36.0
145,Adam Sandler,100% Fresh,2018.0,1966,2,52.0
195,Adel Karam,Live From Beirut,2018.0,1972,2,46.0
273,Al Madrigal,Why Is The Rabbit Crying?,2013.0,1971,2,42.0
186,Ali Wong,Hard Knock Wife,2018.0,1982,1,36.0
278,Ali Wong,Baby Cobra,2016.0,1982,1,34.0
76,Amanda Seales,I Be Knowin’,2019.0,1981,1,38.0
321,Amy Schumer,The Leather Special,2017.0,1981,1,36.0
287,Amy Schumer,Live At The Apollo,2015.0,1981,1,34.0


In [31]:
df.loc[105]['artist'] = 'Tom Segura'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[105]['artist'] = 'Tom Segura'
