In [53]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_full_text(actor_name):
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'format': 'json',
        'page': actor_name,
        'prop': 'text',
        'redirects': '',
    }
    response = requests.get(url, params=params).json()
    raw_html = response['parse']['text']['*']
    return raw_html

def get_wiki_image_link(raw_html):
    ''''''
    soup = BeautifulSoup(raw_html, "html.parser")

    raw_image_link = soup.select_one("td", class_="infobox-image").select_one('img').get('src')

    wiki_image_link = f"https:{raw_image_link}"

    return wiki_image_link

def get_wiki_image_link_alt(raw_html):
    ''''''
    soup = BeautifulSoup(raw_html, "html.parser")

    infobox_image_tag = soup.find_all(lambda tag: tag.name == 'td' and 
                                   tag.get('class') == ['infobox-image'])[0]

    raw_image_link = infobox_image_tag.select_one('img').get('src')

    wiki_image_link = f"https:{raw_image_link}"

    return wiki_image_link

## Test

In [31]:
brad_pitt_full_text = get_full_text('Brad Pitt')

get_wiki_image_link(brad_pitt_full_text)

'https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Brad_Pitt_2019_by_Glenn_Francis.jpg/220px-Brad_Pitt_2019_by_Glenn_Francis.jpg'

In [41]:
full_text = get_full_text('Jet Li')

In [43]:
soup = BeautifulSoup(full_text, "html.parser")

In [52]:
# soup.select("td", class_="infobox-image")

soup.find_all(lambda tag: tag.name == 'td' and 
                                   tag.get('class') == ['infobox-image'])[0].select_one('img').get('src')

'//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Jet_Li_2009_%28cropped%29.jpg/220px-Jet_Li_2009_%28cropped%29.jpg'

In [6]:
actors_filtered = pd.read_csv("actors_filtered.csv",usecols=['nconst','primaryName']).drop_duplicates()
actors_filtered.head(3)

Unnamed: 0,nconst,primaryName
0,nm0000685,Jon Voight
1,nm1221047,Keegan-Michael Key
14,nm0004335,Hrithik Roshan


In [13]:
image_link_list = []
for idx, row in actors_filtered.iterrows():
    nconst = row['nconst']
    try:
        full_html_text = get_full_text(row['primaryName'])

        wiki_image_link = get_wiki_image_link(full_html_text)
    except:
        wiki_image_link = ''
    
    image_link_list.append((nconst, wiki_image_link))

In [14]:
image_link_list_df = pd.DataFrame(image_link_list, columns=['nconst','wiki_image_link'])

In [15]:
image_link_list_df.to_csv("image_link_list.csv", index=False)

In [17]:
image_link_list_w_name = image_link_list_df.merge(actors_filtered, on='nconst')
image_link_list_w_name.head(3)

Unnamed: 0,nconst,wiki_image_link,primaryName
0,nm0000685,https://upload.wikimedia.org/wikipedia/commons...,Jon Voight
1,nm1221047,https://upload.wikimedia.org/wikipedia/commons...,Keegan-Michael Key
2,nm0004335,https://upload.wikimedia.org/wikipedia/commons...,Hrithik Roshan


In [33]:
missing_image_link_list = []
for idx, row in image_link_list_w_name[image_link_list_w_name['wiki_image_link']==''].iterrows():
    nconst = row['nconst']
    try:
        full_html_text = get_full_text(f"{row['primaryName']} (actor)")

        wiki_image_link = get_wiki_image_link(full_html_text)
    except:
        wiki_image_link = ''
    
    missing_image_link_list.append((nconst, wiki_image_link))

In [39]:
missing_image_link_list_df = pd.DataFrame(missing_image_link_list, columns=['nconst','wiki_image_link']).merge(actors_filtered, on='nconst')
missing_image_link_list_df

Unnamed: 0,nconst,wiki_image_link,primaryName
0,nm0504897,,Tony Leung Chiu-wai
1,nm0898546,,Pruitt Taylor Vince
2,nm0001472,,Jet Li
3,nm0947447,,Donnie Yen
4,nm0000699,,Michael Wincott
5,nm0001601,,David Paymer
6,nm0001334,https://upload.wikimedia.org/wikipedia/commons...,John Heard
7,nm0947338,,Anton Yelchin
8,nm0262635,https://upload.wikimedia.org/wikipedia/commons...,Chris Evans
9,nm0004936,https://upload.wikimedia.org/wikipedia/commons...,Ben Foster


In [54]:
missing2_image_link_list = []
for idx, row in missing_image_link_list_df[missing_image_link_list_df['wiki_image_link']==''].iterrows():
    nconst = row['nconst']
    try:
        full_html_text = get_full_text(f"{row['primaryName']}")

        wiki_image_link = get_wiki_image_link_alt(full_html_text)
    except:
        wiki_image_link = ''
    
    missing2_image_link_list.append((nconst, wiki_image_link))

In [56]:
missing2_image_link_list_df = pd.DataFrame(missing2_image_link_list, columns=['nconst','wiki_image_link']).merge(actors_filtered, on='nconst')
missing2_image_link_list_df

Unnamed: 0,nconst,wiki_image_link,primaryName
0,nm0504897,https://upload.wikimedia.org/wikipedia/commons...,Tony Leung Chiu-wai
1,nm0898546,,Pruitt Taylor Vince
2,nm0001472,https://upload.wikimedia.org/wikipedia/commons...,Jet Li
3,nm0947447,https://upload.wikimedia.org/wikipedia/commons...,Donnie Yen
4,nm0000699,,Michael Wincott
5,nm0001601,,David Paymer
6,nm0947338,https://upload.wikimedia.org/wikipedia/commons...,Anton Yelchin


In [62]:
image_link_list_w_name_all = pd.concat([image_link_list_w_name[image_link_list_w_name['wiki_image_link']!='']
                                    ,missing_image_link_list_df[missing_image_link_list_df['wiki_image_link']!='']
                                    ,missing2_image_link_list_df]).reset_index(drop=True)

image_link_list_w_name_all.head(3)

Unnamed: 0,nconst,wiki_image_link,primaryName
0,nm0000685,https://upload.wikimedia.org/wikipedia/commons...,Jon Voight
1,nm1221047,https://upload.wikimedia.org/wikipedia/commons...,Keegan-Michael Key
2,nm0004335,https://upload.wikimedia.org/wikipedia/commons...,Hrithik Roshan


In [64]:
fill_missing_image = "https://upload.wikimedia.org/wikipedia/commons/2/25/Icon-round-Question_mark.jpg"

image_link_list_w_name_all['wiki_image_link'] = image_link_list_w_name_all['wiki_image_link'].apply(lambda x: x if x!='' else fill_missing_image)

In [65]:
image_link_list_w_name_all.head(3)

Unnamed: 0,nconst,wiki_image_link,primaryName
0,nm0000685,https://upload.wikimedia.org/wikipedia/commons...,Jon Voight
1,nm1221047,https://upload.wikimedia.org/wikipedia/commons...,Keegan-Michael Key
2,nm0004335,https://upload.wikimedia.org/wikipedia/commons...,Hrithik Roshan


In [66]:
image_link_list_w_name_all.to_csv("image_link_list_w_name_all.csv", index=False)