# Load Libraries

In [1]:
from PIL import Image
import requests
from bs4 import BeautifulSoup
import os
import numpy as np
import pandas as pd
import nltk
# Download the NLTK corpora used below: the stop-word list, then the English word list.
for corpus_name in ('stopwords', 'words'):
    nltk.download(corpus_name)
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shrijeet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/shrijeet/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Crawl Indian Movie Celebrities' Biographies and Images From IMDb

In [2]:

# Base URL that the relative profile links found on the list pages are appended to.
site = 'https://www.imdb.com'

# Seconds to wait on any single HTTP request, so a stalled connection cannot hang the cell.
request_timeout = 30

# Folder (relative to the working directory) that receives the downloaded pictures.
# Created once up front instead of being re-checked for every celebrity.
dirs = 'Images' #os.path.abspath('Images')
os.makedirs(dirs, exist_ok=True)

# One dict per celebrity. The DataFrame is built once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
celebrity_rows = []

for i in range(1, 3):

    url = "https://www.imdb.com/list/ls025929404/?sort=list_order,asc&mode=detail&page="+str(i)

    print("URL",url)

    response = requests.get(url, timeout=request_timeout)
    # Fail here with the HTTP status rather than later on a missing page element.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find(class_="lister list detail sub-list").find_all(["a"]):

        href = link.get("href")

        # <a> tags without an href (or with an empty one) cannot be profile links.
        # The previous guard joined its two comparisons with `or`, which is always
        # true, so a missing href reached .count() and raised AttributeError.
        if not href:
            continue

        # Only hrefs with exactly three '/' whose second '/'-separated piece is
        # 'name' are followed; "bio" is appended to reach the biography page.
        href_parts = href.split('/')
        if len(href_parts) != 4 or href_parts[1] != 'name':
            continue

        res = requests.get(site+href+"bio", timeout=request_timeout)
        res.raise_for_status()

        sp = BeautifulSoup(res.text, 'html.parser')

        name = sp.find(class_="parent").find_all(["a"])[0].text.strip()

        image_url = sp.find(class_="subpage_title_block name-subpage-header-block").find_all(["img"])[0].get("src")

        # First paragraph of the biography block, flattened onto a single line.
        bio = sp.find(class_="soda odd").find_all(["p"])[0].text.replace('\n', ' ').strip()

        # File name is derived from the displayed name: spaces -> underscores, lower-cased.
        img_path = dirs+'/'+name.replace(" ", "_").lower()+'.jpg'

        # stream=True lets PIL read straight from the raw response body; the
        # context manager releases the connection once the picture is saved.
        with requests.get(image_url, stream=True, timeout=request_timeout) as image_response:
            image_response.raise_for_status()
            img = Image.open(image_response.raw)
            img.save(img_path)

        celebrity_rows.append({'Name' : name , 'Image' : img_path, 'Biography' : bio})

# Passing `columns` keeps the three expected columns even when nothing was scraped.
df_celebrities = pd.DataFrame(celebrity_rows, columns = ['Name' , 'Image', 'Biography'])


URL https://www.imdb.com/list/ls025929404/?sort=list_order,asc&mode=detail&page=1
URL https://www.imdb.com/list/ls025929404/?sort=list_order,asc&mode=detail&page=2


In [3]:
# Preview the first five scraped rows.
df_celebrities.head(n=5)

Unnamed: 0,Name,Image,Biography
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s..."


# Save Data In CSV Format

In [4]:
# Write the scraped rows to disk; index=False leaves the row index out of the file.
df_celebrities.to_csv(path_or_buf='celebrities_data.csv', index=False)

# Save Data In SQLite3 Database

In [5]:
import sqlite3

# Store the scraped rows in the 'Celebrities_Bio' table, replacing any existing table
# of that name. The connection is closed even if the write fails, so the database
# file handle is not left open for the rest of the kernel session.
conn = sqlite3.connect('DBCelebrities.sqlite3')
try:
    df_celebrities.to_sql('Celebrities_Bio', conn, if_exists='replace', index=False)
finally:
    conn.close()

# Load Data From CSV

In [6]:
# Reload the saved rows from the CSV file and preview the first five.
df_celebrities = pd.read_csv(filepath_or_buffer='celebrities_data.csv')
df_celebrities.head(n=5)

Unnamed: 0,Name,Image,Biography
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s..."


# Remove Special Characters and Numbers From Biography

In [7]:
df_celebrities_1 = df_celebrities.copy()

# Drop every run of characters that is not an ASCII letter or a space, then
# lower-case and trim the result.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text uncleaned.
df_celebrities_1['Cleaned_Bio'] = (
    df_celebrities['Biography']
    .str.replace(r"[^a-zA-Z ]+", "", regex=True)
    .str.lower()
    .str.strip()
)
df_celebrities_1.head()

Unnamed: 0,Name,Image,Biography,Cleaned_Bio
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...,dilip kumar is considered to be one of the gre...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...,raj kapoor was the son of wellknown indian act...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...,after having played the lead actor for more th...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...,son of well known poet harivansh rai bachchan ...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s...",born on december salman khan is the son of t...


In [8]:
# Show the character count and the text of the first cleaned biography.
first_bio_cleaned = df_celebrities_1['Cleaned_Bio'].loc[0]
print(len(first_bio_cleaned), first_bio_cleaned, sep='\n')

8601
dilip kumar is considered to be one of the greatest actors of indian cinema starting his career in  he has starred in some of the biggest commercially successful films in the period   he was the first actor to receive a filmfare best actor award and holds the record for most number of filmfare awards won for that category though he has done films of other genres occasionally like  he balanced out with roles such as the intense andaz  with the swashbuckling aan  the dramatic devdas  with the comical azaad  and the historical romance mughal e azam  with the social ganga jamuna  he predominantly specialized in doing love stories or tragic roles from  from late s roles dried up for kumar as films starring dev anand rajendra kumar and shammi kapoor were more successful from  and also dilips films from  to  were box office flops like dil diya dard liya sunghursh aadmi dastaan gopi sagina and bairaag and after  he left films for a five year break after rajesh khanna became first supersta

# Remove Stop Words From Biography

In [9]:
# English stop-word list from the NLTK corpus, shown together with its size.
stop_words = stopwords.words('english')
print(stop_words, len(stop_words), sep='\n')

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
df_celebrities_2 = df_celebrities_1.copy()

# `stop_words` is a list, so testing each token against it rescans the list every
# time; a set gives the same answers with constant-time lookups.
stop_word_set = set(stop_words)


def _drop_stop_words(text):
    """Return `text` with every whitespace-separated token found in the stop-word set removed."""
    return ' '.join(token for token in text.split() if token not in stop_word_set)


df_celebrities_2['Cleaned_Bio'] = df_celebrities_1['Cleaned_Bio'].apply(_drop_stop_words)
df_celebrities_2.head()

Unnamed: 0,Name,Image,Biography,Cleaned_Bio
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...,dilip kumar considered one greatest actors ind...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...,raj kapoor son wellknown indian actor prithvir...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...,played lead actor five decades motion pictures...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...,son well known poet harivansh rai bachchan tej...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s...",born december salman khan son legendary writer...


In [11]:
# Show the character count and the text of the first biography after stop-word removal.
first_bio_no_stop_words = df_celebrities_2['Cleaned_Bio'].loc[0]
print(len(first_bio_no_stop_words), first_bio_no_stop_words, sep='\n')

6202
dilip kumar considered one greatest actors indian cinema starting career starred biggest commercially successful films period first actor receive filmfare best actor award holds record number filmfare awards category though done films genres occasionally like balanced roles intense andaz swashbuckling aan dramatic devdas comical azaad historical romance mughal e azam social ganga jamuna predominantly specialized love stories tragic roles late roles dried kumar films starring dev anand rajendra kumar shammi kapoor successful also dilips films box office flops like dil diya dard liya sunghursh aadmi dastaan gopi sagina bairaag left films five year break rajesh khanna became first superstar indian cinema author backed lead roles went rajesh khanna dilip returned character role insistence manoj kumar blockbuster film kranti continued career playing central character roles multihero films shakti karma vidhaata mazdoor mashaal duniya dharm adhikari kanoon apna apna izzatdaar saudagar qi

# Stem Words in Biography

In [12]:
# Snowball stemmer configured for English; used to stem the biography tokens.
stemmer = SnowballStemmer(language="english")

In [13]:
def _stem_tokens(text):
    """Return `text` with each whitespace-separated token replaced by its stem."""
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)


df_celebrities_3 = df_celebrities_2.copy()
df_celebrities_3['Cleaned_Bio'] = df_celebrities_2['Cleaned_Bio'].apply(_stem_tokens)
df_celebrities_3.head()

Unnamed: 0,Name,Image,Biography,Cleaned_Bio
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...,dilip kumar consid one greatest actor indian c...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...,raj kapoor son wellknown indian actor prithvir...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...,play lead actor five decad motion pictur dev a...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...,son well known poet harivansh rai bachchan tej...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s...",born decemb salman khan son legendari writer s...


In [14]:
# Show the character count and the text of the first biography after stemming.
first_bio_stemmed = df_celebrities_3['Cleaned_Bio'].loc[0]
print(len(first_bio_stemmed), first_bio_stemmed, sep='\n')

5706
dilip kumar consid one greatest actor indian cinema start career star biggest commerci success film period first actor receiv filmfar best actor award hold record number filmfar award categori though done film genr occasion like balanc role intens andaz swashbuckl aan dramat devda comic azaad histor romanc mughal e azam social ganga jamuna predomin special love stori tragic role late role dri kumar film star dev anand rajendra kumar shammi kapoor success also dilip film box offic flop like dil diya dard liya sunghursh aadmi dastaan gopi sagina bairaag left film five year break rajesh khanna becam first superstar indian cinema author back lead role went rajesh khanna dilip return charact role insist manoj kumar blockbust film kranti continu career play central charact role multihero film shakti karma vidhaata mazdoor mashaal duniya dharm adhikari kanoon apna apna izzatdaar saudagar qila film success box offic kranti vidhaata karma dharm adhikari kanoon apna apna saudagar retir indi

In [15]:
# Quick check of the stemmer on a single word.
sample_word = "played"
stemmer.stem(sample_word)

'play'

# Remove Non-English Words From Biography

In [16]:
# Set of entries from the NLTK 'words' corpus, used for membership tests on tokens.
word_corpus = nltk.corpus.words
english_words = set(word_corpus.words())

In [17]:
def _keep_english_words(text):
    """Return `text` reduced to the whitespace-separated tokens present in `english_words`."""
    kept = []
    for token in text.split():
        if token in english_words:
            kept.append(token)
    return ' '.join(kept)


# NOTE(review): the input here is already stemmed, so stems that are not themselves
# corpus entries are dropped at this step — confirm that this ordering is intended.
df_celebrities_4 = df_celebrities_3.copy()
df_celebrities_4['Cleaned_Bio'] = df_celebrities_3['Cleaned_Bio'].apply(_keep_english_words)
df_celebrities_4.head()

Unnamed: 0,Name,Image,Biography,Cleaned_Bio
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...,one actor cinema start career star biggest suc...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...,raj son actor act film stage product studio ye...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...,play lead actor five decad motion dev cinema t...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...,son well known poet brother name move find wor...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s...",born khan son writer khan pen mani yesteryear ...


In [18]:
# Show the character count and the text of the first biography after the English-word filter.
first_bio_english = df_celebrities_4['Cleaned_Bio'].loc[0]
print(len(first_bio_english), first_bio_english, sep='\n')

2612
one actor cinema start career star biggest success film period first actor best actor award hold record number award though done film occasion like role comic e social ganga special love tragic role late role film star dev success also film box flop like left film five year break first cinema author back lead role went return role insist film career play central role film shakti karma kanoon film success box karma kanoon film born khan bazaar speak father fruit merchant own orchard near khan move start canteen spot lead actress year rani also wife founder help film also gave screen name first film went next film box flop shot film first major hit follow success next box flop fell love hit gave hit like paar bigger star even first film pair hit name crush ever first film hit costar raj flop director flop actor film melodrama film went huge success made star throughout one biggest star along raj dev dev flop star hit raj actor chang hit thus among trio first star second dev raj know

# Remove Non-Adjectives From Biography

In [19]:
import spacy

# Load the small English spaCy pipeline; `nlp` is called on biography text below.
nlp = spacy.load(name='en_core_web_sm')

In [20]:
# Run the spaCy pipeline over the first biography as cleaned in the special-character step.
first_cleaned_bio_text = df_celebrities_1['Cleaned_Bio'].loc[0]
about_doc = nlp(first_cleaned_bio_text)

In [21]:
def _keep_adjectives(text):
    """Return the tokens of `text` that spaCy tags with part of speech 'ADJ', space-joined."""
    adjectives = []
    for token in nlp(text):
        if token.pos_ == 'ADJ':
            adjectives.append(str(token))
    return ' '.join(adjectives)


df_celebrities_5 = df_celebrities_4.copy()
df_celebrities_5['Cleaned_Bio'] = df_celebrities_4['Cleaned_Bio'].apply(_keep_adjectives)
df_celebrities_5.head()

Unnamed: 0,Name,Image,Biography,Cleaned_Bio
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...,biggest first best special tragic late first c...
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...,new next direct popular social close moral new...
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...,new magic present new new intern best best for...
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...,poet fairer fair enough deep accomplish live p...
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s...",main local black possess roguish tri perform g...


In [22]:
# Show the character count and the text of the first biography after the adjective filter.
first_bio_adjectives = df_celebrities_5['Cleaned_Bio'].loc[0]
print(len(first_bio_adjectives), first_bio_adjectives, sep='\n')

293
biggest first best special tragic late first cinema lead own first next first major next bigger first first huge biggest tragic famous tragic comic highest gross hum real lean small big next dual twin biggest first cinema solo biggest elder patriarch last major last last rare due younger last


# Generate CountVectorizer Features From Biography

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Fit a bag-of-words model on the adjective-only biographies; X holds one row per
# biography and one column per vocabulary term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_celebrities_5['Cleaned_Bio'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Reusing the source index guarantees the column-wise concat lines rows up one-to-one.
count_vect_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df_celebrities_5.index)

df_celebrities_6 = pd.concat([df_celebrities_5, count_vect_df], axis=1)

In [25]:
# Preview the first five rows of the frame with the per-term count columns.
df_celebrities_6.head(n=5)

Unnamed: 0,Name,Image,Biography,Cleaned_Bio,accept,accomplish,activist,apart,arjun,assist,...,vibrant,villain,visual,weak,whole,wide,wise,wrong,young,younger
0,Dilip Kumar,Images/dilip_kumar.jpg,Dilip Kumar is considered to be one of the gre...,biggest first best special tragic late first c...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Raj Kapoor,Images/raj_kapoor.jpg,Raj Kapoor was the son of well-known Indian ac...,new next direct popular social close moral new...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Dev Anand,Images/dev_anand.jpg,After having played the Lead Actor for more th...,new magic present new new intern best best for...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Amitabh Bachchan,Images/amitabh_bachchan.jpg,Son of well known poet Harivansh Rai Bachchan ...,poet fairer fair enough deep accomplish live p...,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,Salman Khan,Images/salman_khan.jpg,"Born on December 27, 1965 Salman Khan is the s...",main local black possess roguish tri perform g...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and convert its array to a plain list so the printed form stays a Python list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary_terms = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary_terms = vectorizer.get_feature_names()
print(vocabulary_terms)

['accept', 'accomplish', 'activist', 'apart', 'arjun', 'assist', 'back', 'bad', 'basic', 'best', 'better', 'big', 'bigger', 'biggest', 'black', 'bose', 'boyish', 'calm', 'candid', 'casual', 'central', 'chic', 'chief', 'choreograph', 'chronic', 'cinema', 'civil', 'civilian', 'classic', 'close', 'comedian', 'comic', 'common', 'cordial', 'critic', 'current', 'cute', 'dark', 'dead', 'deep', 'design', 'direct', 'drunk', 'dual', 'due', 'eighth', 'elder', 'eldest', 'enough', 'entertain', 'equal', 'essay', 'establish', 'ethic', 'even', 'express', 'fair', 'fairer', 'famous', 'far', 'fell', 'fellow', 'final', 'fine', 'first', 'flat', 'flawless', 'foreign', 'foremost', 'formal', 'former', 'fourth', 'frank', 'free', 'frequent', 'full', 'fundamentalist', 'further', 'global', 'golden', 'good', 'gradual', 'grand', 'great', 'gross', 'grosser', 'guest', 'hangman', 'hard', 'harish', 'heroic', 'high', 'higher', 'highest', 'honest', 'hot', 'houseful', 'huge', 'hum', 'ill', 'impair', 'intact', 'intern', 'k

# Save Data In CSV Format

In [27]:
# Write the frame that carries the cleaned biography and the per-term count columns
# (df_celebrities_6). df_celebrities only holds the rows reloaded from
# celebrities_data.csv, so saving it here would just duplicate that file.
df_celebrities_6.to_csv('celebrities_personality_data.csv', index=False)

# Save Data In SQLite3 Database

In [28]:
import sqlite3

# Store the frame with the per-term count columns (df_celebrities_6) rather than the
# raw df_celebrities, whose contents already live in the 'Celebrities_Bio' table.
# The connection is closed even if the write fails.
conn = sqlite3.connect('DBCelebrities.sqlite3')
try:
    df_celebrities_6.to_sql('Celebrities_Personality_Data', conn, if_exists='replace', index=False)
finally:
    conn.close()

In [29]:
# nouns = []
# adjectives = []
# for token in about_doc:
#     if token.pos_ == 'ADJ':
#         adjectives.append(token)
    
#     if token.pos_ == 'NOUN':
#         nouns.append(token)

In [30]:
# print(len(adjectives))
# adjectives