
# Preprocesamiento

In [48]:
import numpy as np
import pandas as pd


In [49]:
# The data is read from the CSV file (stored with ISO-8859-1 encoding).
data = pd.read_csv(
    "MovieGenre.csv",
    encoding="ISO-8859-1",
)
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [50]:
# Show the table dimensions, then the number of missing values per column.
print(data.shape)
data.isna().sum()

(40108, 6)


imdbId          0
Imdb Link       0
Title           0
IMDB Score     48
Genre         145
Poster        725
dtype: int64

In [51]:
# Movies missing a genre, a poster or an IMDB score are removed.
# Because the number of removed rows is far smaller than the total number of rows,
# the overall result is considered not to be significantly affected.
data = data.dropna(subset=["Genre", "Poster", "IMDB Score"])
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [52]:
# Count the missing values per column again after the incomplete rows were dropped.
data.isna().sum()

imdbId        0
Imdb Link     0
Title         0
IMDB Score    0
Genre         0
Poster        0
dtype: int64

In [53]:
# Keep only the first 20 movies (presumably to keep the scraping below short).
# .copy() makes the subset an independent frame: new columns are assigned to it
# later, and assigning onto a plain slice triggers pandas' SettingWithCopyWarning.
data = data.iloc[:20].copy()
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [54]:
# The director, writer, cast, producer and composer ids are obtained by scraping each movie's IMDB page.
# Finally, the plot summary is collected as text data.

import requests
from bs4 import BeautifulSoup
import csv
import time

# URLs of the movie pages to visit, one per row of the dataset.
links = data['Imdb Link']

# Titles of the credit sections looked up on each movie's full-credits page.
keys = ["Directed by", "Writing Credits", "Cast", "Produced by", "Music by"]

# Columns to attach to the dataset: two existing columns carried over unchanged,
# plus one initially empty list per scraped field (credit sections and summary).
new_data = {'Poster': data['Poster'], 'IMDB Score': data['IMDB Score']}
new_data.update({key: [] for key in keys})
new_data['Summary'] = []

def get_ids(content):
    """Collect one id per link found inside ``content``.

    Every item returned by ``content.find_all("a")`` is read through its
    ``'href'`` entry; the href is split on "/" and the piece at index 2 is kept
    (for an href such as ``/name/nm0000158/`` that piece is ``nm0000158``).

    Args:
        content: object exposing ``find_all("a")``.

    Returns:
        List of the extracted pieces, in the order the links were found.
    """
    ids = []
    for anchor in content.find_all("a"):
        href_parts = anchor['href'].split('/')
        ids.append(href_parts[2])
    return ids

def get_credits_info(new_data, link, max_ids=5):
    """Scrape the full-credits page of one movie and record the ids per section.

    For every section title in the module-level ``keys`` list, exactly one
    string is appended to ``new_data[title]``: the first ``max_ids`` ids found
    in that section joined with "|", or "" when the section is missing from
    the page or the page could not be fetched or parsed. Appending exactly
    once per key keeps all lists the same length, one entry per movie.

    If a section title occurs more than once on the page, the first occurrence
    is used.

    Args:
        new_data: dict mapping every entry of ``keys`` to a list; mutated in place.
        link: base URL of the movie page; "/fullcredits" is appended to it.
        max_ids: maximum number of ids kept per section (default 5).
    """
    url = link + "/fullcredits"
    # Gather into a local dict first: new_data is only touched after the page
    # was processed, so a failure halfway through cannot leave partial entries.
    found = {}
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        full_credits = soup.find("div", {"id": "fullcredits_content"})
        block_titles = [title.contents[0].strip() for title in full_credits.find_all("h4")]
        block_content = full_credits.find_all("table")

        # Headings and tables are paired by position.
        for title, content in zip(block_titles, block_content):
            if title not in keys or title in found:
                continue
            if title == "Cast":
                # Only the cells marked as actors are used in the cast table.
                ids = [block.find('a')['href'].split('/')[2]
                       for block in content.find_all("td", {"itemprop": "actor"})]
            else:
                ids = get_ids(content)
            found[title] = "|".join(ids[:max_ids])
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the run.
        print("Error en el link: ", url, exc)
        found = {}

    for key in keys:
        new_data[key].append(found.get(key, ""))
            
def get_text_info(new_data, link):
    """Scrape the plot-summary page of one movie and record its summary text.

    Exactly one string is appended to ``new_data['Summary']``: the text of the
    first ``<p>`` of the first two ``<li>`` items inside the
    ``ul#plot-summaries-content`` element, joined with a space, or "" when the
    page could not be fetched or parsed.

    Args:
        new_data: dict holding a list under ``'Summary'``; mutated in place.
        link: base URL of the movie page; "/plotsummary" is appended to it.
    """
    url = link + "/plotsummary"
    summaries = ""
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        block = soup.find("ul", {"id": "plot-summaries-content"})
        # Slice before reading the text so only the two items actually used
        # have to be well-formed.
        summaries = " ".join(item.find("p").get_text() for item in block.find_all("li")[:2])
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the run.
        print("Error en el link: ", url, exc)
    new_data['Summary'].append(summaries)
    
        
# Scrape the credits and the plot summary of every movie, one movie at a time.
for link in links:
    time.sleep(1)  # one-second pause before each movie's requests
    for scrape in (get_credits_info, get_text_info):
        scrape(new_data, link)


In [62]:
# Attach every collected column to the dataset under the same name.
for column, values in new_data.items():
    data[column] = values

In [63]:
def _split_ids(column, prefix, count):
    """Spread a "|"-separated id column over ``count`` separate columns.

    Args:
        column: Series of strings whose ids are separated by "|".
        prefix: name prefix of the resulting columns.
        count: number of columns to produce.

    Returns:
        DataFrame with the same index as ``column`` and exactly ``count``
        string columns named ``prefix1`` .. ``prefix<count>``. Slots a row does
        not fill hold the string "nan" (the result of ``astype(str)`` on a
        missing value).
    """
    parts = column.str.split('|').apply(pd.Series)
    # reindex keeps the first `count` positions and also creates the ones that
    # no row filled, so the set of output columns never depends on the data.
    parts = parts.reindex(columns=range(count)).astype(str)
    parts.columns = [prefix + str(position) for position in range(1, count + 1)]
    return parts


directors = _split_ids(data['Directed by'], 'director', 2)
writers = _split_ids(data['Writing Credits'], 'writer', 2)
cast = _split_ids(data['Cast'], 'cast', 5)
producers = _split_ids(data['Produced by'], 'producer', 2)
composers = _split_ids(data['Music by'], 'composer', 2)

# NOTE(review): 'IMDB Score' and 'Imdb Link' are not carried into the final
# table - confirm this is intended.
data = pd.concat([data[['imdbId','Title','Genre','Poster', 'Summary']], directors, writers, cast, producers, composers], axis=1)
data.head()

In [73]:
import os
import urllib
from urllib.error import URLError
from urllib.request import urlretrieve

# Filename prefix of the downloaded posters: each file is written as
# <path><position>.jpg, where position is the row's position in the dataset.
# NOTE(review): absolute, machine-specific location - consider making it configurable.
path = "/home/howl/Img/poster"

# Make sure the target directory exists before the first download.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Positions of the posters that could not be downloaded.
failed_posters = []
for idx, link in enumerate(data['Poster']):
    try:
        urlretrieve(link, path + str(idx) + ".jpg")
    except (URLError, ValueError) as exc:
        # A dead or malformed poster URL is reported and skipped instead of
        # aborting the remaining downloads.
        print("Error en el link: ", link, exc)
        failed_posters.append(idx)
