
# Preprocesamiento

In [36]:
import numpy as np
import pandas as pd


In [37]:
# Read the movie dataset from the CSV file (decoded as ISO-8859-1) and
# preview its first rows.
data = pd.read_csv("MovieGenre.csv", encoding ="ISO-8859-1")
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [38]:
# Report the (rows, columns) shape, then the number of missing values per column.
print(data.shape)
data.isnull().sum()

(40108, 6)


imdbId          0
Imdb Link       0
Title           0
IMDB Score     48
Genre         145
Poster        725
dtype: int64

In [39]:
# Remove the movies that have no genre specified.
# NOTE: `data` is rebound here, so the originally loaded frame is no longer
# reachable under this name after the cell runs.
data = data.dropna(subset=["Genre"])
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [40]:
# Count the remaining missing values per column.
data.isnull().sum()

imdbId          0
Imdb Link       0
Title           0
IMDB Score     46
Genre           0
Poster        700
dtype: int64

In [41]:
def divide_list(n_chunks, list_to_divide):
    """Split a sliceable sequence into at most ``n_chunks`` consecutive pieces.

    The chunk size is the ceiling of ``len(list_to_divide) / n_chunks``, so
    every chunk except possibly the last has the same length and no chunk is
    larger than necessary. Fewer than ``n_chunks`` chunks are returned when
    the input is too short to fill them all.

    Parameters
    ----------
    n_chunks : int
        Desired maximum number of chunks; must be at least 1.
    list_to_divide : sequence
        Any object supporting ``len()`` and ``obj[start:stop]`` slicing.

    Returns
    -------
    list
        The slices, in order. Empty when ``list_to_divide`` is empty.

    Raises
    ------
    ValueError
        If ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError("n_chunks must be a positive integer")

    total = len(list_to_divide)
    if total == 0:
        return []

    # Ceiling division without floats; a plain ``total // n_chunks + 1`` is
    # one too large whenever ``total`` is an exact multiple of ``n_chunks``.
    chunk_size = -(-total // n_chunks)

    return [
        list_to_divide[start:start + chunk_size]
        for start in range(0, total, chunk_size)
    ]

# Split the dataset into consecutive chunks and report the size of each one.
n_chunks = 10
chunks = divide_list(n_chunks, data)
for chunk in chunks:
    print(len(chunk))

# Keep only the first chunk as the working dataset from here on.
# NOTE: `data` is rebound, so re-running this cell splits the already
# reduced frame again.
data = chunks[0]
data.head()

3997
3997
3997
3997
3997
3997
3997
3997
3997
3990


Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [42]:
# Preview the first rows of the working dataset again.
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [43]:
# The missing poster and score values of the movies will be obtained by scraping.
# In addition, the directors, writers, cast, producers and composers will be collected.
# Finally, the plot summary will be collected as text data.

import requests
from bs4 import BeautifulSoup
import csv
import time

# Movie page URLs to visit, one per row of the working dataset.
links = data['Imdb Link']

# Container for the scraped fields: the two existing columns are carried over
# as-is, and every scraped field starts out as its own empty list.
new_data = {
    'Poster': data['Poster'],
    'IMDB Score': data['IMDB Score'],
    **{
        field: []
        for field in ('Directors', 'Writers', 'Cast', 'Producers', 'Composers', 'Summary')
    },
}

def get_ids(content):
    """Extract the identifier segment from every link inside ``content``.

    For each ``<a>`` element the third ``/``-separated piece of its ``href``
    is returned (for an href shaped like ``/name/nm0005271/...`` that is
    ``nm0005271``). Anchors with no ``href`` attribute, or whose ``href`` has
    fewer than three segments, are skipped instead of raising.

    Parameters
    ----------
    content : object exposing ``find_all("a")``
        Typically a BeautifulSoup tag such as a credits ``<table>``.

    Returns
    -------
    list of str
        Identifiers in document order. Duplicates are kept.
    """
    ids = []
    for anchor in content.find_all("a"):
        href = anchor.get("href")
        if not href:
            # Anchor without a target (e.g. a named anchor): nothing to extract.
            continue
        parts = href.split('/')
        if len(parts) > 2:
            ids.append(parts[2])
    return ids

def get_credits_info(new_data, link):
    """Scrape the ``/fullcredits`` page of one movie and record its people IDs.

    The page's ``<h4>`` headings are paired positionally with its ``<table>``
    elements. For each recognised heading, up to five IDs from the matching
    table are joined with ``|`` and stored.

    Exactly one entry is appended to each of ``new_data['Directors']``,
    ``['Writers']``, ``['Cast']``, ``['Producers']`` and ``['Composers']`` per
    call. A role whose section is missing from the page (or a page with no
    credits container at all) gets an empty string, so the lists stay aligned
    with the sequence of movies processed.

    Parameters
    ----------
    new_data : dict
        Accumulator holding a list under each of the five role keys above.
    link : str
        Base URL of the movie page; ``"/fullcredits"`` is appended to it.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or exceeds the timeout.
    """
    # Section heading on the page -> key of ``new_data`` that receives it.
    section_keys = {
        "Directed by": 'Directors',
        "Writing Credits": 'Writers',
        "Cast": 'Cast',
        "Produced by": 'Producers',
        "Music by": 'Composers',
    }
    # Start every role as empty so each list grows by exactly one item.
    found = {key: "" for key in section_keys.values()}

    # A timeout keeps a stalled connection from hanging the whole scrape.
    page = requests.get(link + "/fullcredits", timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    full_credits = soup.find("div", {"id": "fullcredits_content"})

    if full_credits is not None:
        # Only the leading text node of each heading is compared; a heading
        # with no children yields "" and therefore matches nothing.
        block_titles = [
            str(title.contents[0]).strip() if title.contents else ""
            for title in full_credits.find_all("h4")
        ]
        block_content = full_credits.find_all("table")

        seen = set()
        for title, content in zip(block_titles, block_content):
            key = section_keys.get(title)
            if key is None or key in seen:
                # Unrecognised section, or a repeated heading: the first
                # occurrence wins so the role still gets a single entry.
                continue
            seen.add(key)
            found[key] = "|".join(get_ids(content)[:5])

    for key, elements in found.items():
        new_data[key].append(elements)
            
def get_text_info(new_data, link):
    """Scrape the ``/plotsummary`` page of one movie and record its summary text.

    The text of the first two list items that contain a ``<p>`` element,
    inside the ``<ul id="plot-summaries-content">`` block, is joined with a
    single space and appended to ``new_data['Summary']``.

    Exactly one entry is appended per call. When the block is missing or
    holds no paragraph, that entry is an empty string, so the list stays
    aligned with the sequence of movies processed.

    Parameters
    ----------
    new_data : dict
        Accumulator holding a list under the ``'Summary'`` key.
    link : str
        Base URL of the movie page; ``"/plotsummary"`` is appended to it.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or exceeds the timeout.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape.
    page = requests.get(link + "/plotsummary", timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    block = soup.find("ul", {"id":"plot-summaries-content"})

    texts = []
    if block is not None:
        for item in block.find_all("li"):
            paragraph = item.find("p")
            if paragraph is None:
                # List item without summary text: skip it rather than fail.
                continue
            texts.append(paragraph.get_text())
            if len(texts) == 2:
                break

    new_data['Summary'].append(" ".join(texts))
    
        
# Trial run: only the first movie link is scraped.
for link in links.head(1):
    # Two-second pause ahead of the requests (presumably to throttle traffic
    # to the site -- confirm before scaling this up to all links).
    time.sleep(2)
    get_credits_info(new_data, link)
    get_text_info(new_data, link)

print(new_data['Composers'])
    
    
    
    #page = requests.get(link)
    #soup = BeautifulSoup(page.content, "html.parser")

    
    
    #get_text_info(new_data)    

    
    #storyline_block = soup.find(id="titleStoryLine")    
    #if storyline_block is not None:
    #    see_more_block = storyline_block.find("span",{'class':"see-more inline"})
    #    sm_link_blocks = see_more_block.find_all("a")
    #    for sm_link_block in sm_link_blocks:
    #        if sm_link_block.get_text() == "Plot Synopsis":
    #            synopsis_link = sm_link_block['href']
                
    #            page = requests.get(base_link + synopsis_link)
    #            soup = BeautifulSoup(page.content, 'html.parser')
    #            synopsis = soup.find(id="plot-synopsis-content").get_text()
    #            break
    
    
    #new_data['Directors'].append(get_directors(soup))
    #new_data['Writers'].append(get_writers(soup))
    #new_data['Cast'].append(get_cast(soup))
    


    


['nm0005271']
