
# Analisis de Datos


In [40]:
import numpy as np
import pandas as pd
import re

In [41]:
# Se leen los datos del archivo csv
data = pd.read_csv("MovieGenre.csv", encoding ="ISO-8859-1")
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [42]:
# Se separa la informacion de Titulo y año
titles = []
years = []

for title in data['Title']:
    try:        
        years.append(re.findall(r'\((.*?)\)', title)[0])    
        titles.append(title[:-6].strip())
    except:        
        years.append(None)
        titles.append(title)        
        
data['Title'] = titles
data['Year'] = years


In [43]:
print(data.shape)
data.isnull().sum()

(40108, 7)


imdbId          0
Imdb Link       0
Title           0
IMDB Score     48
Genre         145
Poster        725
Year          505
dtype: int64

In [44]:
# Se remueven las peliculas que no cuentan con alguno de los datos especificados
# Como la cantidad a eliminar es mucho menor a la cantidad total de los datos,
# se considera que el resultado total no se ve afectado en gran medida.
data = data.dropna(subset=["Genre"])
data = data.dropna(subset=["Poster"])
data = data.dropna(subset=["IMDB Score"])
data = data.dropna(subset=["Year"])
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster,Year
0,114709,http://www.imdb.com/title/tt114709,Toy Story,8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...,1995
1,113497,http://www.imdb.com/title/tt113497,Jumanji,6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...,1995
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men,6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...,1995
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale,5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...,1995
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II,5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...,1995


In [45]:
data.isnull().sum()

imdbId        0
Imdb Link     0
Title         0
IMDB Score    0
Genre         0
Poster        0
Year          0
dtype: int64

In [47]:
data = data[:20]


# Scraping de los datos

In [15]:
# Se obtendrá el director, escritor y cast por medio de un Scraping de la pag. de IMDB
# Por último, se obtendrá el resumen como data de texto.

import requests
from bs4 import BeautifulSoup
import csv
import time

links = data['Imdb Link']
new_data = {}
new_data['Poster'] = data['Poster']
new_data['IMDB Score'] = data['IMDB Score']
new_data['Directed by'] = []
new_data['Writing Credits'] = []
new_data['Cast'] = []
new_data['Produced by'] = []
new_data['Music by'] = []
new_data['Summary'] = []

keys = ["Directed by", "Writing Credits", "Cast", "Produced by", "Music by"]            

def get_ids(content):
    return [block['href'].split('/')[2] for block in content.find_all("a")]

def get_credits_info(new_data, link):
    try:
        page = requests.get(link + "/fullcredits")
        soup = BeautifulSoup(page.content, "html.parser")    
        full_credits = soup.find("div", {"id": "fullcredits_content"})            
        block_titles = [title.contents[0].strip() for title in full_credits.find_all("h4")]
        block_content = full_credits.find_all("table")
        
        for key in keys:
            found_key = False
            for title, content in zip(block_titles, block_content):                
                if title == key:            
                    found_key = True
                    elements = "|".join(get_ids(content)[:5])        
                    if title == "Directed by" or \
                        title == "Writing Credits" or \
                        title == "Produced by" or \
                        title == "Music by":                            
                        new_data[title].append(elements)

                    if title == "Cast":
                        elements = "|".join([block.find('a')['href'].split('/')[2]
                                    for block in content.find_all("td", {"itemprop":"actor"})][:5])
                        new_data['Cast'].append(elements)
            
            if not found_key:
                new_data[key].append("")
    except:
        print("Error en el link: ", link + "/fullcredits")
        for key in keys:
            new_data[key].append("")
            
def get_text_info(new_data, link):
    try:
        page = requests.get(link + "/plotsummary")
        soup = BeautifulSoup(page.content, "html.parser")
        block = soup.find("ul", {"id":"plot-summaries-content"})    
        summaries = " ".join([summary.find("p").get_text() for summary in block.find_all("li")][:2])
        new_data['Summary'].append(summaries)
    except:
        print("Error en el link: ", link + "/plotsummary")
        new_data['Summary'].append("")
    
        
for link in links:
    time.sleep(1)
    get_credits_info(new_data, link)
    get_text_info(new_data, link)          


In [16]:
for key in new_data:
    data[key] = new_data[key]

In [17]:
directors = data['Directed by'].str.split('|').apply(pd.Series).astype(str)
directors.rename(columns={0:'director1', 1:'director2'}, inplace=True)
directors = directors.iloc[:, 0: 2]

writers = data['Writing Credits'].str.split('|').apply(pd.Series).astype(str)
writers.rename(columns={0:'writer1', 1:'writer2'}, inplace=True)
writers = writers.iloc[:, 0: 2]

cast = data['Cast'].str.split('|').apply(pd.Series).astype(str)
cast.rename(columns={0:'cast1', 1:'cast2', 2:'cast3', 3:'cast4', 4:'cast5'}, inplace=True)
cast = cast.iloc[:,0:5]

producers = data['Produced by'].str.split('|').apply(pd.Series).astype(str)
producers.rename(columns={0:'producer1', 1:'producer2'}, inplace=True)
producers = producers.iloc[:,0:2]

composers = data['Music by'].str.split('|').apply(pd.Series).astype(str)
composers.rename(columns={0:'composer1', 1:'composer2'}, inplace=True)
composers = composers.iloc[:,0:2]

data = pd.concat([data[['imdbId','Title','Genre','Poster', 'Summary']], directors, writers, cast, producers, composers], axis=1)
data.head()

Unnamed: 0,imdbId,Title,Genre,Poster,Summary,director1,director2,writer1,writer2,cast1,cast2,cast3,cast4,cast5,producer1,producer2,composer1
0,114709,Toy Story (1995),Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...,A little boy named Andy loves to be in his roo...,nm0005124,,nm0005124,nm0230032,nm0000158,nm0000741,nm0725543,nm0001815,nm0001728,nm0036366,nm0146216,nm0005271
1,113497,Jumanji (1995),Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...,After being trapped in a jungle board game for...,nm0002653,,nm0378144,nm0852430,nm0000245,nm0404993,nm0000379,nm0682300,nm0001372,nm0181202,nm0276059,nm0000035
2,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...,Things don't seem to change much in Wabasha Co...,nm0222043,,nm0425756,nm0425756,nm0000527,nm0000493,nm0000047,nm0000268,nm0580565,nm0075828,nm0204862,nm0006293
3,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...,This story based on the best selling novel by ...,nm0001845,,nm0573334,nm0573334,nm0001365,nm0000291,nm0222643,nm0005375,nm0002138,nm0060103,nm0433845,nm0004892
4,113041,Father of the Bride Part II (1995),Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...,"In this sequel to ""Father of the Bride"", Georg...",nm0796124,,nm0352443,nm0329304,nm0000188,nm0000473,nm0001737,nm0931090,nm0627624,nm0062071,nm0088692,nm0006293


# Procesamiento de Imagenes

In [73]:
import urllib
from urllib.request import urlretrieve

filenames = []
path = "/home/alulab/data/poster/poster"
for idx, link in enumerate(data['Poster']):
    name = path + str(idx) + ".jpg"    
    filenames.append(name)
    urlretrieve(link, name)
    
poster_df = data[['imdbId']].copy()
poster_df['Path'] = filenames   

In [57]:
from scipy import misc
n_group=10 # 10 posters por archivo
total_groups=data['Poster'].shape[0]//n_group ## total group number
directory="/home/alulab/data/poster/"

n_color=3
width=182
height=268
for i in range(total_groups):
    matrix=np.zeros((n_group,width*height*n_color+1),dtype=int)        
    for j in range(n_group):
        imdbId = poster_df.iloc[i*n_group + j, 0]
        filename = directory + "poster" + str(i*n_group + j) + ".jpg"
        try:
            arr = misc.imread(filename)
        except:
            continue
            
        matrix[j,0]=imdbId 
        trans_height=min(height,arr.shape[0])
        if len(arr.shape)==3:
            for c in range(n_color):
                for h in range(trans_height):
                    ind_start=c*height*width+1+h*width
                    ind_end=ind_start+width
                    matrix[j,ind_start:ind_end]=arr[h,:,c]
        else:
            for c in range(n_color):
                for h in range(trans_height):
                    ind_start=c*height*width+1+h*width
                    ind_end=ind_start+width
                    matrix[j,ind_start:ind_end]=arr[h,:]
                    
    target_name="/home/alulab/data/clean_data/poster_matrices/poster_matrix_"+str(i)+".csv"            
    with open(target_name, 'wb') as f:
        np.savetxt(f,matrix,fmt='%d',delimiter=",")
    print(i)

0
1


In [61]:
matrix = np.loadtxt('/home/alulab/data/clean_data/poster_matrices/poster_matrix_0.csv', dtype=int, delimiter=',')
matrix_df = pd.DataFrame(matrix)
matrix_df.rename(columns={0:'imdbId'}, inplace=True)
matrix_df

Unnamed: 0,imdbId,1,2,3,4,5,6,7,8,9,...,146319,146320,146321,146322,146323,146324,146325,146326,146327,146328
0,114709,2,4,5,6,5,4,2,2,2,...,43,42,30,23,18,17,17,15,16,15
1,113497,12,10,0,9,9,14,4,12,10,...,9,9,9,9,9,9,9,9,9,8
2,113228,218,213,212,214,219,221,219,216,212,...,37,37,42,40,39,38,39,40,37,33
3,114885,254,251,248,248,251,251,249,245,250,...,193,195,198,200,196,196,197,197,198,197
4,113041,245,246,248,247,246,244,243,242,245,...,0,0,0,0,0,0,0,0,0,0
5,113277,2,1,1,0,1,2,4,5,5,...,12,12,12,11,10,9,8,7,6,6
6,114319,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
7,112302,181,182,182,182,181,181,182,183,183,...,77,93,100,95,111,108,95,97,108,65
8,114576,10,11,12,12,9,7,7,7,7,...,4,4,3,3,4,1,1,2,2,2
9,113189,8,8,8,10,10,10,12,12,13,...,16,15,15,17,16,16,15,15,15,15


# Text Analysis

In [72]:
import re
data = pd.read_csv('Data.csv')

years = []
for title in data['Title']:
    year = re.findall(r'\((.*?)\)', title)
    try:
        years.append(year[0])
    except:
        years.append(None)
    
data['Year'] = years


In [79]:
text_data = data[["imdbId"]].copy()



text_data['text'] = data['Title'].astype(str) + ' ' + data['Summary'].astype(str)

text_data.head()


Unnamed: 0,imdbId,text
0,114709,Toy Story (1995) A little boy named Andy loves...
1,113497,Jumanji (1995) After being trapped in a jungle...
2,113228,Grumpier Old Men (1995) Things don't seem to c...
3,114885,Waiting to Exhale (1995) This story based on t...
4,113041,Father of the Bride Part II (1995) In this seq...


In [104]:
# create bag-of-words matrix
from sklearn.feature_extraction.text import TfidfVectorizer
# ignore stop words and only consider words that make up at least 1% of the corpus
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.04, max_df=0.5)
corpus = text_data['text'].values
X = vectorizer.fit_transform(corpus)
vocab = vectorizer.get_feature_names()
len(vocab)

218

In [105]:
print(vocab)

['1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', 'able', 'affair', 'american', 'arrives', 'away', 'bad', 'based', 'beautiful', 'begin', 'begins', 'believe', 'believes', 'best', 'big', 'black', 'body', 'boss', 'boy', 'boyfriend', 'bring', 'brother', 'business', 'called', 'car', 'career', 'case', 'chance', 'child', 'children', 'city', 'come', 'comes', 'company', 'country', 'couple', 'crime', 'daughter', 'day', 'days', 'dead', 'deal', 'death', 'decide', 'decides', 'despite', 'different', 'discover', 'discovers', 'does', 'doesn', 'don', 'dr', 'end', 'ends', 'escape', 'especially', 'eventually', 'evil', 'ex', 'face', 'fall', 'falls', 'family', 'father', 'fight', 'film', 'finally', 'finds', 'follows', 'friend', 'friends', 'future', 'gets', 'getting', 'girl', 'girlfriend', 'goes', 'going', 'good', 'great', 'group', 'hard', 'having', 'head', 'help', 'high', 'home', 'house', 'husband', 'including', 'involved', 'job', 'john', 'just', 'kill', 'killed', 'killer', 'know', 'known', '

In [106]:
bagofwords = pd.DataFrame(X.toarray(), columns=vocab)
bagofwords = bagofwords.add_prefix('word_')
bagofwords = pd.concat([text_data[['imdbId']], bagofwords], axis=1)
bagofwords

Unnamed: 0,imdbId,word_1993,word_1994,word_1995,word_1996,word_1997,word_1998,word_1999,word_2000,word_able,...,word_woman,word_women,word_work,word_working,word_works,word_world,word_year,word_years,word_york,word_young
0,114709,0.0,0.0,0.128642,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.091360
1,113497,0.0,0.0,0.217341,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.359427,0.000000,0.000000
2,113228,0.0,0.0,0.157154,0.0,0.0,0.0,0.0,0.0,0.181002,...,0.000000,0.173770,0.000000,0.00000,0.00000,0.000000,0.000000,0.129947,0.000000,0.000000
3,114885,0.0,0.0,0.135542,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.599490,0.000000,0.00000,0.00000,0.113418,0.000000,0.000000,0.000000,0.000000
4,113041,0.0,0.0,0.173320,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
5,113277,0.0,0.0,0.153535,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.142675,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
6,114319,0.0,0.0,0.103821,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.176922,0.000000,0.000000,0.00000,0.00000,0.173751,0.000000,0.171695,0.000000,0.000000
7,112302,0.0,0.0,0.160889,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.228524
8,114576,0.0,0.0,0.304253,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
9,113189,0.0,0.0,0.269129,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.250093,0.00000,0.00000,0.225202,0.000000,0.222536,0.000000,0.000000


In [107]:
bagofwords.to_csv('bagofwords_vars.csv')

In [108]:
data.head()


Unnamed: 0.1,Unnamed: 0,imdbId,Title,Genre,Poster,Summary,director1,director2,writer1,writer2,cast1,cast2,cast3,cast4,cast5,producer1,producer2,composer1,composer2,Year
0,0,114709,Toy Story (1995),Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...,A little boy named Andy loves to be in his roo...,nm0005124,,nm0005124,nm0230032,nm0000158,nm0000741,nm0725543,nm0001815,nm0001728,nm0036366,nm0146216,nm0005271,,1995
1,1,113497,Jumanji (1995),Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...,After being trapped in a jungle board game for...,nm0002653,,nm0378144,nm0852430,nm0000245,nm0404993,nm0000379,nm0682300,nm0001372,nm0181202,nm0276059,nm0000035,,1995
2,2,113228,Grumpier Old Men (1995),Comedy|Romance,https://images-na.ssl-images-amazon.com/images...,Things don't seem to change much in Wabasha Co...,nm0222043,,nm0425756,nm0425756,nm0000527,nm0000493,nm0000047,nm0000268,nm0580565,nm0075828,nm0204862,nm0006293,,1995
3,3,114885,Waiting to Exhale (1995),Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...,This story based on the best selling novel by ...,nm0001845,,nm0573334,nm0573334,nm0001365,nm0000291,nm0222643,nm0005375,nm0002138,nm0060103,nm0433845,nm0004892,,1995
4,4,113041,Father of the Bride Part II (1995),Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...,"In this sequel to ""Father of the Bride"", Georg...",nm0796124,,nm0352443,nm0329304,nm0000188,nm0000473,nm0001737,nm0931090,nm0627624,nm0062071,nm0088692,nm0006293,,1995


In [None]:

#all_data = pd.merge(data, )
#merged1 = pd.merge(tmdb, imdb_info, how='left', on='tmdb_id')
#merged1.head()


