Download and extract data

In [24]:
import urllib.request
import tarfile
import os

# Fetch the CMU Movie Summary corpus once; later runs reuse the extracted
# 'MovieSummaries' directory in the working directory.
if not os.path.isdir('MovieSummaries'):
    print("Downloading and extracting data...")
    data_tar_link = "http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz"

    # Context managers make sure the HTTP response and the tar stream are
    # closed even when the download or the extraction fails part-way.
    with urllib.request.urlopen(data_tar_link) as ftpstream:
        # "r|gz" reads the archive as a forward-only gzip stream, so it is
        # unpacked while it downloads, without a temporary file on disk.
        with tarfile.open(fileobj=ftpstream, mode="r|gz") as data_tar_file:
            # The archive arrives over plain HTTP, so treat it as untrusted:
            # where this Python supports extraction filters, use the "data"
            # filter to reject members that would escape the target directory.
            if hasattr(tarfile, "data_filter"):
                data_tar_file.extractall(filter="data")
            else:
                data_tar_file.extractall()

    print("Dataset downloaded and extracted.")
else:
    print("Dataset already downloaded")

# Report the total size (in MB) of the regular files directly inside the
# data directory; sub-directories are not descended into.
print("Data size: ", sum(os.path.getsize(os.path.join('MovieSummaries', f)) / (1024.0*1024.0)
          for f in os.listdir('MovieSummaries')
          if os.path.isfile(os.path.join('MovieSummaries', f))), "MB")
# This should take less than two minutes (depends mostly on connection speed). If it takes longer,
# check that the MovieSummaries directory was created in the working directory and contains files.
# NOTE: an interrupted extraction leaves a partial MovieSummaries directory that the check above
# treats as "already downloaded" -- delete the directory and re-run the cell in that case.

Downloading and extracting data...
Dataset downloaded and extracted.
Data size:  127.5914134979248 MB


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
# The TSV ships without a header row, so the column names are supplied here,
# in the order the fields appear in the file.
header_metadata = [
    'wikiID',
    'freebaseID',
    'name',
    'release_date',
    'box_office',
    'runtime',
    'language',
    'country',
    'genre',
]

# header=None is what pandas already infers when `names` is given; it is
# spelled out so the first line of the file is clearly read as data.
movie_metadata = pd.read_csv(
    './MovieSummaries/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=header_metadata,
)

In [27]:
# Preview the first five rows (the DataFrame.head default) to check that the
# supplied column names line up with the loaded fields.
movie_metadata.head()

Unnamed: 0,wikiID,freebaseID,name,release_date,box_office,runtime,language,country,genre
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


Maknimo nepotrebne stupce

In [28]:
# Discard the metadata columns this notebook does not keep; wikiID, name,
# language and genre remain.
# errors='ignore' keeps the cell idempotent: because the result is assigned
# back to `movie_metadata`, a second run would otherwise raise KeyError for
# columns that were already removed.
movie_metadata = movie_metadata.drop(
    columns=['freebaseID', 'release_date', 'box_office', 'runtime', 'country'],
    errors='ignore',
)

In [29]:
# Preview the first five rows to confirm which columns are left after the drop.
movie_metadata.head()

Unnamed: 0,wikiID,name,language,genre
0,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,"{""/m/02h40lc"": ""English Language""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,Brun bitter,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,White Of The Eye,"{""/m/02h40lc"": ""English Language""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,A Woman in Flames,"{""/m/04306rv"": ""German Language""}","{""/m/07s9rl0"": ""Drama""}"


Pretvorimo stupac `genre` tako da mu elementi budu skupovi naziva žanrova, a ne JSON zapis s ID-evima.

In [30]:
import json

# Try the conversion on the movie with index label 0 first: decode its genre
# JSON object and keep only the values (the genre names), dropping the ID keys.
first_genre_json = movie_metadata.loc[0, 'genre']
first_genre_map = json.loads(first_genre_json)
set(first_genre_map.values())

{'Action',
 'Adventure',
 'Horror',
 'Science Fiction',
 'Space western',
 'Supernatural',
 'Thriller'}

In [31]:
def parse_genres(raw):
    """Turn one `genre` cell into a set of genre names.

    Parameters
    ----------
    raw : str or set
        Either the original JSON object text mapping genre IDs to genre
        names, or a set that an earlier run of this cell already produced.

    Returns
    -------
    set
        The genre names (the values of the JSON object).
    """
    # The column is overwritten in place below, so on a re-run the cells are
    # already sets; pass them through instead of feeding a set to json.loads
    # (which would raise TypeError).
    if isinstance(raw, (set, frozenset)):
        return set(raw)
    return set(json.loads(raw).values())


movie_metadata['genre'] = movie_metadata['genre'].apply(parse_genres)

In [32]:
# Preview the first five rows to confirm the genre column now holds sets of names.
movie_metadata.head()

Unnamed: 0,wikiID,name,language,genre
0,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}","{Thriller, Supernatural, Science Fiction, Horr..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,"{""/m/02h40lc"": ""English Language""}","{Drama, Crime Drama, Mystery, Biographical film}"
2,28463795,Brun bitter,"{""/m/05f_3"": ""Norwegian Language""}","{Drama, Crime Fiction}"
3,9363483,White Of The Eye,"{""/m/02h40lc"": ""English Language""}","{Erotic thriller, Psychological thriller, Thri..."
4,261236,A Woman in Flames,"{""/m/04306rv"": ""German Language""}",{Drama}


Postoje filmovi na raznim jezicima, ali su sažetci radnje preuzeti s engleske Wikipedije, stoga je to u redu.

Pogledajmo koji sve žanrovi filmova postoje u datasetu.

In [33]:
# Union of every movie's genre set: the distinct genre names in the dataset.
all_genres = set().union(*movie_metadata['genre'])

In [34]:
# Number of distinct genre names across all movies.
len(all_genres)

363

In [35]:
from collections import Counter

# Count how many movies carry each genre name. Every movie's genres are a set,
# so a movie contributes at most one to any given genre.
count_genres = Counter(
    label
    for labels in movie_metadata['genre']
    for label in labels
)

In [36]:
# List the 50 most frequent genres with their movie counts, most frequent first.
count_genres.most_common(50)

[('Drama', 34007),
 ('Comedy', 16349),
 ('Romance Film', 10234),
 ('Black-and-white', 9094),
 ('Action', 8798),
 ('Thriller', 8744),
 ('Short Film', 8141),
 ('World cinema', 7155),
 ('Crime Fiction', 6948),
 ('Indie', 6897),
 ('Documentary', 5630),
 ('Horror', 5280),
 ('Silent film', 5250),
 ('Adventure', 4963),
 ('Family Film', 4598),
 ('Action/Adventure', 4561),
 ('Comedy film', 4362),
 ('Musical', 4160),
 ('Animation', 3534),
 ('Romantic drama', 3372),
 ('Mystery', 3195),
 ('Science Fiction', 3052),
 ('Fantasy', 2820),
 ('Romantic comedy', 2680),
 ('War film', 2652),
 ('Japanese Movies', 2322),
 ('Western', 2225),
 ('Crime Thriller', 2122),
 ('Period piece', 1758),
 ('Comedy-drama', 1720),
 ('Film adaptation', 1531),
 ('Chinese Movies', 1525),
 ('Biography', 1392),
 ('Psychological thriller', 1331),
 ('Bollywood', 1321),
 ('Sports', 1206),
 ('Biographical film', 1198),
 ('LGBT', 1163),
 ('Music', 1088),
 ('Family Drama', 1043),
 ('Black comedy', 1025),
 ('Parody', 999),
 ('Televisio