# IMDB Datasets

**Purpose:**

Working with "big" data in Pandas

**Data Source:**

https://datasets.imdbws.com/

In [None]:
import sys

# Print the version string of the running Python interpreter.
print('Python info', sys.version)

In [None]:
import os

# Print the process working directory (spelling of "current" fixed in the message).
print('This is the current directory', os.getcwd())

In [None]:
import datetime

# Capture today's date and the current timestamp once, then report both.
current_date = datetime.date.today()
current_time = datetime.datetime.now()

print("System date/time", current_time)
# Weekday name followed by month/day/year.
print('Current date', current_date.strftime('%A %m/%d/%Y'))
# 12-hour clock with an AM/PM marker.
print('Current time', current_time.strftime('%I:%M:%S %p'))

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# Report the version of each third-party library the notebook relies on.
for label, module in (
    ('Pandas version', pd),
    ('Numpy version', np),
    ('Matplotlib version', mpl),
):
    print(label, module.__version__)

In [None]:
# Display settings applied to every DataFrame rendered below.
# Floats are shown with thousands separators and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.precision', 2)
# Use the fully qualified option names: set_option treats its argument as a
# pattern, and the short forms 'max_rows' / 'max_columns' raise OptionError on
# pandas versions where they also match 'styler.render.max_rows' /
# 'styler.render.max_columns'.
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)

# rcParams is matplotlib's global dictionary of plotting defaults (re-exported by pylab).
from pylab import rcParams
# IPython magic: draw figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height). It is set after the magic, presumably so
# the inline backend's own defaults do not override it -- TODO confirm.
rcParams['figure.figsize'] = 5, 4
# Apply matplotlib's "ggplot" style sheet to the plots drawn below.
plt.style.use('ggplot')

In [None]:
# read directly from a compressed file
import csv

fname = 'https://datasets.imdbws.com/title.basics.tsv.gz'

# The year columns are kept as text and the literal "\N" is parsed as missing.
# quoting=csv.QUOTE_NONE: the dump is plain tab-separated text, so a double quote
# inside a title is ordinary data. With the default quote handling, a title that
# starts with '"' opens a quoted field and the parser reads past tab/newline
# delimiters, shifting or merging rows.
# NOTE(review): this is based on the published IMDb dataset format -- confirm
# the row count against the source file.
title_df = pd.read_table(filepath_or_buffer=fname,
                         sep='\t', na_values='\\N', encoding='utf8',
                         quoting=csv.QUOTE_NONE,
                         dtype={'startYear':str, 'endYear':str})

In [None]:
# Summarize the freshly loaded frame: columns, dtypes, non-null counts, memory use.
title_df.info()

In [None]:
# Preview the first rows of the loaded data.
title_df.head()

In [None]:
# Count the missing values in each column.
title_df.isna().sum()

In [None]:
# Show the rows that have no primaryTitle.
title_df.loc[title_df['primaryTitle'].isna()]

In [None]:
# Remove, in place, every row missing a primary title, original title or start year.
title_df.dropna(subset=['primaryTitle', 'originalTitle', 'startYear'], inplace=True)

In [None]:
# Re-check row and non-null counts after dropping the incomplete rows.
title_df.info()

In [None]:
# Frequency of each isAdult value, with missing values counted as their own bucket.
title_df['isAdult'].value_counts(dropna=False)

In [None]:
# Keep only the rows whose isAdult flag is 0. The explicit copy makes title_df an
# independent frame rather than a slice of the unfiltered data, so later column
# assignments on it do not raise SettingWithCopyWarning.
title_df = title_df[title_df.isAdult == 0].copy()

In [None]:
# Frequency of each titleType, with missing values counted as their own bucket.
title_df['titleType'].value_counts(dropna=False)

In [None]:
# Frequency of each raw genres string, with missing values counted as their own bucket.
title_df['genres'].value_counts(dropna=False)

In [None]:
# Turn the comma-separated genres string into a list of genre names per row.
title_df['genres'] = title_df['genres'].str.split(',')

In [None]:
# Preview the first rows after the genres column was split into lists.
title_df.head()

In [None]:
# Expand each genres list into one row per (title, genre) pair.
# drop=True discards the old index, which explode leaves duplicated, instead of
# inserting it into the frame as a stray "index" column.
title_df = title_df.explode('genres').reset_index(drop=True)
title_df.head()

In [None]:
# Check the row count and dtypes after the explode.
title_df.info()

In [None]:
# Keep only the first row for each tconst, so every title appears once (after the
# earlier explode, that is the row carrying its first listed genre).
title_df.drop_duplicates(subset='tconst', keep='first', inplace=True)
# Renumber the rows 0..n-1; drop=True so the old index is not added to the frame
# as yet another leftover column.
title_df.reset_index(drop=True, inplace=True)
title_df.info()

In [None]:
# Frequency of the genre kept for each title, with missing values counted too.
title_df['genres'].value_counts(dropna=False)

## Web Scraping with Beautiful Soup

In [None]:
import requests
from bs4 import BeautifulSoup

from pprint import pprint
import re

In [None]:
# find the top 250 movies
# Address of the chart page requested below; printed for reference.

URL = 'https://m.imdb.com/chart/top'
print(URL)

In [None]:
# Download the chart page. The timeout keeps the cell from hanging indefinitely
# if the server never answers.
response = requests.get(URL, timeout=30)
# Stop on an HTTP error status (e.g. 403/404) instead of parsing an error page
# as if it were the chart.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Displayed as the cell output: the page's <title> element.
soup.title

In [None]:
# Display the first <h1> element of the parsed page.
soup.h1

In [None]:
# Gather the href of every anchor on the page ('Empty' where an anchor has none),
# then flatten them into one space-separated string for pattern matching.
data = [anchor.get("href", 'Empty') for anchor in soup.find_all("a")]
text_data = " ".join(data)

In [None]:
# finding the title index pattern (tconst)

# Raw string so "\d" reaches the regex engine unchanged; in a plain literal it is
# an invalid escape sequence that recent Python versions warn about.
# The set collapses identifiers that occur in more than one link.
movie_titles = set(re.findall(pattern=r"(tt\d+)", string=text_data))

pprint(movie_titles, compact=True, width=80)
print()
print(len(movie_titles))

In [None]:
# Select the rows of title_df whose tconst is among the scraped identifiers
# ("@" lets query() reference the Python variable), then display them.
top_movies = title_df.query("tconst in @movie_titles")
top_movies

In [None]:
# Horizontal bar chart of how many selected titles fall in each genre, ordered by count.
(
    top_movies['genres']
    .value_counts()
    .sort_values()
    .plot.barh(figsize=(10,6))
);

In [None]:
# Stacked bar chart: one bar per startYear, segmented by genre, counting titles.
(
    top_movies
    .groupby(['startYear','genres'])['tconst']
    .count()
    .unstack('genres')
    .plot.bar(stacked=True, figsize=(18,6))
)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0);

In [None]:
# Show the selected titles from 1995 (startYear was loaded as text, hence the quoted year).
top_movies.query("startYear == '1995'")