# Comparison of content between Netflix and Amazon Prime
## 1. Introduction
The online streaming domain has been heating up with the entry of Disney+, Apple, HBO Max and NBC Peacock. However, Netflix remains the biggest player in the market with Amazon Prime Video trailing behind it. 

In this project I have compared the content between Amazon and Netflix to gather insights into this online streaming war.

In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import seaborn as sns
import json
import time

## 2. Data Gathering
The catalogs for both Amazon Prime Video and Netflix have been scraped from reelgood.com, an online streaming aggregator that helps one watch all the online content in one place. <br>
The genres of the TV shows and movies have been scraped from Finder.com, a service used for comparing products such as credit cards and mortgages.

In [22]:
# Extracting the Netflix content from reelgood.com using Beautiful Soup.
# Every catalogue page is parsed into its own dataframe and the pages are
# combined once at the end, so a failed or empty page can never re-append the
# rows of an earlier page (or of an earlier run of this cell) to df.
start = time.time()

scraped_columns = ['title', 'year', 'age_group', 'imdb', 'rotten_tomato']
page_frames = []  # one dataframe per successfully scraped page

for page in range(0, 5801, 50):  # offsets of all the pages in the website

    print(page, end=',')  # for telling the status of the current iteration

    # Wait a random 5-24 seconds between two requests
    time.sleep(np.random.randint(5, 25))

    try:
        # URL for the reelgood website
        url = 'https://reelgood.com/source/netflix?offset=' + str(page)

        # Extracting the HTML elements with Beautiful soup; the timeout keeps
        # one stalled connection from blocking the whole scrape, and an HTTP
        # error status is reported instead of being parsed as an empty page
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Search the page once instead of once per extracted value
        title_cells = soup.find_all('td', class_="css-1u7zfla e126mwsw1")
        detail_cells = soup.find_all('td', class_="css-1u11l3y")

        # Each title is followed by four detail cells, in this order:
        # year, age group, imdb rating, rotten tomatoes rating
        rows = []
        for i, title_cell in enumerate(title_cells):
            rows.append({
                'title': title_cell.find('a').contents[0],
                'year': detail_cells[4 * i].contents[0],
                'age_group': detail_cells[4 * i + 1].contents[0],
                'imdb': detail_cells[4 * i + 2].contents[0],
                'rotten_tomato': detail_cells[4 * i + 3].contents[0],
            })

    except Exception as err:
        # Best effort: report the page and carry on with the next one.
        # Unlike a bare except, this still lets Ctrl+C stop the scrape.
        print('Error on page:', page, repr(err))
        continue

    # A page without titles contributes nothing
    if rows:
        page_frames.append(pd.DataFrame(rows, columns=scraped_columns))

# Combining the pages in one go; df is always (re)defined by this cell
if page_frames:
    df = pd.concat(page_frames)
else:
    df = pd.DataFrame(columns=scraped_columns)

end = time.time()

print(round(end - start, 0), 's')
df.shape

0,50,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000,1050,1100,1150,1200,1250,1300,1350,1400,1450,1500,1550,1600,1650,1700,1750,1800,1850,1900,1950,2000,2050,2100,2150,2200,2250,2300,2350,2400,2450,2500,2550,2600,2650,2700,2750,2800,2850,2900,2950,3000,3050,3100,3150,3200,3250,3300,3350,3400,3450,3500,3550,3600,3650,3700,3750,3800,3850,3900,3950,4000,4050,4100,4150,4200,4250,4300,4350,4400,4450,4500,4550,4600,4650,4700,4750,4800,4850,4900,4950,5000,5050,5100,5150,5200,5250,5300,5350,5400,5450,5500,5550,5600,5650,5700,5750,5800,2185.0 s


(5804, 5)

In [23]:
# Save the scraped Netflix catalogue as a CSV file (row index left out)
df.to_csv(
    path_or_buf=r'C:\Users\srini\Projects\Online Streaming\netflix_shows.csv',
    index=False,
)

In [27]:
# Extracting the Amazon Prime Video content from reelgood.com using Beautiful Soup.
# Every catalogue page is parsed into its own dataframe and the pages are
# combined once at the end, so a failed or empty page can never re-append the
# rows of an earlier page, and a df left over from another cell is never reused.
start = time.time()

scraped_columns = ['title', 'year', 'age_group', 'imdb', 'rotten_tomato']
page_frames = []  # one dataframe per successfully scraped page

for page in range(0, 15651, 50):  # offsets of all the pages in the website

    print(page, end=',')  # for telling the status of the current iteration

    # Wait a random 8-13 seconds between two requests
    time.sleep(np.random.randint(8, 14))

    try:
        # URL for the reelgood website
        url = 'https://reelgood.com/source/amazon?offset=' + str(page)

        # Extracting the HTML elements with Beautiful soup; the timeout keeps
        # one stalled connection from blocking the whole scrape, and an HTTP
        # error status is reported instead of being parsed as an empty page
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Search the page once instead of once per extracted value
        title_cells = soup.find_all('td', class_="css-1u7zfla e126mwsw1")
        detail_cells = soup.find_all('td', class_="css-1u11l3y")

        # Each title is followed by four detail cells, in this order:
        # year, age group, imdb rating, rotten tomatoes rating
        rows = []
        for i, title_cell in enumerate(title_cells):
            rows.append({
                'title': title_cell.find('a').contents[0],
                'year': detail_cells[4 * i].contents[0],
                'age_group': detail_cells[4 * i + 1].contents[0],
                'imdb': detail_cells[4 * i + 2].contents[0],
                'rotten_tomato': detail_cells[4 * i + 3].contents[0],
            })

    except Exception as err:
        # Best effort: report the page and carry on with the next one.
        # Unlike a bare except, this still lets Ctrl+C stop the scrape.
        print('Error on page:', page, repr(err))
        continue

    # A page without titles contributes nothing
    if rows:
        page_frames.append(pd.DataFrame(rows, columns=scraped_columns))

# Combining the pages in one go; df is always (re)defined by this cell
if page_frames:
    df = pd.concat(page_frames)
else:
    df = pd.DataFrame(columns=scraped_columns)

end = time.time()

print(round(end - start, 0), 's')
df.shape

#Exporting the data to local hard drive
df.to_csv(r'C:\Users\srini\Projects\Online Streaming\amazon.csv', index=False)

0,50,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000,1050,1100,1150,1200,1250,1300,1350,1400,1450,1500,1550,1600,1650,1700,1750,1800,1850,1900,1950,2000,2050,2100,2150,2200,2250,2300,2350,2400,2450,2500,2550,2600,2650,2700,2750,2800,2850,2900,2950,3000,3050,3100,3150,3200,3250,3300,3350,3400,3450,3500,3550,3600,3650,3700,3750,3800,3850,3900,3950,4000,4050,4100,4150,4200,4250,4300,4350,4400,4450,4500,4550,4600,4650,4700,4750,4800,4850,4900,4950,5000,5050,5100,5150,5200,5250,5300,5350,5400,5450,5500,5550,5600,5650,5700,5750,5800,5850,5900,5950,6000,6050,6100,6150,6200,6250,6300,6350,6400,6450,6500,6550,6600,6650,6700,6750,6800,6850,6900,6950,7000,7050,7100,7150,7200,7250,7300,7350,7400,7450,7500,7550,7600,7650,7700,7750,7800,7850,7900,7950,8000,8050,8100,8150,8200,8250,8300,8350,8400,8450,8500,8550,8600,8650,8700,8750,8800,8850,8900,8950,9000,9050,9100,9150,9200,9250,9300,9350,9400,9450,9500,9550,9600,9650,9700,9750,9800,9850,9900,9950,10000,10050,10100,10150

In [28]:
#Returns the movie/tv show genre and other details from finder.com

def genre_extract(url, timeout=30):
    """Return the first HTML table of a web page as a dataframe.

    Used to fetch the movie/tv show genre and other details from finder.com.

    Args:
        url: address of the finder.com page to download.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        Dataframe built from the first table that pandas finds in the page.

    Raises:
        requests.RequestException: if the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: raised by ``pd.read_html`` when the page has no table.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on an error page right away instead of trying to parse it
    response.raise_for_status()
    # read_html returns one dataframe per table; only the first one is used
    return pd.read_html(response.content)[0]

In [29]:
# Genre tables from finder.com, one download per platform and content type

# Netflix: TV shows
df_netflix_tv = genre_extract(url='https://www.finder.com/netflix-tv-shows')

# Netflix: movies
df_netflix_movie = genre_extract(url='https://www.finder.com/netflix-movies')

# Amazon Prime: movies
df_amazon_movie = genre_extract(url='https://www.finder.com/amazon-prime-movies')

# Amazon Prime: TV shows
df_amazon_tv = genre_extract(url='https://www.finder.com/amazon-prime-tv-shows')

## 3. Data Wrangling
### 3.1 Merging dataframes
Combining Netflix and Amazon into a common dataframe to help with analysis

In [30]:
# Load the two scraped catalogues back from the local hard drive
df_netflix = pd.read_csv(
    filepath_or_buffer=r'C:\Users\srini\Projects\Online Streaming\netflix_shows.csv'
)
df_amazon = pd.read_csv(
    filepath_or_buffer=r'C:\Users\srini\Projects\Online Streaming\amazon.csv'
)

In [31]:
# Preview the first rows of the Netflix catalogue
df_netflix.head()

Unnamed: 0,title,year,age_group,imdb,rotten_tomato
0,Breaking Bad,2008,18+,9.5,96%
1,Inception,2010,13+,8.8,87%
2,Back to the Future,1985,7+,8.5,96%
3,The Matrix,1999,18+,8.7,88%
4,The Silence of the Lambs,1991,18+,8.6,96%


In [33]:
# Tag every row with the platform it was scraped from
df_netflix.loc[:, 'streaming'] = 'Netflix'
df_amazon.loc[:, 'streaming'] = 'Amazon'

In [34]:
# Preview the first rows of the Amazon catalogue, including the new streaming column
df_amazon.head()

Unnamed: 0,title,year,age_group,imdb,rotten_tomato,streaming
0,The Silence of the Lambs,1991,18+,8.6,96%,Amazon
1,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,Amazon
2,The Pianist,2002,18+,8.5,95%,Amazon
3,The Avengers,2012,13+,8.0,92%,Amazon
4,Knives Out,2019,13+,7.9,97%,Amazon


In [35]:
# Stack the Netflix rows on top of the Amazon rows in a single dataframe
df = pd.concat(objs=[df_netflix, df_amazon], axis=0)
df.head(n=5)

Unnamed: 0,title,year,age_group,imdb,rotten_tomato,streaming
0,Breaking Bad,2008,18+,9.5,96%,Netflix
1,Inception,2010,13+,8.8,87%,Netflix
2,Back to the Future,1985,7+,8.5,96%,Netflix
3,The Matrix,1999,18+,8.7,88%,Netflix
4,The Silence of the Lambs,1991,18+,8.6,96%,Netflix


In [36]:
# Number of rows and columns of the dataframe
df.shape

(21504, 6)

### 3.2 Finding duplicates


In [40]:
# Counting the rows that are exact duplicates of an earlier row
df.duplicated().sum()

2

In [41]:
# Displaying the rows that are exact duplicates of an earlier row
df[df.duplicated()]

Unnamed: 0,title,year,age_group,imdb,rotten_tomato,streaming
3938,El día menos pensado,2020,,7.3,,Netflix
4612,Lucid Dream,2017,,6.1,,Netflix


In [43]:
# Drop repeated rows in place, keeping the first occurrence of each
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [44]:
# Checking that no duplicate rows remain (expected result: 0)
df.duplicated().sum()

0

### 3.3 Resetting the index
Since we concatenated 2 dataframes we need to remove the duplicate indices.

In [48]:
# Renumber the rows in place; the previous index is kept as an 'index' column
df.reset_index(drop=False, inplace=True)
df.head(n=1)

Unnamed: 0,index,title,year,age_group,imdb,rotten_tomato,streaming
0,0,Breaking Bad,2008,18+,9.5,96%,Netflix


In [49]:
# Remove the 'index' column in place
df.drop(labels='index', axis='columns', inplace=True)
df.head(n=1)

Unnamed: 0,title,year,age_group,imdb,rotten_tomato,streaming
0,Breaking Bad,2008,18+,9.5,96%,Netflix


### 3.4 Changing data type for the Rotten Tomatoes column
Converting the Rotten Tomatoes column from percentage strings (e.g. `96%`) to float values.

In [57]:
def rt_float_extract(x):
    """Convert a single Rotten Tomatoes rating to a float.

    The whole number in front of the percent sign is used, so '5%' gives 5.0
    and '100%' gives 100.0. Values that are already numeric are returned as
    floats, which makes it safe to apply the function twice.

    Input: single Rotten Tomatoes rating value, e.g. '96%'
    Output: float value, or np.nan when the value holds no number
    """
    if isinstance(x, str):
        # Drop surrounding whitespace and the trailing percent sign
        x = x.strip().rstrip('%')
    try:
        return float(x)
    except (TypeError, ValueError):
        # None, empty strings and non-numeric text carry no rating
        return np.nan

In [59]:
# Convert every Rotten Tomatoes rating to a float with rt_float_extract
df['rotten_tomato'] = df['rotten_tomato'].apply(rt_float_extract)

In [60]:
# Preview the first two rows of the dataframe
df.head(2)

Unnamed: 0,title,year,age_group,imdb,rotten_tomato,streaming
0,Breaking Bad,2008,18+,9.5,96.0,Netflix
1,Inception,2010,13+,8.8,87.0,Netflix


In [61]:
# List the distinct values found in the age_group column
df['age_group'].unique()

array(['18+', '13+', '7+', '16+', nan, 'all'], dtype=object)