# Workspace setup

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Netflix / Amazon Prime Video catalogue from CSV and preview the first rows.
DATA_PATH = "../input/popular-movies-and-tv-shows-amazon-prime-netflix/Popular Movies TV shows from Prime Videos Netflix version_3.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Judging by these values, the Rating column holds an age restriction (recommendation)
# rather than a quality score; the quality scores live in the IMDb and Rotten Tomatoes columns.

df.Rating.unique()

In [None]:
# Count the NaN values in the Rating column. If there are many, we need to decide how to
# treat them (e.g. convert them to 'all' or to an average value).

df.Rating.isna().sum()

In [None]:
# Compare the genres of the whole dataset with the genres of the titles that have no
# age rating. The genres are the same, so NaN ratings are missing values rather than
# an implicit 'all' category.

dfUnrated = df[df.Rating.isna()]
print("Unique genres overall:", df.Genre.unique())
# Label the second line distinctly: it lists the unrated subset, not the whole dataset.
print("Unique genres among unrated titles:", dfUnrated.Genre.unique())

In [None]:
# Numeric values allow a more accurate analysis: translate every age-restriction label
# into its minimum age ('all' becomes 0). Values without an entry in the mapping become NaN.

convert = {
    'all': 0,
    '7+': 7,
    '13+': 13,
    '16+': 16,
    '18+': 18,
}
df['Rating'] = df['Rating'].map(convert)
df['Rating'].unique()

In [None]:
# The dataset appears to contain only titles that are on either Netflix or Amazon Prime
# Video, so a single column naming the streaming service would be easier to read than two
# flag columns. First check how many titles are on both services or on neither.

netflix_flag = df.Netflix
prime_flag = df["Amazon Prime Video"]

tempDF = df[(netflix_flag == 1) & (prime_flag == 1)]
print(f'On both services: {tempDF.shape[0]}')
tempDF = df[(netflix_flag == 0) & (prime_flag == 0)]
print(f'On neither of these services: {tempDF.shape[0]}')



In [None]:
# Collapse the two availability flags into one label column. Every row whose Netflix
# flag is not 1 is labelled "Amazon Prime Video".
streamingService = [
    "Netflix" if netflix_flag == 1 else "Amazon Prime Video"
    for netflix_flag in df.Netflix
]

df['StreamingService'] = streamingService
# The flag columns are redundant once the label column exists.
df.drop(columns=['Netflix', 'Amazon Prime Video'], inplace=True)

In [None]:
# Most of these entries are from Amazon Prime Video, but there is still enough data to
# analyse Netflix as well.

from collections import Counter

# Tally the titles per streaming service, most frequent first.
counter = Counter(df['StreamingService'])
service_tally = counter.most_common()
print(service_tally)

In [None]:
# There is a 'd;}' value in the IMDb column, which looks like a missing value or some
# kind of corrupted entry.

df.IMDb.unique()

In [None]:
# The 'd;}' rating does not occur often, so it is best to replace it with NaN.
# The count is printed before and after the replacement to confirm it is gone.
BAD_IMDB_VALUE = 'd;}'

print((df['IMDb'] == BAD_IMDB_VALUE).sum())

df['IMDb'] = df['IMDb'].replace(BAD_IMDB_VALUE, np.nan)

print((df['IMDb'] == BAD_IMDB_VALUE).sum())

In [None]:
# Ratings on Rotten Tomatoes and IMDb were object entries; converting them to floats
# enables further analysis. This cell converts the IMDb column.

df.IMDb = df.IMDb.astype(float)

In [None]:
# Impute missing IMDb scores with the column mean (the mean ignores NaN entries).
imdb_mean = df['IMDb'].mean()
df['IMDb'] = df['IMDb'].fillna(imdb_mean)


In [None]:
# Re-inspect dtypes and non-null counts after the IMDb column was converted and imputed.
df.info()

In [None]:
# Clean the Rotten Tomatoes column in three steps: turn the 'na' placeholder into NaN,
# convert the column to float, then impute the gaps with the column mean.
rt_scores = df['Rotten Tomatoes'].replace('na', np.nan).astype(float)
rt_mean = rt_scores.mean()
df['Rotten Tomatoes'] = rt_scores.fillna(rt_mean)


In [None]:
# Netflix shows and movies seem to have a better overall rating on both rating sites.
# Print the mean score per streaming service, first for IMDb, then for Rotten Tomatoes.

for score_column in ('IMDb', 'Rotten Tomatoes'):
    for service, label in (('Netflix', 'a Netflix'), ('Amazon Prime Video', 'an Amazon')):
        service_mean = df.loc[df.StreamingService == service, score_column].mean()
        print(f'Avarage score for {label} shows is({score_column}): {round(service_mean, 2)}')


# Analytics

In [None]:
# The next two tables show that the users of these two rating sites have very different tastes:
# not a single movie or show appears in both top-10 lists.
# This table: the ten titles with the highest IMDb score.

df.sort_values(by=['IMDb'], ascending=False).head(10)

In [None]:
# On IMDb, the top titles are usually documentaries, while on Rotten Tomatoes the leading genre seems to be Action & Adventure.
# This suggests that Rotten Tomatoes is used mostly by younger generations.
# Titles ranked highly on Rotten Tomatoes usually have good IMDb scores as well, while the top IMDb titles have awful Rotten Tomatoes scores.
# This table: the ten titles with the highest Rotten Tomatoes score.

df.sort_values(by=['Rotten Tomatoes'], ascending=False).head(10)

In [None]:
# Plot IMDb score against Rotten Tomatoes score for every title. The ratings look
# really chaotic in this scatterplot, so the two scores are analysed separately.

# x and y are passed by keyword: positional data arguments were deprecated in
# seaborn 0.11 and raise a TypeError from 0.12 onwards.
sns.scatterplot(x=df.IMDb, y=df['Rotten Tomatoes'], color='red')

In [None]:
# The genre list also contains US and Japanese animation, which are not genres in the
# strict sense but can still be used for analytics.

df.Genre.unique()

In [None]:
# Print how many titles each genre has. Anime has only 16 titles, so there is no point
# in doing any analytics with it.

for genre in df['Genre'].unique():
    title_count = int((df['Genre'] == genre).sum())
    print("Number of titles in " + genre + ": " + str(title_count))

In [None]:
# Keep only the rows whose genre is not "Anime".
df = df[df.Genre != "Anime"]

In [None]:
# Re-inspect the frame after the Anime rows were removed.
df.info()

In [None]:
# Plot release year against each score. On IMDb, older titles get really high ratings;
# on Rotten Tomatoes the pattern seems to be the opposite and older titles get worse ratings.

# x and y are passed by keyword: positional data arguments were deprecated in
# seaborn 0.11 and raise a TypeError from 0.12 onwards.
sns.scatterplot(x='Year', y='IMDb', data=df)
plt.show()
sns.scatterplot(x='Year', y='Rotten Tomatoes', data=df)
plt.show()

In [None]:
# Mean IMDb score per age-rating category. Preference in age ratings differs slightly
# depending on the rating site.

print("Ratings IMDb")
print()
# NaN never compares equal to anything (not even itself), so `Rating != np.nan` is True
# for every row and filters nothing. dropna() is what actually skips titles without an
# age rating; otherwise a "nan+" category is printed with the average of an empty selection.
for i in df.Rating.dropna().unique():
    print("Avarage rating for category " + str(i) + "+ is: " + str(np.average(df[df.Rating == i].IMDb)))

In [None]:
# Mean Rotten Tomatoes score per age-rating category. The ratings on Rotten Tomatoes
# seem to be a little different in some categories.

print("Ratings Rotten Tomatoes")
print()
# NaN never compares equal to anything (not even itself), so `Rating != np.nan` is True
# for every row and filters nothing. dropna() is what actually skips titles without an
# age rating; otherwise a "nan+" category is printed with the average of an empty selection.
for i in df.Rating.dropna().unique():
    print("Avarage rating for category " + str(i) + "+ is: " + str(np.average(df[df.Rating == i]['Rotten Tomatoes'])))

In [None]:
# Finally, the top titles per genre on both rating sites.
# This cell: up to ten titles per genre with the highest Rotten Tomatoes score.
print("Top ratings Rotten Tomatoes")
print()

for genre in df.Genre.unique():

    print(genre)
    print("Top movies:")
    top_rows = df[df.Genre == genre].sort_values(by=['Rotten Tomatoes'], ascending=False).head(10)
    # Read the score from the ranked row itself. Looking it up again by title would
    # average over every row sharing that title (e.g. the same title listed under
    # another genre) and would rescan the whole frame for each printed line.
    for title, rating in zip(top_rows.Title, top_rows['Rotten Tomatoes']):
        print(title + ", " + str(rating))

    print()

In [None]:
# Up to ten titles per genre with the highest IMDb score.
print("Top movies IMDb")
print()

for genre in df.Genre.unique():
    print(genre)
    print("Top movies:")
    top_rows = df[df.Genre == genre].sort_values(by=['IMDb'], ascending=False).head(10)
    # Read the score from the ranked row itself. Looking it up again by title would
    # average over every row sharing that title (e.g. the same title listed under
    # another genre) and would rescan the whole frame for each printed line.
    for title, rating in zip(top_rows.Title, top_rows['IMDb']):
        print(title + ", " + str(rating))

    print()