## DS 4200 - Analyzing Netflix Trends Through IMDB Scores and Entertainment Characteristics

### Alissa Agnelli, Paulina Acosta, Regina Rabkina

In [91]:
# Importing libraries
import pandas as pd
import numpy as np 
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns

In [92]:
# Load both source datasets from the shared datasets folder
imdb_csv_path = '../datasets/Netflix TV Shows and Movies.csv'
netflix_csv_path = '../datasets/netflix_titles.csv'

imdb_df = pd.read_csv(imdb_csv_path)
# This file is read as ISO-8859-1 instead of the pandas default encoding
netflix_df = pd.read_csv(netflix_csv_path, encoding='ISO-8859-1')

In [93]:
# Inspect the IMDB dataframe: column names, non-null counts, and data types
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5283 entries, 0 to 5282
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              5283 non-null   int64  
 1   id                 5283 non-null   object 
 2   title              5283 non-null   object 
 3   type               5283 non-null   object 
 4   description        5278 non-null   object 
 5   release_year       5283 non-null   int64  
 6   age_certification  2998 non-null   object 
 7   runtime            5283 non-null   int64  
 8   imdb_id            5283 non-null   object 
 9   imdb_score         5283 non-null   float64
 10  imdb_votes         5267 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 454.1+ KB


In [94]:
# Inspect the Netflix dataframe: column names, non-null counts, and data types
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8809 entries, 0 to 8808
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       8809 non-null   object 
 1   type          8809 non-null   object 
 2   title         8809 non-null   object 
 3   director      6175 non-null   object 
 4   cast          7984 non-null   object 
 5   country       7978 non-null   object 
 6   date_added    8799 non-null   object 
 7   release_year  8809 non-null   int64  
 8   rating        8805 non-null   object 
 9   duration      8806 non-null   object 
 10  listed_in     8809 non-null   object 
 11  description   8809 non-null   object 
 12  Unnamed: 12   0 non-null      float64
 13  Unnamed: 13   0 non-null      float64
 14  Unnamed: 14   0 non-null      float64
 15  Unnamed: 15   0 non-null      float64
 16  Unnamed: 16   0 non-null      float64
 17  Unnamed: 17   0 non-null      float64
 18  Unnamed: 18   0 non-null    

### EDA

In [95]:
# Keep only the Netflix columns that the rest of the analysis relies on
netflix_columns = ['title', 'country', 'date_added', 'release_year', 'duration', 'listed_in', 'rating']
netflix_df = netflix_df[netflix_columns]

We decided to select these columns from the netflix dataframe because we deemed these most significant for our analysis of Netflix trends.

In [96]:
# Inner-join the two dataframes, keeping only rows whose title AND release year appear in both
data = imdb_df.merge(netflix_df, how='inner', on=['title', 'release_year'])

In [97]:
# Discard the columns that play no part in our analysis
unused_columns = ['id', 'imdb_id', 'index', 'description', 'age_certification', 'duration']
data = data.drop(columns=unused_columns)

#### Ensuring appropriate data types

In [98]:
# The datasets are merged and trimmed to the relevant columns;
# now give every column an appropriate data type.

# Dates are written like "September 25, 2021"; values that do not match become NaT
data['date_added'] = pd.to_datetime(data['date_added'], format='%B %d, %Y', errors='coerce')

# Nullable Int64 lets imdb_votes stay an integer column even with missing values;
# the remaining columns are converted to categoricals
column_dtypes = {
    'type': 'category',
    'imdb_votes': 'Int64',
    'rating': 'category',
    'country': 'category',
    'listed_in': 'category',
}
data = data.astype(column_dtypes)


In [99]:
# Confirm the converted data types and non-null counts of the merged dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2984 entries, 0 to 2983
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   title         2984 non-null   object        
 1   type          2984 non-null   category      
 2   release_year  2984 non-null   int64         
 3   runtime       2984 non-null   int64         
 4   imdb_score    2984 non-null   float64       
 5   imdb_votes    2979 non-null   Int64         
 6   country       2742 non-null   category      
 7   date_added    2984 non-null   datetime64[ns]
 8   listed_in     2984 non-null   category      
 9   rating        2984 non-null   category      
dtypes: Int64(1), category(4), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 205.5+ KB


In [100]:
# Count the missing values in each column
data.isna().sum()

title             0
type              0
release_year      0
runtime           0
imdb_score        0
imdb_votes        5
country         242
date_added        0
listed_in         0
rating            0
dtype: int64

#### Handling null values

In [101]:
# Fill missing imdb_votes with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['imdb_votes']: that chained form operates on
# an intermediate object, so pandas warns about it and, with Copy-on-Write
# enabled, `data` is left unchanged.
votes_median = data['imdb_votes'].median()
data['imdb_votes'] = data['imdb_votes'].fillna(votes_median)

# Re-check the per-column null counts
data.isna().sum()

title             0
type              0
release_year      0
runtime           0
imdb_score        0
imdb_votes        0
country         242
date_added        0
listed_in         0
rating            0
dtype: int64

Since votes tend to be skewed (some movies get millions, others get a few), using the median value is safer than the mean. So, we will fill the missing imdb_votes with the median value.

Add "Unknown" to the category list before filling missing values

In [102]:
# Register "Unknown" as a valid category so it can be used as a fill value.
# add_categories raises a ValueError when the category already exists, so the
# guard keeps this cell safe to re-run.
if "Unknown" not in data['country'].cat.categories:
    data['country'] = data['country'].cat.add_categories("Unknown")

In [103]:
# Fill missing country values with "Unknown".
# Assign the filled column back rather than using fillna(..., inplace=True) on
# data['country']; the chained in-place form is not guaranteed to update `data`.
data['country'] = data['country'].fillna("Unknown")

# Re-check the per-column null counts
data.isna().sum()

title           0
type            0
release_year    0
runtime         0
imdb_score      0
imdb_votes      0
country         0
date_added      0
listed_in       0
rating          0
dtype: int64

We replaced null values with "Unknown" to indicate missing country data. We do not want to remove certain titles if country data is missing, but we may also want to explore regional trends.

### Visualizations

#### IMDB Score Trends Over Time - Altair Line Chart
We want to analyze Netflix's content strategy. By looking at when Netflix added each title to its platform, we can see whether Netflix is acquiring more high-rated content over time. We can also see whether certain years had a surge in high- or low-rated content being added.

This interactive visualization enhances data exploration by allowing users to engage with IMDB score trends dynamically. 

When hovering over a specific year, the nearest data point is highlighted, making it easier to see how Netflix’s average IMDB scores have changed over time. The tooltip provides exact values for each year, ensuring that stakeholders can quickly access precise information without manually interpreting the chart. Additionally, the interactive highlighting feature ensures that trends are easily distinguishable, improving readability compared to a static plot.

In [None]:
# Year in which each title was added to Netflix
data['year_added'] = data['date_added'].dt.year

# Aggregate average IMDB scores per year
df_avg = data.groupby('year_added', as_index=False)['imdb_score'].mean()

# Hover selection: picks the data point nearest to the mouse by year_added.
# empty=False means nothing is treated as selected until the user hovers.
# selection_point expects a boolean here; the string 'none' is the deprecated
# Altair 4 spelling.
selection = alt.selection_point(fields=['year_added'], nearest=True, on='mouseover', empty=False)

# Base line chart
score_chart = alt.Chart(df_avg).mark_line(point=True, color='#247ba0').encode(
    x=alt.X('year_added:O', title="Year Content Added to Netflix"),
    y=alt.Y('imdb_score:Q', title="Average IMDB Score"),
    tooltip=['year_added', 'imdb_score']
).properties(
    title="IMDB Score Trends for Content Added to Netflix"
)

# Larger circles that are invisible (opacity 0) until their year is hovered
points = score_chart.mark_circle(size=100, color='#247ba0').encode(
    opacity=alt.condition(selection, alt.value(1), alt.value(0))
).add_params(selection)

# Layer the highlight circles over the line and enable pan/zoom
interactive_chart = (score_chart + points).interactive()
interactive_chart

#interactive_chart.save('imdb_score_trends.html')

### IMDB Score Trends Over Time: Insights into Netflix's Content Strategy
The trends in IMDB scores for Netflix's content over the years reveal several key insights into the company’s evolving content strategy:

* Early Focus on Premium Content (2009–2010): In its early years, Netflix prioritized acquiring high-rated content, indicating a strategy aimed at attracting discerning viewers and establishing itself as a premium service. This focus on quality helped differentiate Netflix from competitors and likely contributed to building initial brand loyalty.

* Shift to Volume and Broad Appeal (2011–2015): The decline in IMDB scores during 2011 and again in 2015 suggests that Netflix strategically expanded its library by adding a higher volume of content to remain competitive as new streaming platforms emerged. This shift was likely driven by the need to offer a broader range of options, attracting a wider audience even if it meant sacrificing some content quality.

* Recovery with a Focus on High-Quality Originals (2013–2014): Between 2013 and 2014, the rebound in IMDB scores indicates a strategic pivot towards acquiring or producing higher-rated content. Netflix began investing heavily in original programming, with shows like House of Cards and Orange is the New Black, marking its shift toward exclusive, high-quality offerings that would become a cornerstone of its strategy.

* Response to Growing Competition (2015): The drop in scores during 2015 coincides with the rise of competing streaming platforms, suggesting that Netflix may have focused on increasing content volume to counteract competitive pressures. While this likely impacted the average content quality, it was a necessary strategy to maintain a broad catalog to attract and retain subscribers.

* Stabilization and Balanced Approach (2017–2021): From 2017 to 2021, the stabilization of IMDB scores around 6.5 reflects Netflix’s ability to balance content quality with quantity. This period demonstrates the company’s refined approach of offering a mix of original content, licensed material, and niche programming to cater to diverse viewer preferences without compromising on the overall quality of its library.

#### Takeaways:

* Netflix’s ability to adjust its content strategy in response to market pressures—such as rising competition and evolving customer expectations—demonstrates its agility. The company shifted from a high-quality, low-volume strategy to one focused on content volume, and then found a balance of both, enabling it to maintain its competitive edge.

* The evolution in Netflix’s content strategy highlights the growing importance of original programming. The rebound in IMDB scores and the subsequent stabilization were driven by Netflix’s significant investment in exclusive, high-quality shows, positioning it as a leader in premium streaming content.

* Netflix’s strategy reflects an understanding of the diverse tastes of its global audience. By offering a broad range of content while maintaining consistent quality, Netflix ensured it could satisfy both its existing subscriber base and new customers.

Ultimately, the fluctuations in IMDB scores provide valuable insights into Netflix’s strategic evolution—moving from a focus on high-rated acquisitions, to content volume expansion, and finally to a balanced strategy that combines quality with quantity. This flexibility has allowed Netflix to maintain its leadership in the competitive streaming market.

#### Relationship Between Media Runtime and IMDB Scores - Altair Box plot

We first will categorize the media runtimes into three categories: Short, Medium, and Long. Then, we want to use a box plot to see how the ratings (IMDB score) differ across these length categories.

In [105]:
# Bin runtime values into length categories. Because right=False is passed to
# pd.cut, each bin includes its lower edge and excludes its upper edge:
#   Short:  0 <= runtime < 90
#   Medium: 90 <= runtime < 150
#   Long:   runtime >= 150
bins = [0, 90, 150, float('inf')]  # Runtime length bin edges
labels = ['Short', 'Medium', 'Long']  # Labels for the categories

# Create a new column 'length_category' based on these bins
data['length_category'] = pd.cut(data['runtime'], bins=bins, labels=labels, right=False)

# Create the Altair Box Plot.
# sort=labels keeps the x-axis in Short -> Medium -> Long order; a nominal
# field would otherwise be sorted alphabetically (Long, Medium, Short).
box_plot = alt.Chart(data).mark_boxplot().encode(
    x=alt.X('length_category:N', title='Media Length Category', sort=labels),
    y=alt.Y('imdb_score:Q', title='IMDB Score'),
    tooltip=['length_category', 'imdb_score']
).properties(
    title='Distribution of IMDB Scores by Media Length Category'
)

# Show the plot
box_plot

#box_plot.save('imdb_score_by_length.png')

### Relationship Between Media Runtime and IMDB Scores

Our analysis explores whether media runtime influences IMDB ratings. By categorizing media into Short, Medium, and Long based on runtime and visualizing their ratings using a box plot, we gain insights into how different content lengths perform in terms of audience reception.

#### Takeaways:

1. Consistent Median Ratings Across Categories 
   - The median IMDB score remains relatively stable across all three categories, indicating that longer or shorter media does not inherently lead to higher or lower ratings.  

2. Ratings Are Spread Similarly for All Lengths
   - The distribution of ratings (Interquartile Range) is similar for Short, Medium, and Long media, suggesting that runtime does not significantly impact rating variability.  

3. Presence of Low-Rated Outliers Across All Categories  
   - Each category contains low-rated outliers (IMDB scores below 3), suggesting that content quality—not just runtime—plays a key role in audience perception.  

4. Slightly Higher Maximum Ratings for Short Media  
   - The highest-rated media appear more frequently in the Short category, suggesting that shorter content may have a higher potential for strong audience reception. However, this difference is not drastic.  

#### Implications:  
- Runtime alone is not a strong predictor of success. Instead, other factors such as storytelling, production quality, and genre are likely more influential in determining IMDB ratings.  
- Investment in short-form content may yield higher highs, but it does not guarantee better ratings overall.  
- A diverse content strategy remains valuable, as high and low ratings exist across all runtime categories.  


#### IMDB Score Trends by Age Rating - Altair Box Plot

In [106]:
# One row carries a runtime string ('84 min') in the rating column; leave it out
is_misplaced_runtime = data['rating'] == '84 min'
age_data = data[~is_misplaced_runtime]

In [107]:
# List the distinct rating values that remain
ratings_lst = age_data['rating'].unique().tolist()
print(ratings_lst)

['PG', 'G', 'R', 'TV-14', 'TV-MA', 'TV-G', 'TV-PG', 'PG-13', 'TV-Y7', 'TV-Y', 'NC-17', 'NR']


In [108]:
# Exclude content that is not rated ('NR')
is_not_rated = age_data['rating'] == 'NR'
age_data = age_data[~is_not_rated]

In [109]:
# Age ratings ordered from youngest to most mature audience; this order is used
# for the boxplots along the x-axis.
# TV-Y (suitable for all children) comes before TV-Y7 (ages 7 and up).
all_age_ratings = ['TV-Y', 'TV-Y7', 'TV-G', 'G', 'TV-PG', 'PG', 'PG-13', 'TV-14', 'R', 'TV-MA', 'NC-17']
# The same ordering, split by rating system (TV ratings vs. film ratings)
tv_age_ratings = ['TV-Y', 'TV-Y7', 'TV-G', 'TV-PG', 'TV-14', 'TV-MA']
movie_age_ratings = ['G', 'PG', 'PG-13', 'R', 'NC-17']

In [110]:
# Label each title as 'TV' or 'Movie' depending on which rating system its
# age rating belongs to.
# assign() returns a new dataframe, so no column is written into a filtered
# slice of `data` (which would trigger pandas' SettingWithCopyWarning), and
# isin() performs the membership check in one vectorised step.
age_data = age_data.assign(
    content_type=np.where(age_data['rating'].isin(tv_age_ratings), 'TV', 'Movie')
)

In [111]:
# Dropdown widget for choosing which type of content to show
content_type_options = ['TV', 'Movie']
input_dropdown = alt.binding_select(name='Type of Content:', options=content_type_options)

In [112]:
# Parameter bound to the dropdown; its initial value is 'TV'
selection = alt.param(value='TV', bind=input_dropdown)

In [113]:
# Side-by-side boxplots of the IMDB score distribution for each age rating,
# limited to the content type currently chosen in the dropdown
age_boxplots = (
    alt.Chart(age_data)
    .mark_boxplot()
    .encode(
        x=alt.X('rating:N', sort=all_age_ratings, title='Rating'),
        y=alt.Y('imdb_score:Q', title='IMDB Score'),
        tooltip=['rating:N', 'imdb_score:Q'],
    )
    .add_params(selection)
    .transform_filter(alt.datum.content_type == selection)
    .properties(
        title='Boxplot of IMDB Score by Age Rating for Netflix Content',
        width=600,
        height=400,
    )
)

In [114]:
# Render the age-rating boxplot chart in the notebook output
age_boxplots.display()

In [115]:
#age_boxplots.save('agebox.html')

In [116]:
#age_data.to_csv('age_data.csv', index=False)

Scatterplot of Number of Releases vs. Average IMDB Ratings

In [117]:
# Group the data by release year and type, then count titles and average scores.
# `type` is a categorical column, so observed=True is passed to keep only the
# (year, type) pairs that actually occur. With observed=False, groupby also
# emits a row for every unobserved pair, with zero releases and a NaN average.
df_plot = (
    data.groupby(['release_year', 'type'], observed=True)
    .agg(
        num_releases=('title', 'count'),
        avg_rating=('imdb_score', 'mean'),
    )
    .reset_index()
)

In [118]:
# NOTE: this cell repeats the aggregation from the cell directly above it;
# one of the two can be removed.
# Group the data by release year and type, then count titles and average scores.
# `type` is a categorical column, so observed=True is passed to keep only the
# (year, type) pairs that actually occur. With observed=False, groupby also
# emits a row for every unobserved pair, with zero releases and a NaN average.
df_plot = (
    data.groupby(['release_year', 'type'], observed=True)
    .agg(
        num_releases=('title', 'count'),
        avg_rating=('imdb_score', 'mean'),
    )
    .reset_index()
)

In [119]:
# Scatter plot: average IMDb rating per release year, one circle per (year, type),
# sized by the number of releases

# Pad the y-axis half a point beyond the lowest and highest average rating
rating_axis_domain = [df_plot['avg_rating'].min()-0.5, df_plot['avg_rating'].max()+0.5]

year_axis = alt.X('release_year:O', title='Release Year', sort='ascending')
rating_axis = alt.Y('avg_rating:Q', title='Average IMDb Rating', scale=alt.Scale(domain=rating_axis_domain))
release_size = alt.Size('num_releases:Q', title='Number of Releases', scale=alt.Scale(range=[10, 300]))
# Fixed colours: movies in blue, shows in red
type_color = alt.Color('type:N', title='Type', scale=alt.Scale(domain=['MOVIE', 'SHOW'], range=['blue', 'red']))

scatter_plot = (
    alt.Chart(df_plot)
    .mark_circle()
    .encode(
        x=year_axis,
        y=rating_axis,
        size=release_size,
        color=type_color,
        tooltip=['release_year', 'num_releases', 'avg_rating', 'type'],
    )
    .properties(
        title="Number of Releases vs. Average Rating by Year (Movies & TV Shows)",
        width=800,
        height=500,
    )
)

scatter_plot.show()

Scatterplot of IMDB Ratings vs. Number of Reviews

In [None]:
# Keep only the columns this chart uses and drop rows with missing values
# ('title' feeds the tooltip, 'type' the colour)
df_reviews = data[['title', 'imdb_score', 'imdb_votes', 'type']].dropna()

# Interval (brush) selection: drag along the x-axis to highlight a range of
# review counts; points outside the brushed range are faded
selection = alt.selection_interval(encodings=['x'])

# Create the scatterplot
review_scatterplot = (
    alt.Chart(df_reviews)
    .mark_circle()
    .encode(
        x=alt.X("imdb_votes:Q", title="Number of Reviews", scale=alt.Scale(type="log")),  # Log scale for better visualization
        y=alt.Y("imdb_score:Q", title="IMDb Rating"),
        color=alt.Color("type:N", title="Type", scale=alt.Scale(scheme='category10')),  # Different colors for Movie/TV Show
        opacity=alt.condition(selection, alt.value(1), alt.value(0.1)),
        # Show which title each point represents on hover
        tooltip=['title', 'imdb_score', 'imdb_votes', 'type'],
    )
    .add_params(selection)
    .properties(width=600, height=400, title="IMDb Ratings vs. Number of Reviews")
)

# Render the chart in the notebook
review_scatterplot

#review_scatterplot.save('imdb_ratings_vs_reviews.html')