In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Word embedding imports.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Default imports.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization imports.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Graphing import.
import networkx as nx

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only Kaggle input tree and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Functions

In [None]:
def process_dates(df):
    """Convert the tokenised `date_added` column into (month, year) tuples.

    Each entry of ``df['date_added']`` is expected to be a list of string
    tokens obtained by splitting the raw date on commas/whitespace, e.g.
    ``['September', '9', '2019']``. Empty tokens (left behind by leading or
    trailing whitespace in the raw value) are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date_added`` column of token lists.

    Returns
    -------
    pandas.Series
        Object Series aligned with ``df.index``. An entry is

        * ``(month, year)`` as ints for a ``[month name, day, year]`` list,
        * the two tokens re-joined with a space for a two-token list
          (this is how ``'Not Specified'`` is restored), or
        * ``'Not Specified'`` for anything that cannot be parsed.
    """

    MONTH_INTS = {'January':  1, 'February':  2, 'March':      3,
                  'April':    4, 'May':       5, 'June':       6,
                  'July':     7, 'August':    8, 'September':  9,
                  'October': 10, 'November': 11, 'December':  12,
    }
    NOT_SPECIFIED = 'Not Specified'

    date_values = []
    for value_ls in df['date_added']:
        # Anything that is not a token list (e.g. an unfilled NaN) is unparseable.
        if not isinstance(value_ls, (list, tuple)):
            date_values.append(NOT_SPECIFIED)
            continue

        # Drop the empty strings produced by leading/trailing separators so that
        # ' September 9, 2019' and 'September 9, 2019 ' parse the same way.
        tokens = [token for token in value_ls if token]

        if len(tokens) == 3 and tokens[0] in MONTH_INTS and tokens[2].isdigit():
            # Converts date into a tuple of ints (month, year); the day is discarded.
            value = (MONTH_INTS[tokens[0]], int(tokens[2]))
        elif len(tokens) == 2:
            # Re-joins 'Not Specified' back into a single string.
            value = ' '.join(tokens)
        else:
            # Every row gets its own value: never fall through with the
            # previous row's result still bound to `value`.
            value = NOT_SPECIFIED

        date_values.append(value)

    # Reuse the caller's index so assigning the result back to `df` aligns row by row.
    return pd.Series(date_values, index=df.index, dtype=object)

In [None]:
def value_counter(df, keyword):
    """Flatten a list-valued column into one long Series of its individual values.

    Despite the name, nothing is counted here: every inner value of every row
    of ``df[keyword]`` is emitted once per occurrence, in row order, so that
    callers can apply ``.value_counts()`` or ``.unique()`` to the result.
    """

    flattened = [item for row_items in df[keyword] for item in row_items]

    return pd.Series(flattened)

In [None]:
class Recommender():
    """Content-based recommender over the TF-IDF vectors of the descriptions."""

    def __init__(self, df):
        """Initializes and executes TF-IDF.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a ``title`` and a ``description`` column. The full
            pairwise similarity matrix (``len(df)`` x ``len(df)``) is kept in
            memory.
        """

        # Save pointer to the specified dataframe.
        self.df = df

        # Removing stopwords.
        tfidf = TfidfVectorizer(stop_words='english')

        # Construct the required TF-IDF matrix by fitting and transforming the data.
        tfidf_matrix = tfidf.fit_transform(df['description'])

        # Compute the cosine similarity matrix. TfidfVectorizer L2-normalises each
        # row by default, so the plain dot product is the cosine similarity.
        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

        # Map each title to its row POSITION (not its index label): the similarity
        # matrix is addressed positionally, whatever index `df` carries.
        positions = pd.Series(np.arange(len(df)), index=df['title'].to_numpy())
        # A repeated title resolves to its first occurrence so a lookup is a scalar.
        self.indices = positions[~positions.index.duplicated(keep='first')]

    def get_recommendation(self, title, top_n=10):
        """Recommends content using TF-IDF.

        Parameters
        ----------
        title : str
            Title to find similar content for.
        top_n : int, optional
            Maximum number of recommendations to return (default 10).

        Returns
        -------
        pandas.Series
            Titles of the most similar content, most similar first. The
            queried title itself is never included.

        Raises
        ------
        KeyError
            If ``title`` is not present.
        ValueError
            If ``top_n`` is negative.
        """

        if top_n < 0:
            raise ValueError(f"top_n must be non-negative, got {top_n}")

        try:
            idx = self.indices.loc[title]
        except KeyError:
            raise KeyError(f"Title not found: {title!r}") from None

        # Defensive: a non-unique `indices` yields a Series; use the first match.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]
        idx = int(idx)

        # Pairwise similarity scores of all content with the specified content.
        scores = np.asarray(self.cosine_sim[idx]).ravel()

        # Rank by descending score; the stable sort keeps ties in row order.
        ranked = np.argsort(-scores, kind='stable')

        # Exclude the queried row explicitly instead of assuming it ranks first:
        # a description with no terms left after stop-word removal has zero
        # self-similarity and would not be at the top.
        ranked = ranked[ranked != idx][:top_n]

        # Return the top most similar content.
        return self.df['title'].iloc[ranked]

In [None]:
def create_graph(df):
    """Constructs a NetworkX graph given a dataset.

    Every row contributes one TITLE node, linked to a node for each of its
    directors, cast members, production country and categories. Columns are
    looked up by name, so the result does not depend on column order; any
    column not listed in ``COLUMN_STYLES`` (dates, rating, duration,
    description, ...) is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column and any of ``director``, ``cast``,
        ``country``, ``listed_in``. Cells may hold a single value or a list
        of values.

    Returns
    -------
    (networkx.Graph, list of str)
        The graph and one colour per node, in ``G.nodes`` order, ready to be
        passed as ``node_color`` when drawing.
    """

    TITLE_COLUMN = 'title'

    # column name -> (node label, edge label, node colour)
    COLUMN_STYLES = {
        'director':  ('DIRECTOR',      'directed_by', 'yellow'),
        'cast':      ('ACTOR/ACTRESS', 'acted_in',    'red'),
        'country':   ('COUNTRY',       'produced_in', 'green'),
        'listed_in': ('CATEGORY',      'listed_in',   'orange'),
    }

    G = nx.Graph()

    # Resolved once, in the frame's own column order, rather than per element.
    linked_columns = [column for column in df.columns if column in COLUMN_STYLES]

    for _, row in df.iterrows():
        # The colour is stored on the node itself, so re-adding an existing
        # node (a repeated title, or a title equal to an earlier element)
        # updates it instead of appending a second colour.
        title_node = row[TITLE_COLUMN]
        G.add_node(title_node, label='TITLE', color='blue')

        for column in linked_columns:
            node_label, edge_label, node_color = COLUMN_STYLES[column]

            # Treat single values and lists of values uniformly.
            element = row[column]
            inner_elements = element if isinstance(element, list) else [element]

            # NOTE: placeholder values such as 'Not Specified' are added like
            # any other value and therefore become shared hub nodes.
            for inner_element in inner_elements:
                # Adds the element node to the graph only if it is new.
                if inner_element not in G:
                    G.add_node(inner_element, label=node_label, color=node_color)

                # Adds edge from the element node to the title node.
                G.add_edge(inner_element, title_node, label=edge_label)

    # Derived from the graph so it always has exactly one entry per node.
    colors = [G.nodes[node]['color'] for node in G.nodes]

    return G, colors


# Quick Look

Let's import the dataset and see what data is missing.

In [None]:
# Import dataset.
# The path is relative to the notebook's working directory; the cells below
# read from, and the cleaning cell later reassigns, this `df`.
df = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

# Preview the first rows of the raw data.
df.head()

In [None]:
# Count the missing (NaN) values in each column of the raw data.
print(df.isna().sum())

A quick summary of the columns:

* `type` - Categorizes the content as a 'Movie' or 'TV Show'
* `title` - The title of the content (unique?)
* `director` - The individual(s) who directed the content
* `cast` - The individual(s) who acted in the content
* `country` - The country/countries the content was produced in
* `date_added` - The date (month, day, and year) the content was added to Netflix
* `release_year` - The release year of the content
* `rating` - The content rating
* `duration` - The length of the content; minutes for movies and seasons for TV shows
* `listed_in` - The categories the content is listed in
* `description` - The description of the content

It's nice that the provided data looks like it can be used to model relationships between content and (hopefully) allow for a more robust recommender system.

Quite a few values are missing, noticeably for: `director`, `cast`, and `country`.

My hypothesis for the `director` column is that values are missing most likely due to TV shows not having a single director to list.

However, it is surprising to see that a decent number of rows don't have a `cast` value,
because this implies that whoever listed the content did not know which actors/actresses are in it.
That's a little weird, but let's stick with it for now.

Lastly, I'm not really sure what to think about the missing values from the `country` column.

# Data Cleanse

Some of the columns contain a single string where the values are separated by commas.
These need to be parsed to allow for manipulating these values later.

Though for `country`, only the first country within the string is kept to simplify things.
This causes a bit of information loss, but I don't think it's significant for the recommender system.

The cleansing process is described fairly well within the comments.

In [None]:
# NOTE: this cell reassigns `df` on top of the raw frame, so it is meant to be
# run exactly once after the load cell (a second run fails on the drop below).

# Remove show_id, redundant.
df = df.drop('show_id', axis=1)

# Replace NaN values.
df = df.replace(np.nan, 'Not Specified')

##### Splits strings into list of respective values.

# Split on a comma plus any whitespace after it, so names do not keep a
# leading space (' Name' and 'Name' would otherwise count as different values).
df['director'] = df['director'].str.strip().str.split(pat=r',\s*')
df['cast'] = df['cast'].str.strip().str.split(pat=r',\s*')

# Assigns the first non-empty element in each list as the 'country'; a raw
# value that starts with a comma would otherwise yield an empty country.
df['country'] = df['country'].str.split(',').apply(
    lambda parts: next((part.strip() for part in parts if part.strip()), 'Not Specified')
)

# Converts the date into a list of tokens ([month, day, year], or
# ['Not', 'Specified']); stripping first avoids empty leading/trailing tokens.
df['date_added'] = df['date_added'].str.strip().str.split(pat=r'[,\s]\s*')
df['date_added'] = process_dates(df)

df['listed_in'] = df['listed_in'].str.strip().str.split(pat=r',\s*')

In [None]:
# Preview the cleaned frame (list-valued columns, parsed dates).
df.head()

As mentioned, the `type` column categorizes the content as a 'Movie' or a 'TV Show'.
The types possess similar attributes, as well as differing attributes.
So I think it would be best to create a 'Movie' dataframe and a 'TV Show' dataframe when analyzing type-specific attributes.

In [None]:
### TV show dataset.
# Keep the TV rows, drop the now-constant 'type' column, parse 'X Seasons'
# into an integer and renumber the rows from zero.
df_tv_show = (
    df[df['type'] == 'TV Show']
    .drop('type', axis=1)
    .assign(duration=lambda shows: shows['duration'].apply(lambda x: x.split()[0]).astype(int))
    .reset_index(drop=True)
)

### Movie dataset.
# Same pipeline for movies, where the duration reads 'X min'.
df_movie = (
    df[df['type'] == 'Movie']
    .drop('type', axis=1)
    .assign(duration=lambda movies: movies['duration'].apply(lambda x: x.split()[0]).astype(int))
    .reset_index(drop=True)
)

In [None]:
# Preview the movie-only frame.
df_movie.head()

In [None]:
# Preview the TV-show-only frame.
df_tv_show.head()

# Analysis

## Movie and TV Show Dataset

### Content Type

In [None]:
# Number of titles per content type (Movie vs. TV Show).
fig, ax = plt.subplots(figsize=(5, 8))

sns.countplot(x='type', data=df, ax=ax)

ax.set(title='Content Type vs. Count', xlabel='Type', ylabel='Count')

plt.show()

### Production Country

In [None]:
print(f"List of Production Countries:\n{np.sort(df['country'].unique())}")

##### Graphing
plt.figure(figsize=(16, 8))

# Only the ten most frequent values of `country` are plotted.
top_countries = df['country'].value_counts().index[:10]
ax = sns.countplot(x='country', hue='type', data=df, order=top_countries)

ax.set_title('Production Country vs. Count')
ax.set_xlabel('Country')
ax.set_ylabel('Count')
# Rotate the labels on the axes that were actually drawn, rather than through
# the pyplot state machine before the plot exists.
ax.tick_params(axis='x', labelrotation=45)
ax.legend(loc='upper right')

plt.show()

### Date Added

In [None]:
# This import registers the 3D projection, but is otherwise unused.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

print('Date Added by Count:')
print(df['date_added'].value_counts().head(10))

##### Graphing
# Nothing in this cell draws random numbers. The seed is kept because it fixes
# NumPy's global RNG state, which later cells that sample without an explicit
# `random_state` may rely on.
np.random.seed(19680801)

fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(111, projection='3d')

# Rows without a date cannot be placed on the month/year grid.
df_temp = df.drop(df[df['date_added'] == 'Not Specified'].index)

x = df_temp.date_added.map(lambda a: a[0])  # month, 1-12
y = df_temp.date_added.map(lambda a: a[1])  # year

# One bin per calendar month and one per year, each centred on its integer
# value. Equal-width bins over [0, 13] / [2010, 2021] do not line up with the
# integers, and would silently drop anything added before 2010.
month_edges = np.arange(0.5, 13.5)
year_edges = np.arange(y.min() - 0.5, y.max() + 1.5)
hist, xedges, yedges = np.histogram2d(x, y, bins=[month_edges, year_edges])

# Construct arrays for the anchor positions of the bars (one per month/year
# cell); with 0.5-wide bars, an offset of 0.25 centres each bar on its value.
xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing="ij")
xpos = xpos.ravel()
ypos = ypos.ravel()
zpos = 0

# Construct arrays with the dimensions of the bars.
dx = dy = 0.5 * np.ones_like(zpos)
dz = hist.ravel()

ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')

ax.set_title('Date Added vs. Count')
ax.set_xlabel('Month')
ax.set_xticks(np.arange(1, 13))
ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.set_ylabel('Year')
ax.set_zlabel('Count')

plt.show()

The holiday season of a given year (October - December) is when the most content is added to Netflix.

### Release Year

In [None]:
print(f"List of Release Years:\n{np.sort(df['release_year'].unique())}")

##### Graphing
plt.figure(figsize=(16, 8))

# Only the 50 most recent release years are plotted.
recent_years = np.sort(df['release_year'].unique())[-50:]
ax = sns.countplot(x='release_year', hue='type', data=df, order=recent_years)

ax.set_title('Release Year vs. Count')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
# The legend and tick rotation are set on the drawn axes; asking pyplot for a
# legend before anything is plotted only builds an empty one.
ax.tick_params(axis='x', labelrotation=45)
ax.legend(loc='upper left')

plt.show()

The distribution almost resembles a Poisson distribution... must investigate further.

### Content Rating

In [None]:
# Display order of the ratings on the x-axis. Only ratings listed here are
# drawn; any other value present in `df['rating']` is left out of the plot.
RATING_ORDER = [
    'Not Specified', 'NR'  , 'TV-Y' , 'TV-Y7', 'TV-Y7-FV', 
    'G'            , 'TV-G', 'PG'   , 'TV-PG', 'PG-13'   , 
    'TV-14'        , 'R'   , 'TV-MA', 'NC-17', 'UR'      ,
]

##### Graphing
plt.figure(figsize=(16, 8))

ax = sns.countplot(x='rating', hue='type', data=df, order=RATING_ORDER)

ax.set_title('Content Rating vs. Count')
ax.set_xlabel('Content Rating')
ax.set_ylabel('Count')
# The legend is set once, on the drawn axes; asking pyplot for a legend before
# anything is plotted only builds an empty one.
ax.legend(loc='upper left')

plt.show()

The distribution shows that the target demographic of Netflix is adolescents and adults.

## Movie Dataset

### Director

In [None]:
# Flatten the per-movie director lists once and reuse the counts below.
movie_director = value_counter(df_movie, 'director')
movie_director_counts = movie_director.value_counts()

# Look the placeholder up by label: positional `[0]` only works while
# 'Not Specified' happens to be the most frequent value.
print(f"Not specified : {movie_director_counts.get('Not Specified', 0)}")

##### Graphing
plt.figure(figsize=(16, 8))

# Top 25 named directors, with the placeholder removed wherever it ranks.
top_movie_directors = movie_director_counts.drop('Not Specified', errors='ignore').index[:25]
ax = sns.countplot(y=movie_director, order=top_movie_directors)

ax.set_title('Top Featured Movie Directors')
ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.set_xlabel('Count')
ax.set_ylabel('Name')

plt.show()

### Duration

In [None]:
# Distribution of movie running times, with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(16, 8))

sns.histplot(df_movie['duration'], kde=True, ax=ax)

ax.set(title='Movie Duration vs. Count', xlabel='Duration (min.)', ylabel='Count')

plt.show()

### Categories (Listed In)

In [None]:
# One bar per movie category, listed alphabetically.
fig, ax = plt.subplots(figsize=(16, 8))

movie_genres = value_counter(df_movie, 'listed_in')
alphabetical_genres = np.sort(movie_genres.unique())
sns.countplot(y=movie_genres, order=alphabetical_genres, ax=ax)

ax.set(title='Count vs. Movie Categories', xlabel='Count', ylabel='Category')

plt.show()

### Cast

In [None]:
# Flatten the per-movie cast lists once and reuse the counts below.
movie_cast = value_counter(df_movie, 'cast')
movie_cast_counts = movie_cast.value_counts()

# Look the placeholder up by label: positional `[0]` only works while
# 'Not Specified' happens to be the most frequent value.
print(f"Not specified : {movie_cast_counts.get('Not Specified', 0)}")

##### Graphing
plt.figure(figsize=(16, 8))

# Top 25 named cast members, with the placeholder removed wherever it ranks.
top_movie_cast = movie_cast_counts.drop('Not Specified', errors='ignore').index[:25]
ax = sns.countplot(y=movie_cast, order=top_movie_cast)

ax.set_title('Top Featured Movie Actors/Actresses')
ax.set_xlabel('Count')
ax.set_ylabel('Name')

plt.show()

A majority of the actors/actresses are of Indian descent, most likely due to the size of movie production in India.

### Graph

In [None]:
# Generates a dataframe consisting of N random rows (at most 100).
# `random_state` pins the sample, so the same rows are drawn no matter which
# cells ran before this one.
df_temp = df_movie.sample(n=min(100, len(df_movie)), random_state=19680801).reset_index(drop=True)

df_temp

In [None]:
# Creates the graph and colormap.
# `c` is the colour list returned alongside the graph by create_graph.
graph, c = create_graph(df_temp)

##### Graphing
plt.figure(figsize=(16, 16))

# NOTE(review): no `pos` is passed, so networkx picks the layout itself and the
# picture may differ between runs. Confirm whether a fixed layout is wanted.
nx.draw(graph, with_labels=False, node_color=c)
plt.show()

## TV Show Dataset

### Director

In [None]:
# Flatten the per-show director lists once and reuse the counts below.
tv_show_director = value_counter(df_tv_show, 'director')
tv_show_director_counts = tv_show_director.value_counts()

# Look the placeholder up by label: positional `[0]` only works while
# 'Not Specified' happens to be the most frequent value.
print(f"Not specified : {tv_show_director_counts.get('Not Specified', 0)}")

##### Graphing
plt.figure(figsize=(16, 8))

# Top 25 named directors, with the placeholder removed wherever it ranks.
top_tv_show_directors = tv_show_director_counts.drop('Not Specified', errors='ignore').index[:25]
ax = sns.countplot(y=tv_show_director, order=top_tv_show_directors)

ax.set_title('Top Featured TV Show Directors')
ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
ax.set_xlabel('Count')
ax.set_ylabel('Name')

plt.show()

### Duration

In [None]:
# Number of TV shows per season count, in ascending order of seasons.
fig, ax = plt.subplots(figsize=(16, 8))

season_order = np.sort(df_tv_show['duration'].unique())
sns.countplot(x='duration', data=df_tv_show, order=season_order, ax=ax)

ax.set(title='TV Show Duration vs. Count', xlabel='Duration (Seasons)', ylabel='Count')

plt.show()

A majority of TV shows are only one season; this is most likely due to the popularity filter that determines what shows are renewed for another season.
Most shows usually don't make it past a first season, especially Netflix originals.

### Categories (Listed In)

In [None]:
# One bar per TV show category, listed alphabetically.
fig, ax = plt.subplots(figsize=(16, 8))

tv_show_genres = value_counter(df_tv_show, 'listed_in')
alphabetical_genres = np.sort(tv_show_genres.unique())
sns.countplot(y=tv_show_genres, order=alphabetical_genres, ax=ax)

ax.set(title='Count vs. TV Show Categories', xlabel='Count', ylabel='Category')

plt.show()

### Cast

In [None]:
# Flatten the per-show cast lists once and reuse the counts below.
tv_show_cast = value_counter(df_tv_show, 'cast')
tv_show_cast_counts = tv_show_cast.value_counts()

# Look the placeholder up by label: positional `[0]` only works while
# 'Not Specified' happens to be the most frequent value.
print(f"Not specified : {tv_show_cast_counts.get('Not Specified', 0)}")

##### Graphing
plt.figure(figsize=(16, 8))

# Top 25 named cast members, with the placeholder removed wherever it ranks.
top_tv_show_cast = tv_show_cast_counts.drop('Not Specified', errors='ignore').index[:25]
ax = sns.countplot(y=tv_show_cast, order=top_tv_show_cast)

ax.set_title('Top Featured TV Show Actors/Actresses')
ax.set_xlabel('Count')
ax.set_ylabel('Name')

plt.show()

A majority of actors/actresses are of Japanese descent, most likely due to the small number of voice actors in Japanese animation.

### Graph

In [None]:
# Generates a dataframe consisting of N random rows (at most 100).
# `random_state` pins the sample, so the same rows are drawn no matter which
# cells ran before this one.
df_temp = df_tv_show.sample(n=min(100, len(df_tv_show)), random_state=19680801).reset_index(drop=True)

df_temp

In [None]:
# Creates the graph and colormap.
# `c` is the colour list returned alongside the graph by create_graph.
graph, c = create_graph(df_temp)

##### Graphing
plt.figure(figsize=(16, 16))

# NOTE(review): no `pos` is passed, so networkx picks the layout itself and the
# picture may differ between runs. Confirm whether a fixed layout is wanted.
nx.draw(graph, with_labels=False, node_color=c)
plt.show()

# Recommender System

## TF-IDF

Using the Term Frequency–Inverse Document Frequency (TF-IDF) technique, it's possible to convert each content description into a vector.
This is useful because we can use cosine similarity to determine the 'closest' content; since the TF-IDF vectors are L2-normalized by default, the cosine similarity of two descriptions is simply their dot product.

### Movie

In [None]:
# Fit the recommender on the movie descriptions, then list the titles whose
# descriptions are most similar to that of "21".
movie_recommender = Recommender(df_movie)
movie_recommendations = movie_recommender.get_recommendation("21")

print(movie_recommendations)

### TV Show

In [None]:
# Fit the recommender on the TV show descriptions, then list the titles whose
# descriptions are most similar to that of "Stranger Things".
tv_show_recommender = Recommender(df_tv_show)
tv_show_recommendations = tv_show_recommender.get_recommendation("Stranger Things")

print(tv_show_recommendations)

## Deep Learning(?)

***WORK IN PROGRESS***

# Citations

1.  https://www.kaggle.com/niharika41298/netflix-visualizations-recommendation-eda#Recommendation-System-(Content-Based)
2.  http://tfidf.com/
3.  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
4.  https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/
5.  https://en.wikipedia.org/wiki/Cosine_similarity
6.  https://www.kaggle.com/yclaudel/recommendation-engine-with-networkx
7.  https://networkx.org/documentation/stable/tutorial.html
8.  https://en.wikipedia.org/wiki/Adamic/Adar_index
9.  https://neo4j.com/docs/graph-algorithms/current/labs-algorithms/adamic-adar/