# My objectives are to show different approaches to building a recommendation system on the huge Netflix dataset:

# 0. Netflix dataset statistics
# 1. Model-based CF - matrix factorization methods
# 2. Model-based CF - clustering models methods
# 3. Memory-based CF - statistic correlation coefficient methods
> # 4. Future - I will try to compare all the different methods; at the moment I have not yet found a way to do it effectively



---



# First i will analyze the dataset
Data loading

Each data file (there are 4 of them) contains below columns:

Movie ID (as first line of each new movie record / file)

Customer ID

Rating (1 to 5)

Date they gave the ratings

There is another file that contains the mapping of Movie ID to movie background information such as name, year of release, etc.

Let's import the library we needed before we get started:

In [None]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset, SVD,  SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering, accuracy 
from surprise.reader import Reader
from surprise.model_selection.validation import cross_validate as cross_validate
sns.set_style("darkgrid")

Next let's load first data file and get a feeling of how huge the dataset is:

In [None]:
# Load the first ratings file, keeping all three columns (the date is NOT skipped).
# Movie-header rows carry no rating, so they show up with NaN in 'Rating';
# later cells rely on those NaN rows to rebuild the movie id of every rating.
df1 = pd.read_csv('../input/netflix-prize-data/combined_data_1.txt', header = None, names = ['CustomerID','Rating', 'Date'], usecols = [0,1, 2])
df1['Rating'] = df1['Rating'].astype(float)

# `df` is another name for the same object as df1 (no copy is made here).
df = df1

Let's also load the 3 remaining datasets as well<br>
**It is in a separate commented-out block because it is too heavy to load all the datasets on every test run - though it needs to be uncommented when the best accuracies are needed**:

In [None]:
# Optional: uncomment this whole cell to load the three remaining rating files.
# df2 = pd.read_csv('../input/netflix-prize-data/combined_data_2.txt', header = None, names = ['CustomerID', 'Rating',  'Date'], usecols = [0,1, 2])
# df3 = pd.read_csv('../input/netflix-prize-data/combined_data_3.txt', header = None, names = ['CustomerID', 'Rating',  'Date'], usecols = [0,1, 2])
# df4 = pd.read_csv('../input/netflix-prize-data/combined_data_4.txt', header = None, names = ['CustomerID', 'Rating',  'Date'], usecols = [0,1, 2])

# df2['Rating'] = df2['Rating'].astype(float)
# df3['Rating'] = df3['Rating'].astype(float)
# df4['Rating'] = df4['Rating'].astype(float)

# print('Dataset 2 shape: {}'.format(df2.shape))
# print('Dataset 3 shape: {}'.format(df3.shape))
# print('Dataset 4 shape: {}'.format(df4.shape))

# Stack the four files in order. pd.concat is used instead of chained
# df.append calls, which newer pandas releases no longer provide.
# df = pd.concat([df1, df2, df3, df4])

Let's peek at the dataset:

In [None]:
# Give the frame a fresh 0..n-1 index so that rows can be sampled by position.
df.index = np.arange(len(df))
print('Full dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
# Every 5,000,000th row is enough to get a feel for the file.
print(df.iloc[::5000000])

Now we load the movie mapping file:

In [None]:
# Movie metadata: one row per movie with its id, release year and title.
df_movie_titles = pd.read_csv(
    '../input/netflix-prize-data/movie_titles.csv',
    encoding="ISO-8859-1",
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
)
# Same table, but indexed by Movie_Id for direct lookups.
df_titles = df_movie_titles.set_index('Movie_Id')

# Show the first rows and the column names of both variants.
for frame in (df_movie_titles, df_titles):
    print (frame.head(10))
    print(list(frame.columns))

# Let's give a first look on how the data spread:

In [None]:
# Number of ratings per rating value (rows with a NaN rating are not counted).
p = df.groupby('Rating')['Rating'].agg(['count'])

# get movie count: every movie contributes exactly one header row without a rating.
# The column is addressed by name; the previous `.sum()[1]` relied on positional
# integer lookup on a label-indexed Series, which pandas has deprecated.
movie_count = df['Rating'].isnull().sum()

# get customer count: the header rows also live in the CustomerID column, so
# they are subtracted from the number of distinct values.
cust_count = df['CustomerID'].nunique() - movie_count

# get rating count: all non-null CustomerID entries minus the header rows.
rating_count = df['CustomerID'].count() - movie_count

ax = p.plot(kind = 'barh', legend = False, figsize = (15,10))
plt.title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20)
plt.axis('off')

# Label each bar with its share of all ratings. Iterating over the rows that
# actually exist avoids an IndexError when fewer than five rating values occur.
total_ratings = p['count'].sum()
for position, (rating, count) in enumerate(p['count'].items()):
    ax.text(count/4, position, 'Rating {}: {:.0f}%'.format(int(rating), count*100 / total_ratings), color = 'white', weight = 'bold')

Lets watch some customers in a customer/rating graph:

In [None]:
def get_boxplot_of_categories(data_frame, categorical_column, numerical_column, limit):
    import seaborn as sns
    from collections import Counter
    keys = []
    for i in dict(Counter(df[categorical_column].values).most_common(limit)):
        keys.append(i)
    print(keys)
    df_new = df[df[categorical_column].isin(keys)]
    sns.set()
    sns.boxplot(x = df_new[categorical_column], y =      df_new[numerical_column])

In [None]:
get_boxplot_of_categories(df_titles, 'CustomerID', 'Rating', 10)

Lets watch the distribution over movies release dates:  

In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot

# Number of titles per release year, in chronological order.
data = df_movie_titles['Year'].value_counts().sort_index()

# One line trace: year on the x-axis, number of movies on the y-axis.
trace = go.Scatter(
    x=data.index,
    y=data.values,
    marker={'color': '#db0000'},
)

# The total number of titles goes into the chart title.
layout = {
    'title': '{} Movies Grouped By Year Of Release'.format(df_movie_titles.shape[0]),
    'xaxis': {'title': 'Release Year'},
    'yaxis': {'title': 'Movies'},
}

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

How are the ratings distributed?

In [None]:
# Get data: how often each rating value was given, highest rating first.
data = df['Rating'].value_counts().sort_index(ascending=False)

# value_counts() skips NaN, so this is the number of real ratings. df.shape[0]
# would also count the movie-header rows while they are still in the frame,
# which skews both the percentages and the number shown in the title.
n_ratings = data.sum()

# Create trace
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / n_ratings * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Distribution Of {} Netflix-Ratings'.format(n_ratings),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

> We can see that the rating tends to be relatively positive (>3). This may be due to the fact that unhappy customers tend to just leave instead of making efforts to rate. so low rating movies mean they are generally really bad..

When Have The Movies Been Rated?

In [None]:
# Number of ratings per calendar day (rows without a date are not counted).
data = df['Date'].value_counts()
data.index = pd.to_datetime(data.index)
# Sort chronologically; a new Series is bound instead of sorting in place.
data = data.sort_index()

# Number of dated ratings. df.shape[0] would also count the movie-header rows
# while they are still in the frame, overstating the total in the title.
n_ratings = data.sum()

# Create trace
trace = go.Scatter(x = data.index,
                   y = data.values,
                   marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = '{} Movie-Ratings Grouped By Day'.format(n_ratings),
              xaxis = dict(title = 'Date'),
              yaxis = dict(title = 'Ratings'))

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

From the beginning of November 2005 a strange decline in ratings can be observed. Furthermore, there are two abnormal peaks in January and April 2005.

Now lets add the movies column:

In [None]:
# Rows without a rating are the movie-header rows; every rating row belongs to
# the most recent header above it.
is_header = df['Rating'].isnull()

# Running count of headers seen so far = 1 for the first movie, 2 for the
# second, ... This assigns the same sequential ids as the previous loop, but in
# a single vectorised pass instead of growing an array with np.append once per
# movie (which re-copied the whole array each time).
movie_ids = is_header.cumsum()

# Keep only real ratings. The explicit copy makes the column assignments below
# act on an independent frame rather than on a slice of the original.
df = df.loc[~is_header].copy()

# .to_numpy() assigns by position, so the result does not depend on the index.
df['Movie_Id'] = movie_ids[~is_header].to_numpy().astype(int)
df['CustomerID'] = df['CustomerID'].astype(int)
print('-Dataset examples-')
print(df.iloc[::5000000, :])


How Are The Number Of Ratings Distributed For The Movies And The Users?

In [None]:
# One histogram per grouping column. Each spec is
# (group column, clip value, last bin edge, bin width, chart title, x-axis label).
histogram_specs = [
    ('Movie_Id', 9999, 10000, 100,
     'Distribution Of Ratings Per Movie (Clipped at 9999)', 'Ratings Per Movie'),
    ('CustomerID', 199, 200, 2,
     'Distribution Of Ratings Per User (Clipped at 199)', 'Ratings Per User'),
]

for group_col, clip_at, bin_end, bin_size, chart_title, x_label in histogram_specs:
    # Ratings per group, with the long tail folded into the last bin.
    data = df.groupby(group_col)['Rating'].count().clip(upper=clip_at)

    trace = go.Histogram(
        x=data.values,
        name='Ratings',
        xbins={'start': 0, 'end': bin_end, 'size': bin_size},
        marker={'color': '#db0000'},
    )
    layout = go.Layout(
        title=chart_title,
        xaxis={'title': x_label},
        yaxis={'title': 'Count'},
        bargap=0.2,
    )

    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)

We can see that most customers tend to rate less than 20 movies

---

Now, to work!

---

# Data slicing
The dataset is now huge and I can't work with it in its current form, so I will reduce the data volume and improve the data quality as follows:

- Remove movies with too few reviews (they are relatively unpopular)
- Remove customers who gave too few reviews (they are relatively inactive)

Applying these thresholds significantly improves efficiency, since unpopular movies and inactive customers occupy the same volume as popular movies and active customers in the matrix view (NaN still occupies space). This should help improve the statistical significance too.

In [None]:
# Quantiles of the rating-count distributions used as cut-offs: movies and
# customers whose rating count falls below the respective quantile are dropped.
movies_percentile = 0.7
customers_percentile = 0.7

# Ratings per movie and the minimum count a movie needs to be kept.
df_movie_summary = df.groupby('Movie_Id')['Rating'].agg(['count'])
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(movies_percentile),0)
is_rare_movie = df_movie_summary['count'] < movie_benchmark
drop_movie_list = df_movie_summary.index[is_rare_movie.values]

# Ratings per customer and the minimum count a customer needs to be kept.
df_cust_summary = df.groupby('CustomerID')['Rating'].agg(['count'])
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(customers_percentile),0)
is_inactive_cust = df_cust_summary['count'] < cust_benchmark
drop_cust_list = df_cust_summary.index[is_inactive_cust.values]

print('Movies minimum rating count:', movie_benchmark)
print('Customers minimum rating count:', cust_benchmark)

print('Original Shape:', df.shape)
# A row survives only if neither its movie nor its customer is on a drop list.
keep_rows = ~df['Movie_Id'].isin(drop_movie_list) & ~df['CustomerID'].isin(drop_cust_list)
df = df[keep_rows]

print('After Trim Shape:', df.shape)

print('unique movies left:')
print(df['Movie_Id'].unique().size)
print('unique customers left:')
print(df['CustomerID'].unique().size)

Now i will pivot the dataset and convert it into a matrix M, 
where Mi,j is the rating the ith customer gave to the jth movie

I will also replace all NaN values with zeros - and should keep in mind that there is no zero rating - the rating ranges from 1 to 5,
so the value '0' will state that this movie was not being reviewed and not that it's given rating is zero.

In [None]:
# Customer x movie rating matrix; pairs without a rating become 0.
df_p = df.pivot_table(values='Rating', index='CustomerID', columns='Movie_Id').fillna(0)
print(df_p.head(10))

---

# A  recommendation system is a subclass of information filtering system that seeks to predict the "rating" a user would give to an item

# Task1 - Model-based CF - matrix factorization methods

1. [Surprise](http://surpriselib.com/) is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.<br>
I will use this library for the purpose of recommending Netflix movies to Netflix users

# I will try the following models:

> SVD, SVDpp, NMF, NormalPredictor CoClustering

Matrix Factorization-based algorithms
> SVD<br>
> SVD algorithm is equivalent to Probabilistic Matrix Factorization<br>
> SVDpp<br>
> The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.<br>
> NMF<br>
> NMF is a collaborative filtering algorithm based on Non-negative Matrix Factorization. It is very similar with SVD.<br>
> Slope One<br>
> SlopeOne is a straightforward implementation of the SlopeOne algorithm.<br>
> Co-clustering<br>
> Coclustering is a collaborative filtering algorithm based on co-clustering.<br>

Step1 - evaluate:

Perform 3-folds cross validation in order to determine the best predictor.<br>
For the accuracy metric I use RMSE - root mean squared error.

**All the algorithms below expect a dataset with the following schema: 'CustomerID', 'Movie_Id', 'Rating'. <br>
The result is a function F: (CustomerID, Movie_Id) -> Rating**

In [None]:
def cross_validate_cf_algorithms(rows):
    """Benchmark several Surprise algorithms with 3-fold cross-validation.

    Uses the first ``rows`` rows of the notebook-level ``df`` (columns
    'CustomerID', 'Movie_Id', 'Rating'), cross-validates SVD, SVDpp, NMF,
    NormalPredictor and CoClustering on them and prints one line per algorithm
    with its mean test RMSE, fit time and test time, best RMSE first.

    Parameters
    ----------
    rows : int
        Number of leading rows of ``df`` to evaluate on.

    Returns
    -------
    None
        The comparison table is printed.

    Notes
    -----
    ``NMF`` must be Surprise's NMF here. A later cell imports
    ``sklearn.decomposition.NMF`` under the same name, so calling this function
    again after that cell has run will pick up the wrong class.
    """
    reader = Reader()

    data = Dataset.load_from_df(df[['CustomerID', 'Movie_Id', 'Rating']][:rows], reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [SVD(), SVDpp(), NMF(), NormalPredictor(),  CoClustering()]:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

        # Average every reported metric over the folds. A plain dict is used
        # because Series.append is no longer available in current pandas.
        record = {metric: np.mean(values) for metric, values in results.items()}
        record['Algorithm'] = type(algorithm).__name__
        benchmark.append(record)

    print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))

In [None]:
# Benchmark on only the first 100,000 rows of df to keep the run time manageable.
cross_validate_cf_algorithms(100000)


SVD performed best

# Step2 - train the best model - SVD:

Return the top-N recommendation for each user from a set of predictions

In [None]:

from collections import defaultdict
def get_top_n(predictions, n=5):
    """Group predictions by user and keep each user's n best-scored items.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each element unpacks as (user id, item id, true rating, estimated
        rating, details).
    n : int, default 5
        Maximum number of items kept per user.

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of (item id, estimated rating) pairs,
        ordered by descending estimate and at most ``n`` long.
    """
    per_user = defaultdict(list)

    # Collect (item, estimate) pairs under their user.
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's pairs by estimate (stable for ties) and cut to n.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user


A simple function to retrieve the movie name from the movie ID

In [None]:
# Movie_Id -> title lookup table. Only the titles table is needed for this;
# joining it with the complete ratings frame repeated every title once per
# rating and made each lookup scan that whole joined frame.
names_movie_mapping = df_titles['Name']
print(names_movie_mapping.head(10))

def get_movie_name(id):
    """Return the title of the movie with the given Movie_Id.

    Parameters
    ----------
    id : int
        Movie_Id to look up in ``names_movie_mapping``.

    Returns
    -------
    The 'Name' value stored for that movie.

    Raises
    ------
    KeyError
        If ``id`` is not present in the titles table.
    """
    name = names_movie_mapping.loc[id]
    # If a Movie_Id were listed more than once, .loc returns a Series;
    # take the first title in that case.
    if isinstance(name, pd.Series):
        return name.iloc[0]
    return name

In [None]:
# Sanity check: look up the title stored for Movie_Id 1.
get_movie_name(id=1)

# Train SVD model:

In [None]:
# Build a Surprise training set from the first 100,000 ratings of df.
reader = Reader()
data = Dataset.load_from_df(df[['CustomerID', 'Movie_Id', 'Rating']][:100000], reader)
trainset = data.build_full_trainset()

# Fit SVD on the whole training set. The result of fit() is not a set of
# predictions, so it is no longer bound to a variable called `predictions`;
# the fitted model is available as `algo`.
algo = SVD()
algo.fit(trainset);

# Print the recommended movies for each customer in the created test set:

In [None]:


# Score every (customer, movie) pair that does not occur in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Ten best-scored movies per customer.
top_n = get_top_n(predictions, n=10)

# Translate the movie ids to titles and print one line per customer.
recommanded_movies = {}
for uid, user_ratings in top_n.items():
    titles = [get_movie_name(pair[0]) for pair in user_ratings]
    recommanded_movies[uid] = titles
    print(uid, titles)



# Now we have a recommendation function

The function returns the recommended movie for the customer:<br>
The prediction rule is to take each movie that the customer loved (rating = 5), then for each movie predict using SVD, and finally take the most frequent movie - **only because I want to output just a single value**:

In [None]:
def recommand_SVD(CustomerID):
    """Return the title of the unseen movie the fitted SVD model scores highest.

    Every movie known to the fitted model (``algo.trainset``) that the customer
    has not rated in ``df`` is scored with ``algo.predict``; the title of the
    movie with the highest estimated rating is returned.

    This replaces the previous rule, which collected ``algo.predict(...)[1]``.
    Index 1 of a Surprise prediction is the item id that was passed in, not the
    estimate, so the function could only ever return one of the customer's own
    5-star movies.

    Parameters
    ----------
    CustomerID : int
        Customer to recommend for.

    Returns
    -------
    Title of the recommended movie, as returned by ``get_movie_name``.

    Raises
    ------
    ValueError
        If the customer has already rated every movie the model knows.
    """
    already_rated = set(df.loc[df['CustomerID'] == CustomerID, 'Movie_Id'])
    model_trainset = algo.trainset

    best_movie, best_estimate = None, float('-inf')
    for inner_id in model_trainset.all_items():
        # The trainset works with inner ids; map back to the original Movie_Id.
        movie_id = model_trainset.to_raw_iid(inner_id)
        if movie_id in already_rated:
            continue

        estimate = algo.predict(CustomerID, movie_id).est
        if estimate > best_estimate:
            best_movie, best_estimate = movie_id, estimate

    if best_movie is None:
        raise ValueError('no unseen movie available to recommend for customer {}'.format(CustomerID))

    return get_movie_name(best_movie)

Let's predict which movies a specific user would love to watch:

In [None]:
# Example: single SVD-based recommendation for customer 1333.
# The message previously said "clustering CF", which is a different recommender.
print("The recommanded movie for customer 1333 using SVD is: ", recommand_SVD(1333))

---

# Task2 - Model-based CF- clustering models

# Clusterize Netflix movies

I will try to cluster the movies based on the ratings received from the users.


The steps:
1. prepare the dataset to sklearn

2.  Compare few clustering algorithms

    *   DBSCAN
    *   K-means
    *   XXX

3.   Create a function F that will map the movies to their corresponding clusters (F: movieID -> movieClusterId)






import the required SKlearn libraries:

In [None]:
from sklearn_pandas import DataFrameMapper, cross_val_score
import numpy as np
import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import DBSCAN, KMeans
from sklearn import metrics
from sklearn.decomposition import PCA,SparsePCA, TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler
import matplotlib.cm as cm
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.impute import SimpleImputer
import random

Preparing the dataset - I would like each movie to be represented by its column, that is, all the ratings it received from all users.
The problem is the sparseness, that is, the zero ratings,
so my solution will be to replace all zero ratings with the mean of all non-zero ratings.

In [None]:
# One row per movie, one column per customer (the transpose of df_p).
# `.values` replaces DataFrame.as_matrix, which current pandas no longer has.
X = df_p.values.transpose()

# Imputer configured to replace zeros by the column mean.
# NOTE(review): it is only constructed here - it is never fitted or applied in
# this notebook, so X still contains the zeros.
imp_mean = SimpleImputer(missing_values=0, strategy='mean')

Lets reduce the dimensions first- for that i will use PCA - lets find the best size for the new dimension - n_component parameter in sklearn

In [None]:
# Fit a full PCA so the contribution of every component can be inspected.
pca = PCA().fit(X)

# Cumulative share of the variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Variance (%)')
ax.set_title('Movies Dataset Explained Variance')
plt.show()

We can see that we can reduce the size to 600 (reducing it by more than 1 / 2) with a minimal loss of 0.1 of the variance
So now i will use PCA with n_component = 600 to reduce X to 600 dimensions

In [None]:
# Keep 600 principal components and project every row of X onto them.
# `pca` is reused later to transform individual movie vectors.
pca = PCA(n_components=600)
X_reduced = pca.fit_transform(X)

---

Now I will try K-means on the reduced data with K in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]<br>
Then, for each value of K, I'll plot the **silhouette score** and pick the best value for K

In [None]:
# Silhouette analysis: for every candidate K, fit KMeans on the reduced data,
# print the average silhouette score and draw (left) the per-sample silhouette
# values grouped by cluster and (right) the samples on the first two reduced
# dimensions, coloured by cluster.
range_n_clusters = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
data = X_reduced

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1 to 1; the x-axis is limited
    # to [-0.1, 1], so lower values (if any) fall outside the visible area.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(data)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(data, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(data, cluster_labels)

    # y_lower / y_upper delimit the vertical band in which the current
    # cluster's sorted silhouette values are drawn.
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # leave a gap of 10 between clusters

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    # (only the first two columns of the reduced data are drawn)
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(data[:, 0], data[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster's number on top of its center.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

# Called once, after all figures have been created.
plt.show()

So as we can see, the best K will be 35 with a Silhouette value of 0.86,<br>
Lets train the model again with 35 clusters:


In [None]:
    # Refit KMeans with the chosen K=35 on the PCA-reduced data.
    # `clusterer` and `cluster_labels` set here are the notebook-level objects
    # that get_movie() and recommand_KMeans() read later on.
    clusterer = KMeans(n_clusters=35, random_state=10)
    cluster_labels = clusterer.fit_predict(X_reduced)

    # Average silhouette over all samples for the final model.
    silhouette = silhouette_score(X_reduced, cluster_labels)
    print("For n_clusters =", 35,
          "The silhouette_score is :", silhouette)


---

Next i will try DBScan algorithm

In [None]:
# Fit DBSCAN with its default parameters on the PCA-reduced data.
db = DBSCAN().fit(X_reduced)
# Boolean mask marking the core samples (kept for inspection; not used below).
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
if n_clusters_ > 1:
    # Score the labels on the same data DBSCAN was fitted on (X_reduced);
    # the score was previously computed against X, a different matrix.
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X_reduced, labels))

Unfortunately DBSCAN did not manage to find clusters in our data.

Now I will try some more clustering algorithms:<br>
'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'Ward', 'AgglomerativeClustering', 'DBSCAN', 'OPTICS', 'Birch', 'GaussianMixture'<br>
I will also plot the resulting clusters and print the silhouette scores (a score is printed only when more than one cluster label is found)

In [None]:

import time
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

# Seed NumPy's global random generator.
np.random.seed(0)

# ============
# NOTE(review): despite its name this is the length of the first row of
# X_reduced, and it is not used anywhere below.
# ============
n_samples = len(X_reduced[0])

# ============
# Set up the figure and the cluster parameters
# ============
plt.figure(figsize=(9 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

# Index of the next subplot to draw into.
plot_num = 1

# Default parameters shared by the clustering algorithms.
default_base = {'quantile': .3,
                'eps': .3,
                'damping': .9,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 35,
                'min_samples': 20,
                'xi': 0.05,
                'min_cluster_size': 0.01}

# (data, parameter overrides) pairs to run; here only the PCA-reduced movies.
# Note: this name rebinds `datasets`, which was imported from sklearn above.
datasets = [
    (X_reduced, {'damping': .77, 'preference': -240,
                     'quantile': .2, 'n_clusters': 35,
                     'min_samples': 20, 'xi': 0.25})]

# For every dataset: standardise it, build all clustering algorithms, fit each
# one, print its silhouette score where labels are available and draw the
# samples on the first two dimensions coloured by predicted cluster.
for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    # Note: this rebinds the notebook-level name X.
    X = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(
        n_clusters=params['n_clusters'], linkage='ward',
        connectivity=connectivity)
    spectral = cluster.SpectralClustering(
        n_clusters=params['n_clusters'], eigen_solver='arpack',
        affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    # NOTE(review): newer scikit-learn releases may expect `metric=` instead of
    # `affinity=` here - confirm against the installed version.
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(
        n_components=params['n_clusters'], covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan),
        ('OPTICS', optics),
        ('Birch', birch),
        ('GaussianMixture', gmm)
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.",
                category=UserWarning)
            algorithm.fit(X)

        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            # The builtin int replaces the np.int alias, which recent NumPy
            # releases no longer provide.
            y_pred = algorithm.labels_.astype(int)

            # A silhouette score needs at least two distinct labels.
            if (len(np.unique(algorithm.labels_)) > 1):
                print(name + " silhouette_avg: ", silhouette_score(X, algorithm.labels_))
        else:
            # Estimators without labels_ (e.g. the mixture model) are asked to
            # predict the assignment instead.
            y_pred = algorithm.predict(X)

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        # One colour per predicted label, cycling through the palette.
        colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                             '#f781bf', '#a65628', '#984ea3',
                                             '#999999', '#e41a1c', '#dede00']),
                                      int(max(y_pred) + 1))))
        # add black color for outliers (if any): label -1 indexes the last entry
        colors = np.append(colors, ["#000000"])
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        # Fit time in the lower right corner of the subplot.
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()

As we can see 'Ward' performed best with a score of 0.61, it is a good result but still less than KMeans.

# So the best clustering algorithm for our task is KMeans with K=35

# The recommendation function:<br>
Returns the recommended movie for the customer:<br>
The prediction rule is to take each movie that the customer loved (rating = 5)<br>
Take its corresponding row from the dataset - (all customers are the features in every row)<br>
Reduce the dimension with PCA<br>
Predict the cluster using KMeans, K=35<br>
Take a random movie from this cluster<br>
Finally take the most frequent movie:

Below is a simple function that returns a random movie that is inside a given cluster

In [None]:
def get_movie(cluster):
    """Return the Movie_Id of a randomly chosen movie from the given cluster.

    ``cluster_labels`` holds one label per row of the movie matrix, and those
    rows follow the column order of ``df_p``. The drawn row position is
    therefore translated through ``df_p.columns`` into the actual Movie_Id;
    the position itself was returned before, which is not a movie id.

    Parameters
    ----------
    cluster : int
        Cluster label to draw from.

    Raises
    ------
    IndexError
        If no movie carries that label.
    """
    # flatnonzero always yields a 1-D array, so single-member clusters work too.
    members = np.flatnonzero(cluster_labels == cluster)
    return df_p.columns[random.choice(members)]

In [None]:
# Example: draw one random pick from cluster 1.
get_movie(1)

In [None]:
def recommand_KMeans(CustomerID):
    """Recommend one movie title for a customer via the KMeans movie clusters.

    For every movie the customer rated 5, the movie's rating vector (its column
    in ``df_p``) is projected with the fitted ``pca``, assigned to a cluster by
    ``clusterer`` and one random movie of that cluster is drawn with
    ``get_movie``. The most frequently drawn value is passed to
    ``get_movie_name`` and its title is returned.

    Parameters
    ----------
    CustomerID : int
        Customer to recommend for.

    Raises
    ------
    ValueError
        If the customer has no 5-star rating in ``df``.
    """
    loved_movies = df[(df['CustomerID'] == CustomerID) & (df['Rating'] == 5)]['Movie_Id']

    res = []
    for movieID in loved_movies:
        # Rating vector of this movie as a single sample. `.values` replaces
        # DataFrame.as_matrix, which current pandas no longer has.
        x = df_p[movieID].values.reshape(1, -1)

        transformed = pca.transform(x)
        p = np.squeeze(clusterer.predict(transformed))
        res.append(get_movie(p))

    if not res:
        raise ValueError('customer {} has no 5-star ratings to base a recommendation on'.format(CustomerID))

    # Most frequent draw; ties resolve to the smallest value.
    return get_movie_name(np.bincount(res).argmax())
    

In [None]:
# Example: single clustering-based recommendation for customer 1333.
# recommand_KMeans returns a title, so the message no longer says "(ID)".
print("The recommanded movie for customer 1333 using clustering CF is: ", recommand_KMeans(1333))

---

# Task3 - Memory-based CF

# I will try the following correlation coefficients:

> Pearson, Kendall, Spearman 

The idea here is to measure the correlation between the ratings of pairs of movies (Pearson measures linear correlation; Kendall and Spearman are rank-based).

In [None]:
def get_similar_movies(method_name, movie_title, n_movies, min_count=0):
    """Return the titles of the movies whose ratings correlate best with a movie.

    Parameters
    ----------
    method_name : str
        Correlation method passed to ``DataFrame.corrwith``
        (the notebook uses 'pearson', 'kendall' and 'spearman').
    movie_title : str
        Title of the query movie; the first matching row of ``df_titles`` is used.
    n_movies : int
        Maximum number of titles to return.
    min_count : int, default 0
        Only movies with more than this many ratings (per ``df_movie_summary``)
        are returned.

    Returns
    -------
    list
        Titles ordered by descending correlation. The query movie itself is
        excluded; it used to come first because every column correlates
        perfectly with itself.

    Raises
    ------
    IndexError
        If no movie has the given title.
    KeyError
        If the movie is not a column of ``df_p``.
    """
    i = int(df_titles.index[df_titles['Name'] == movie_title][0])
    target = df_p[i]

    # Correlation of every movie column with the query movie's column.
    similar_to_target = df_p.corrwith(target, method=method_name)
    corr_target = pd.DataFrame(similar_to_target, columns = [method_name])
    corr_target = corr_target.dropna()
    corr_target.index = corr_target.index.map(int)

    # Drop the query movie so it cannot be recommended as its own best match.
    corr_target = corr_target.drop(index=i, errors='ignore')

    corr_target = corr_target.sort_values(method_name, ascending = False)
    corr_target = corr_target.join(df_titles).join(df_movie_summary)[[method_name, 'Name', 'count']]

    popular_enough = corr_target[corr_target['count'] > min_count]
    return list(popular_enough['Name'][:n_movies])


# The recommendation function:<br>
Returns the recommended movies for the customer:<br>
The prediction rule is to take all the movies that the customer loved (rating = 5), then take the corresponding movieID column from the dataset, find the most highly correlated column of another movie and return it; this is done for each of the movies that the customer loved.

In [None]:
def recommand_corr(method_name, CustomerID):
    """Recommend one movie per 5-star rating of a customer, by correlation.

    Parameters
    ----------
    method_name : str
        Correlation method forwarded to ``get_similar_movies``.
    CustomerID : int
        Customer to recommend for.

    Returns
    -------
    list
        For every movie the customer rated 5 (in the order they appear in
        ``df``), the first title returned by ``get_similar_movies`` for it.
        Empty if the customer has no 5-star rating.
    """
    is_loved = (df['CustomerID'] == CustomerID) & (df['Rating'] == 5)
    return [
        get_similar_movies(method_name, get_movie_name(movie_id), 1)[0]
        for movie_id in df[is_loved]['Movie_Id']
    ]

In [None]:

# Compare the three correlation measures for the same example customer (1333).
for method in ['pearson', 'kendall', 'spearman']:
    print(method, recommand_corr(method, 1333))



---

# My future work..

My next step will be to try to compare all the different approaches.

---

**Thanks a lot for the great class (Yoram, you were great!)**