In [1]:
from __future__ import unicode_literals, print_function
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
reader = Reader(rating_scale=(1, 5))
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from prompt_toolkit import print_formatted_text, HTML

import warnings; warnings.simplefilter('ignore')
df = pd.read_csv('train.csv')
# df = df.sample(frac=0.005)

# Visualising the data and making observations

In this section, we will make observations of the data after we:
    1. Visualise the distribution of the ratings(1-5) of the movies on a bar chart
    2. Visualise the distribution for the number of ratings per movie
    3. View the top 10 most frequently rated movies
    4. Visualise the distribution for the number of ratings per user

### Observations made:

    1. The majority of movies are rated 3 and higher
    2. With 4 being the most frequent rating

    3. Few movies have more than 10 ratings 
    4. With the bulk of them having only 2 or less ratings

    5. The 10 most rated movies have more than 20 000 ratings
    6. With the most rated movie having 32 831

    7. 9.95% of users rated less than 10 movies
    8. while 47.63% of users rated 30 or more movies


In [2]:
def charts(df):
    
     '''
    Plots multiple charts to visualise the data for better understanding.

            Parameters:
                    a (DataFrame): A Pandas DataFrame

    '''
        
    data = df['rating'].value_counts().sort_index(ascending=False)
    trace = go.Bar(x = data.index,
                   text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
                   textposition = 'auto',
                   textfont = dict(color = '#000000'),
                   y = data.values,
                   )
    # Create layout
    layout = dict(title = 'Distribution of movie-ratings',
                  xaxis = dict(title = 'Rating'),
                  yaxis = dict(title = 'Count'))
    # Create plot
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
    print("Observations:")
    print("    1. 82% of movies are rated 3 and higher.")
    print("    2. Most movies are rated 4.")
    print("\n")
    print("\n")
    print("\n")
    print("\n")
    
    
    data = df.groupby('movieId')['rating'].count().clip(upper=50)

    # Create trace
    trace = go.Histogram(x = data.values,
                         name = 'Ratings',
                         xbins = dict(start = 0,
                                      end = 50,
                                      size = 2))
    # Create layout
    layout = go.Layout(title = 'Distribution Of Number of Ratings Per Movie (Clipped at 50)',
                       xaxis = dict(title = 'Number of Ratings Per Movie'),
                       yaxis = dict(title = 'Count'),
                       bargap = 0.2)

    # Create plot
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
    dff = df.groupby('movieId')['rating'].count().clip(upper=50) <= 2
    
    print("Observations:")
    print("    1. Most movies have less than 10 ratings. ")
    print("    2." + str(len(dff)) + " out of " + str(len(df)) + " have 2 or less ratings")
    print("\n")
    print("\n")
    print("\n")
    print("\n")
    
    print("The top 10 most rated movies")
    print("\n")
    print(df.groupby('movieId')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10])
    print("\n")
    
    
    print("Observations:")
    print("    1. The 10 most rated movies has 20 000 ratings or more.")
    print("    2. The most rated movie has 32 831 ratings.")
    print("\n")
    print("\n")
    print("\n")
    print("\n")
    
    data = df.groupby('userId')['rating'].count().clip(upper=50)

    # Create trace
    trace = go.Histogram(x = data.values,
                         name = 'Ratings',
                         xbins = dict(start = 0,
                                      end = 50,
                                      size = 2))
    # Create layout
    layout = go.Layout(title = 'Distribution Of Number of Ratings Per User (Clipped at 50)',
                       xaxis = dict(title = 'Ratings Per User'),
                       yaxis = dict(title = 'Count'),
                       bargap = 0.2)

    # Create plot
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
    
    dff =  df.groupby('userId')['rating'].count()
    less_than_10 =  len(dff[dff < 10])
    percentage = round(less_than_10/len(dff) * 100, 2)
    
    dff2 =  df.groupby('userId')['rating'].count() 
    greater_than_30 =  len(dff2[dff2 > 30]) 
    percentage2 = round(greater_than_30/len(dff2) * 100, 2)
    
    print("Observations:")
    print("    1. " + str(percentage) + "% of the users rated less than 10 movies.")
    print("    2. "+ str(percentage2) + "% of the users rated 30 or more movies.")

charts(df)

Observations:
    1. 82% of movies are rated 3 and higher.
    2. Most movies are rated 4.










Observations:
    1. Most movies have less than 10 ratings. 
    2.18301 out of 500002 have 2 or less ratings








The top 10 most rated movies


      movieId  rating
304       318    1708
282       296    1642
341       356    1590
567       593    1433
2315     2571    1401
248       260    1355
460       480    1285
507       527    1202
1084     1196    1190
2666     2959    1177


Observations:
    1. The 10 most rated movies has 20 000 ratings or more.
    2. The most rated movie has 32 831 ratings.










Observations:
    1. 90.32% of the users rated less than 10 movies.
    2. 0.82% of the users rated 30 or more movies.


# Filtering the data

The below cell filters the data so only movies with more than 50 ratings and users that rated more than 10 movies are kept. 

In [None]:
def pre_processing(df, x, z):
     '''
    Returns a filtered Dataframe where only movies with x or more ratings and users with y or more ratings are kept.

            Parameters:
                    a (Pandas Dataframe): A Pandas DataFrame
                    b (int): An intiger specifying the threshold for the amount of ratings a movie should have more than
                    c (int): An intiger specifying the threshold for the amount of ratings a user should have gave more than

            Returns:
                     (Pandas DataFrame)
    '''
        
#   Filtering the data so only movies with 10 or more ratings are kept.
    greater_than_x = [i for i in df["movieId"] if len(df[df["movieId"] == i]) > x]
    dff = df[df["movieId"].isin(greater_than_x)]

    greater_than_z = [i for i in dff["userId"] if len(dff[dff["userId"] == i]) > z]
    dff2 = dff[dff["userId"].isin(greater_than_z)]

    print('Original data frame shape: '+ str(df.shape))
    print('New data frame shape: ' + str(dff2.shape))
    return dff2

In [None]:
dff = pre_processing(df)

# Selecting a model

The below function picsk a model with the lowest rmse

In [None]:
def model_selection(df):
     '''
    Returns the sum of two decimal numbers in binary digits.

            Parameters:
                    a (int): A decimal integer
                    b (int): Another decimal integer

            Returns:
                    binary_sum (str): Binary string of the sum of a and b
    '''
        
        
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(dff[['userId', 'movieId', 'rating']], reader)
    
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], n_jobs=-1, cv=3, verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    
    

In [None]:
model_selection(dff)