In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datetime import datetime
import os
import random
import matplotlib
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
#from sklearn.metrics.pariwise import cosine_similarity

In [None]:
start = datetime.now()
if not os.path.isfile('data.csv'):
    # Merge the four raw rating files into one flat CSV whose rows are
    # movie,user,rating,date.
    files = ['../input/netflix-prize-data/combined_data_1.txt', '../input/netflix-prize-data/combined_data_2.txt',
            '../input/netflix-prize-data/combined_data_3.txt', '../input/netflix-prize-data/combined_data_4.txt']
    # Write under a temporary name and rename only once every file has been
    # processed: a run interrupted half-way must not leave a truncated
    # 'data.csv' behind, because the isfile() guard above would then silently
    # reuse it on every later run.
    tmp_path = 'data.csv.part'
    with open(tmp_path, mode='w') as data:
        movie_id = None
        for file in files:
            print('reading ratings from {}...'.format(file))
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    if line.endswith(':'):
                        # A "<movie_id>:" header applies to all rating lines that follow it.
                        movie_id = line[:-1]
                    else:
                        # Rating lines are already "user,rating,date"; prefix the movie id.
                        data.write(movie_id + ',' + line + '\n')
            print('Done.\n')
    os.replace(tmp_path, 'data.csv')
print('time taken:', datetime.now() - start)

In [None]:
# data.csv has no header row, so the column names are supplied here.
print('creating the dataframe from data.csv file..')
column_names = ['movie','user','rating','date']
df = pd.read_csv('data.csv', sep=',', names=column_names)

# Turn the date strings into real timestamps so they sort chronologically.
df['date'] = pd.to_datetime(df['date'])
print('Done.\n')

# Order the ratings from oldest to newest.
print('sorting the dataframe by date..')
df = df.sort_values(by='date')
print('sorting done.')

In [None]:
# Preview the first rows of the ratings frame.
df.head()

In [None]:
# Summary statistics per column. The movie and user columns are identifiers,
# so only the figures for 'rating' carry meaning.
df.describe()

In [None]:
# Summary statistics of the rating column only (describe() is computed for
# the whole frame first, then the 'rating' column is selected).
df.describe()['rating']

**Checking NaN values**

In [None]:
# isnull().any() yields one flag per column, so summing it counts the columns
# that contain a NaN. Sum the per-cell mask instead to report the number of
# NaN values, which is what the message states.
print('number of NaN values in our dataset:', df.isnull().sum().sum())

**Check for Duplicate Ratings**

In [None]:
# A row is flagged when it repeats an earlier (movie, user, rating) triple;
# the date column is not part of the comparison.
dup = df.duplicated(['movie','user','rating'])
# Series.sum() counts the True flags in vectorised code; the builtin sum()
# iterates over every row in Python.
dups = dup.sum()
print('there are {} duplicate rating entries in the data.....'.format(dups))

In [None]:
# Size of the full data set: rating rows, distinct users, distinct movies.
n_ratings = len(df)
n_users = np.unique(df['user']).size
n_movies = np.unique(df['movie']).size

print('Total Data')
print('-' * 60)
print('\nTotal number of rating:', n_ratings)
print('Total number of users:', n_users)
print('total number of movie:', n_movies)

**Split the dataset**

In [None]:
# Positional 80/20 split of df: the first 80% of the rows become the training
# set and the remaining 20% the test set.
split_at = int(df.shape[0]*0.80)
if not os.path.isfile('train.csv'):
    df.iloc[:split_at].to_csv("train.csv", index=False)
if not os.path.isfile('test.csv'):
    # The trailing colon matters: iloc[split_at] selects one single row (as a
    # Series) instead of the last 20% of the ratings.
    df.iloc[split_at:].to_csv("test.csv", index=False)

# NOTE: a test.csv written by an earlier run that used the single-row slice is
# still picked up by the guard above; delete it once so it is regenerated.
train_df = pd.read_csv('train.csv', parse_dates=['date'])
test_df = pd.read_csv('test.csv')

In [None]:
# Preview the training rows as read back from train.csv.
train_df.head()

In [None]:
# The date column is not used by the steps below. errors='ignore' keeps this
# cell re-runnable: without it a second execution raises KeyError because
# 'date' has already been dropped.
train_df = train_df.drop(columns = 'date', errors='ignore')
train_df.head()

In [None]:
# (rows, columns) of the test frame read from test.csv.
test_df.shape

In [None]:
# Size of the training split: rating rows, distinct users, distinct movies.
for label, value in (
    ('Total number of rating:', len(train_df)),
    ('Total number of users:', np.unique(train_df['user']).size),
    ('Total number of movies:', np.unique(train_df['movie']).size),
):
    print(label, value)

In [None]:
# Preview the test rows as read back from test.csv.
test_df.head()

In [None]:
# Number of rows in the test frame.
print('Total number of rating:',test_df.shape[0])

**EDA on Train Data**

In [None]:
def human(num, units='M'):
    """Express ``num`` in thousands, millions or billions as a short string.

    Parameters
    ----------
    num : value convertible with ``float()``.
    units : 'K', 'M' or 'B' (case-insensitive) selecting the scale.

    Returns
    -------
    The scaled float rendered with ``str()`` followed by the upper-case unit
    letter, e.g. ``human(2500000)`` -> ``'2.5M'``. ``None`` is returned for
    any other unit.
    """
    unit_key = units.lower()
    value = float(num)
    scales = {'k': (10**3, "K"), 'm': (10**6, "M"), 'b': (10**9, "B")}
    if unit_key not in scales:
        return None
    divisor, suffix = scales[unit_key]
    return str(value/divisor) + suffix

Distribution

In [None]:
fig, ax = plt.subplots()
ax.set_title('Rating Distribution over train data', fontsize=10)
# Pass the ratings as x= and draw on `ax` explicitly. Recent seaborn releases
# treat the first positional argument as `data`, so a positional Series no
# longer means "count these values".
sns.countplot(x=train_df.rating, palette="Set2", ax=ax)
# Format the ticks through a formatter instead of set_yticklabels(): the
# labels then always match the tick positions matplotlib finally chooses, and
# no fixed-label warning is raised for unpinned ticks.
ax.yaxis.set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda value, _pos: human(value, 'M'))
)
ax.set_ylabel('No. of Ratings (Million)')
plt.show()

The distribution above shows that 4 is the most frequently given rating, while comparatively few ratings are 1.

In [None]:
# Number of ratings each user gave in the training set, most active users first.
ratings_per_user = train_df.groupby(by='user')['rating'].count()
no_of_rated_movie_per_user = ratings_per_user.sort_values(ascending=False)

In [None]:
# Distribution of the per-user rating counts (count, mean, quartiles, max).
no_of_rated_movie_per_user.describe()

Creating sparse matrix from data frame

In [None]:
start = datetime.now()
if os.path.isfile('train_sparse_matrix.npz'):
    train_sparse_matrix = sparse.load_npz('train_sparse_matrix.npz')
else:
    # Rows are addressed by the raw user id and columns by the raw movie id,
    # so the matrix is as large as the biggest id on each axis, not as the
    # number of distinct users/movies.
    train_sparse_matrix = sparse.csr_matrix((train_df.rating.values, (train_df.user.values, train_df.movie.values)))
    print('It is shape is:(user, movie):', train_sparse_matrix.shape)
    # Persist the matrix; without this the cache branch above never finds a
    # file and the matrix is rebuilt on every run.
    sparse.save_npz('train_sparse_matrix.npz', train_sparse_matrix)

print(datetime.now() - start)

Sparsity of Train Sparse Matrix

In [None]:
# Row count (user axis) and column count (movie axis) of the training matrix.
us = train_sparse_matrix.shape[0]
mv = train_sparse_matrix.shape[1]
# Number of non-zero entries, i.e. ratings present in the matrix.
elem = train_sparse_matrix.count_nonzero()

print(elem)

In [None]:
# Sparsity = share of empty cells in the matrix, in percent.
# Two fixes compared with the previous one-liner:
#   * elem/us*mv evaluates as (elem/us)*mv; the denominator must be us*mv.
#   * the *100 sat outside .format(), repeating the message 100 times instead
#     of scaling the number.
sparsity = (1 - elem/(us*mv)) * 100
print('sparsity of train matrix:{}%'.format(sparsity))

**Find Average of all movie ratings, average rating per user, average rating per movie**

In [None]:
def get_average_ratings(sparse_matrix, of_users):
    """Average the non-zero ratings along one axis of a rating matrix.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix; rows are treated as users and
        columns as movies.
    of_users : True to average per row (user), False per column (movie).

    Returns
    -------
    dict mapping row/column index -> mean of its non-zero entries. Indices
    without any non-zero entry are left out.
    """
    # axis=1 collapses the columns (one value per user); axis=0 the rows.
    axis = 1 if of_users else 0

    # .A1 flattens the 1 x n / n x 1 matrix returned by sum() to a 1-D array.
    totals = sparse_matrix.sum(axis=axis).A1
    # A cell counts as rated when it is non-zero.
    counts = (sparse_matrix != 0).sum(axis=axis).A1

    n_users, n_movies = sparse_matrix.shape
    size = n_users if of_users else n_movies

    average_ratings = {}
    for idx in range(size):
        if counts[idx] != 0:
            average_ratings[idx] = totals[idx] / counts[idx]
    return average_ratings

**Global average of all movie ratings**

In [None]:
# Global average: sum of all ratings in the training matrix divided by the
# number of non-zero entries.
train_global_average = train_sparse_matrix.sum() / train_sparse_matrix.count_nonzero()

# Container for the averages; the 'global' entry is stored first.
train_averages = {'global': train_global_average}
train_averages

Avg Rating per user

In [None]:
# Per-user averages, keyed by the row index of the training matrix.
# get_average_ratings only creates keys for users with at least one rating,
# so the lookup below raises KeyError if row 10 is empty.
train_averages['user'] = get_average_ratings(train_sparse_matrix, of_users=True)
print('\nAverage rating of user 10 :',train_averages['user'][10])

Avg Rating per movie

In [None]:
# Per-movie averages, keyed by the column index of the training matrix.
# Only movies with at least one rating get a key, so the lookup below raises
# KeyError if column 15 is empty.
train_averages['movie'] = get_average_ratings(train_sparse_matrix, of_users=False)
print('\n Average rating of movie 15:', train_averages['movie'][15])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def compute_user_similarity(sparse_matrix, compute_for_few=False, top = 100, verbose=False, verb_for_n_rows = 20,
                            draw_time_taken=True):
    """Build a sparse user-user matrix with each user's highest cosine similarities.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix with one row per user.
    compute_for_few : when True, only the first ``top`` users that have at
        least one non-zero entry are processed; otherwise all such users.
    top : number of highest similarities kept per processed user (and, with
        ``compute_for_few``, the number of users processed).
    verbose : print progress messages.
    verb_for_n_rows : a progress line is printed every this many users.
    draw_time_taken : plot the per-user and cumulative computation time.

    Returns
    -------
    (similarity, time_taken) where ``similarity`` is a csr_matrix of shape
    (no_of_users, no_of_users) holding the kept values and ``time_taken`` is
    the list of seconds spent on each processed user.
    """
    no_of_users, _ = sparse_matrix.shape
    # Row indices (users) with at least one non-zero entry, ascending and
    # without repeats. np.unique does this in vectorised code instead of
    # building a Python set from every non-zero entry.
    row_ind, _ = sparse_matrix.nonzero()
    row_ind = np.unique(row_ind)
    time_taken = list()  # seconds spent on each processed user

    # Coordinates and values from which the result matrix is assembled.
    rows, cols, data = list(), list(), list()
    if verbose: print("Computing top",top,"similarities for each user..")

    start = datetime.now()
    temp = 0

    for row in row_ind[:top] if compute_for_few else row_ind:
        temp = temp+1
        prev = datetime.now()

        # Similarity of this user to every row of the matrix (its own row included).
        sim = cosine_similarity(sparse_matrix.getrow(row), sparse_matrix).ravel()
        # Keep only the `top` highest values.
        top_sim_ind = sim.argsort()[-top:]
        top_sim_val = sim[top_sim_ind]
        # Repeat the row index once per kept value. Using len(top_sim_ind)
        # rather than `top` keeps rows/cols/data the same length when the
        # matrix has fewer than `top` rows.
        rows.extend([row]*len(top_sim_ind))
        cols.extend(top_sim_ind)
        data.extend(top_sim_val)
        time_taken.append(datetime.now().timestamp() - prev.timestamp())
        if verbose:
            if temp%verb_for_n_rows == 0:
                print("computing done for {} users [  time elapsed : {}  ]"
                      .format(temp, datetime.now()-start))

    if verbose: print('Creating Sparse matrix from the computed similarities')

    if draw_time_taken:
        plt.plot(time_taken, label = 'time taken for each user')
        plt.plot(np.cumsum(time_taken), label='Total time')
        plt.legend(loc='best')
        plt.xlabel('User')
        plt.ylabel('Time (seconds)')
        plt.show()

    return sparse.csr_matrix((data, (rows, cols)), shape=(no_of_users, no_of_users)), time_taken

In [None]:
# Timing run on a subset: with compute_for_few=True only the first `top`
# (here 100) users that have ratings are processed. The returned list of
# per-user timings is discarded.
start = datetime.now()
u_u_sim_sparse, _ = compute_user_similarity(train_sparse_matrix, compute_for_few=True, top = 100,
                                                     verbose=True)
print("-"*100)
print("Time taken :",datetime.now()-start)

**Computing Movie-Movie similarity Matrix**

In [None]:
start = datetime.now()
# One path is used for the existence check, the save and the load. Previously
# three different names were involved ('m_m_sparse.npz', 'm_m_sim_sparse.npz'
# and 'm_m_sim_sparse' without extension) and nothing was ever saved, so the
# cache could never be hit.
m_m_sim_path = 'm_m_sim_sparse.npz'
if not os.path.isfile(m_m_sim_path):
    print('It seems dont have a file. computing movie_movie smimilarity...')
    # Transposing puts the movies on the rows; dense_output=False keeps the
    # similarity matrix sparse.
    m_m_sim_sparse = cosine_similarity(X=train_sparse_matrix.T, dense_output = False)

    print('saving it to disk without the need of re-computing it again')
    sparse.save_npz(m_m_sim_path, m_m_sim_sparse)
else:
    print('it is there.')
    m_m_sim_sparse = sparse.load_npz(m_m_sim_path)

print("it is a ", m_m_sim_sparse.shape, "dimensional matrix")

print(datetime.now() - start)

For each movie we keep only its 100 most similar movies and store them in a separate dictionary keyed by movie id.

In [None]:
# Column indices that hold at least one non-zero similarity.
movie_ids = np.unique(m_m_sim_sparse.nonzero()[1])

start = datetime.now()
similar_movies = {}
for movie in movie_ids:
    # Dense similarity row of this movie, ranked from highest to lowest.
    scores = m_m_sim_sparse[movie].toarray().ravel()
    # The first ranked entry is skipped; the next 100 are kept.
    sim_movies = np.argsort(scores)[::-1][1:]
    similar_movies[movie] = sim_movies[:100]
print(datetime.now() - start)

# Check the stored result for movie 15.
similar_movies[15]

Finding Most Similar Movie

In [None]:
# Load the title lookup table, indexed by movie_id so that .loc[<movie id>]
# returns that movie's row.
# NOTE(review): titles may themselves contain commas, which would give some
# rows more than three fields — confirm the file parses with this pandas version.
# NOTE(review): the `verbose` argument is presumably deprecated in newer
# pandas releases — confirm.
movie_titles = pd.read_csv("../input/netflix-prize-data/movie_titles.csv", sep=',', header = None,
                           names=['movie_id', 'year_of_release', 'title'], verbose=True,
                      index_col = 'movie_id', encoding = "ISO-8859-1")

movie_titles.head()

In [None]:
# Movie whose neighbours are inspected below.
mv_id = 36

# With movie_id as the index, the row values are (year_of_release, title),
# so position 1 is the title.
print("\nMovie ----->",movie_titles.loc[mv_id].values[1])

# Stored entries in this movie's column of the training matrix.
print("\nIt has {} Ratings from users.".format(train_sparse_matrix[:,mv_id].getnnz()))

# Stored entries in this movie's column of the similarity matrix.
print("\nWe have {} movies which are similarto this  and we will get only top most..".format(m_m_sim_sparse[:,mv_id].getnnz()))

In [None]:
# Similarity of movie mv_id to every movie, as a dense 1-D array.
similarities = m_m_sim_sparse[mv_id].toarray().ravel()

# Indices ordered from most to least similar. The first ranked entry is
# dropped; presumably it is the movie itself.
similar_indices = similarities.argsort()[::-1][1:]

# Same ranking under the second name this cell has always defined. The
# argsort was previously computed twice, and a bare
# `similarities[similar_indices]` expression in the middle of the cell had no
# effect, because only the last expression of a cell is displayed.
sim_indices = similar_indices

In [None]:
# Title rows of the 10 highest-ranked similar movies; the label lookup relies
# on movie_titles being indexed by movie_id.
movie_titles.loc[sim_indices[:10]]

**Now using an ML model (SVD from the `surprise` package)**
* Read the raw text files and combine them into a single dataset, `Data`.

In [None]:
# write a function to read files to a dictionary
def read_file(file_path, nrows = 1000):
    """Read the first ``nrows`` lines of a raw rating file into a DataFrame.

    The file alternates "<movie_id>:" header lines with
    "<user_id>,<rating>,<date>" lines; each rating line belongs to the most
    recent header. Header lines count towards ``nrows``.

    Parameters
    ----------
    file_path : path of the text file to read.
    nrows : maximum number of lines (headers included) to consume.

    Returns
    -------
    DataFrame with integer columns "User", "Movie" and "Rating". The date is
    dropped because it is not needed for prediction.
    """
    datadict = {"User": [], "Movie": [], "Rating": []}
    movie_id = None
    # `with` closes the file even when a malformed line raises.
    with open(file_path, "r") as file:
        for count, raw_line in enumerate(file, start=1):
            if count > nrows:
                break
            # strip() removes "\n" as well as "\r\n"; slicing a fixed number
            # of characters broke on CRLF files and on a last line without a
            # newline.
            line = raw_line.strip()
            if not line:
                continue
            if line.endswith(":"):
                movie_id = int(line[:-1])
            else:
                user_id, rating, _date = line.split(",")
                # Store numbers, not strings: ids looked up later as integers
                # (e.g. in a prediction call) would never match string ids.
                datadict["User"].append(int(user_id))
                datadict["Movie"].append(movie_id)
                datadict["Rating"].append(int(rating))
    return pd.DataFrame(datadict)

In [None]:
nRow = 50000
# Read the first nRow lines of each of the four raw rating files.
filepaths = [
    "/kaggle/input/netflix-prize-data/combined_data_1.txt",
    "/kaggle/input/netflix-prize-data/combined_data_2.txt",
    "/kaggle/input/netflix-prize-data/combined_data_3.txt",
    "/kaggle/input/netflix-prize-data/combined_data_4.txt",
]
df1, df2, df3, df4 = (read_file(filepath, nrows = nRow) for filepath in filepaths)
# Merge all four parts in one call. The earlier chain of .append() calls
# discarded the result of the last one, so df4 never reached Data;
# DataFrame.append is also no longer available in pandas 2.x.
Data = pd.concat([df1, df2, df3, df4], ignore_index=True)
Data.head()

* Using surprise package to use SVD technique

In [None]:
#import the libraries from python surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

In [None]:
# create objects of Reader and SVD classes
reader = Reader()
svd = SVD()

Perform cross validation to check accuracy using evaluation metrics 

In [None]:
# prepare the trainig data set in the order of "item, user, rating"
train_data = Dataset.load_from_df(Data[['User', 'Movie', 'Rating']], reader)
# validate the svd model with cross_validate
cross_validate(svd,train_data,measures = ["RMSE","MAE"],cv = 5,verbose = True)

# build model on the entire data set
train_set = train_data.build_full_trainset()

# fit model on train set
svd.fit(train_set)

In [None]:
# Reload the title table, this time without index_col, so movie_id is a
# regular column (the recommendation step applies a function to that column).
# This rebinds the name movie_titles.
# NOTE(review): titles may themselves contain commas, which would give some
# rows more than three fields — confirm the file parses with this pandas version.
movie_titles = pd.read_csv("../input/netflix-prize-data/movie_titles.csv", sep=',', header = None,
                           names=['movie_id', 'year_of_release', 'title'], verbose=True, encoding = "ISO-8859-1")

movie_titles.head()

Recommending movies for a user: estimate the user's rating for every title and list the titles with the highest estimates.

In [None]:
# use the model for recommendation for a userID, say userID = Data.iloc[0,0]
userID = 823519;#Data.iloc[0,0]; # example of a customer
movie_titles["EstimateRank"] = movie_titles["movie_id"].apply(lambda x: svd.predict(userID,x).est);
movie_titles=movie_titles.sort_values(by=["EstimateRank"], ascending = False)
movie_titles.head(10)