In [1]:
import pandas as pd
import numpy as np

In [2]:
################################################################################################################
#                   This is to create a csv file(train.csv) by combining all the four files                    #
################################################################################################################
# # We are reading each of the four files and appending every rating to a single combined file, 'train.csv'
# train = open('train.csv', mode='a')

# row = list()
# files=['netflix/combined_data_1.txt','netflix/combined_data_2.txt', 
#        'netflix/combined_data_3.txt', 'netflix/combined_data_4.txt']
# for file in files:
#     with open(file) as f:
#         for line in f:
#             del row[:]
#             line = line.strip()
#             if line.endswith(':'):
#                 movie_id = line.replace(':', '')
#             else:
#                 row = [x for x in line.split(',')]
#                 row.insert(0, movie_id)
#                 train.write(','.join(row))
#                 train.write('\n')
# train.close()

In [3]:
# train.csv is read with header=None, so the column names are supplied here.
column_names = ['movie', 'user', 'rating', 'date']
train_df = pd.read_csv('train.csv', sep=',', header=None, names=column_names)

In [4]:
# Preview the first five rows to sanity-check the load.
train_df.head(n=5)

Unnamed: 0,movie,user,rating,date
0,1,1488844,3.0,2005-09-06
1,1,822109,5.0,2005-05-13
2,1,885013,4.0,2005-10-19
3,1,30878,4.0,2005-12-26
4,1,823519,3.0,2004-05-03


In [5]:
# Select the 'nbagg' backend. This is done on the bare `matplotlib` module,
# before `matplotlib.pyplot` is imported below.
import matplotlib
matplotlib.use('nbagg')

import matplotlib.pyplot as plt

# Apply seaborn's 'whitegrid' style to the figures drawn in later cells.
import seaborn as sns
sns.set_style('whitegrid')

In [6]:
def human(num, units = 'M'):
    """Format a number scaled down to thousands, millions or billions.

    Parameters
    ----------
    num : anything accepted by ``float()``
        The value to format.
    units : str, default 'M'
        Scale to apply, case-insensitive: 'K' (10**3), 'M' (10**6) or
        'B' (10**9).

    Returns
    -------
    str
        The scaled value followed by a space and the upper-case unit,
        e.g. ``human(1500000)`` -> ``'1.5 M'``.

    Raises
    ------
    ValueError
        If ``units`` is not one of 'K', 'M', 'B'. Previously this case fell
        through every branch and silently returned ``None``.
    """
    divisors = {'k': 10**3, 'm': 10**6, 'b': 10**9}
    unit = units.lower()
    num = float(num)
    if unit not in divisors:
        raise ValueError(
            "units must be one of 'K', 'M' or 'B' (case-insensitive), got %r" % (units,)
        )
    return str(num / divisors[unit]) + " " + unit.upper()

In [7]:
# Bar chart of how many times each rating value occurs in the whole dataset.
fig, ax = plt.subplots()
ax.set_title('Distribution of ratings over entire dataset', fontsize=15)
# Pass the column by keyword and draw on our own axes. A bare positional
# Series was deprecated as a data variable in seaborn 0.11, and newer
# releases treat the first positional argument as `data` instead of `x`.
sns.countplot(x=train_df.rating, ax=ax)
# Relabel the existing y ticks in millions (tick positions are unchanged).
ax.set_yticklabels([human(item, 'M') for item in ax.get_yticks()])
ax.set_ylabel('No. of Ratings(Millions)')

plt.show()

<IPython.core.display.Javascript object>

# Number of ratings per movie id and per user id, ordered by id.
movies = train_df['movie'].value_counts().sort_index()
users = train_df['user'].value_counts().sort_index()
ratings = train_df['rating']

# Headline sizes: one row per rating, one value_counts entry per distinct id.
print("\nTotal no of ratings :", len(train_df))
print("Total No of Users   :", users.size)
print("Total No of movies  :", movies.size)

## For some movies, a few user entries are missing their rating and date

In [8]:
# Boolean frame with the same shape as train_df: True where a cell is missing.
null_df = pd.isnull(train_df)
# Count of missing cells in each column.
null_df.sum()

movie     0
user      0
rating    2
date      2
dtype: int64

In [9]:
# Locate the rows that contain a missing value and display them.
# Boolean mask: True for every row with at least one missing field.
null_rows = null_df.any(axis=1)
# 0-based row positions (not index labels) of those rows, as a Python list.
null_indices = np.flatnonzero(null_rows.values).tolist()
train_df.iloc[null_indices]

Unnamed: 0,movie,user,rating,date
95880696,16992,962,,
95963294,17002,51082,,


In [10]:
# Remove every row that has a missing value. dropna() selects rows by their
# content, not by the positions stored in `null_indices`, so running this
# cell a second time is a no-op. Dropping by saved position would delete
# whichever valid rows had moved into those positions.
train_df.dropna(inplace=True)
# The former NaN positions are now occupied by the rows that followed them.
train_df.iloc[null_indices]

Unnamed: 0,movie,user,rating,date
95880697,16992,506241,5.0,2004-01-08
95963296,17002,2263311,3.0,2005-06-01


 Those entries are gone now: the rows displayed at the same positions are valid ratings that shifted up after the drop.

## Creating sparse matrix for user-movie-rating from dataframe 

In [11]:
from scipy.sparse import csr_matrix
# csr_matrix((data_values, (row_index, col_index)), shape=shape_of_matrix)
# builds a matrix such that MATRIX[row, col] = data.
# Rows are user ids and columns are movie ids. The ids are used directly as
# indices, so each dimension is (largest id + 1).
# The columns are handed over as NumPy arrays and the maxima are taken with
# Series.max(). Converting ~100M values to Python lists and looping over them
# with the builtin max() costs far more time and memory for the same matrix.
sparse_matrix = csr_matrix(
    (train_df.rating.values, (train_df.user.values, train_df.movie.values)),
    shape=(train_df.user.max() + 1, train_df.movie.max() + 1),
)
print('(user, movie) : ',sparse_matrix.shape)

(user, movie) :  (2649430, 17771)


In [12]:
# nonzero() returns two parallel arrays, one pair per non-zero entry:
#   x[k] : row index (user id) of the k-th entry
#   y[k] : column index (movie id) of the k-th entry
x, y = sparse_matrix.nonzero()


# ---- per-user aggregates ----
# Row sums, flattened to a 1-D array: total of all ratings given by each row index.
user_totals = np.asarray(sparse_matrix.sum(axis=1)).ravel()
# How many ratings each row index has.
user_counts = np.bincount(x)
# Keep only the row indices that actually have ratings.
sum_of_ratings_per_user = user_totals[user_totals != 0]
ratings_per_user = user_counts[user_counts != 0]


# ---- per-movie aggregates ----
# There is no movie with id 0, but column indices start at 0, so column 0 is
# skipped with [1:]. No zero-filtering is applied to the movie arrays.
movie_totals = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# Total of all ratings each movie received.
sum_of_ratings_per_movie = movie_totals[1:]
# How many ratings each movie received.
ratings_per_movie = np.bincount(y)[1:]

# AVERAGE rating per user and per movie = total of ratings / number of ratings.
avg_rating_per_user = sum_of_ratings_per_user / ratings_per_user
avg_rating_per_movie = sum_of_ratings_per_movie / ratings_per_movie

In [35]:
# Draw the estimated PDF and CDF of the average rating, per user (left panel)
# and per movie (right panel).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=plt.figaspect(.5))
fig.suptitle('Avg Ratings per User and per Movie', fontsize=15)

panels = (
    (ax1, 'Users-Avg-Ratings', avg_rating_per_user),
    (ax2, 'Movies-Avg-Rating', avg_rating_per_movie),
)
for axis, panel_title, averages in panels:
    axis.set_title(panel_title)
    # Cumulative density curve first, then the plain density curve.
    sns.distplot(averages, ax=axis, hist=False,
                 kde_kws=dict(cumulative=True), label='Cdf')
    sns.distplot(averages, ax=axis, hist=False, label='Pdf')

plt.show()

<IPython.core.display.Javascript object>