##Movies Only by Rating File

In [None]:
!wget 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
!unzip ml-1m.zip

In [13]:
import pandas as pd
import numpy as np
from collections import defaultdict
data_path = 'ml-1m/ratings.dat'
n_users = 6040
n_movies = 3706

def load_rating_data(data_path, n_users, n_movies):
  data = np.zeros([n_users, n_movies], dtype=np.float32)
  movie_id_mapping = {}
  movie_n_rating = defaultdict(int)
  with open(data_path, 'r') as file:
    for line in file.readlines()[1:]:
      user_id, movie_id, rating, _ = line.split("::")
      user_id = int(user_id) - 1
      if movie_id not in movie_id_mapping:
        movie_id_mapping[movie_id] = len(movie_id_mapping)
      rating = int(rating)
      data[user_id, movie_id_mapping[movie_id]] = rating
      if rating > 0:
        movie_n_rating[movie_id] += 1
  return data, movie_n_rating, movie_id_mapping

data, movie_n_rating, movie_id_mapping = load_rating_data(data_path, n_users, n_movies)


def display_distribution(data):
  values, counts = np.unique(data, return_counts=True)
  for value, count in zip(values, counts):
    print('Number of rating {}:{}'.format(int(value), count))

display_distribution(data)

movie_id_most, n_rating_most = sorted(movie_n_rating.items(), key=lambda d: d[1], reverse=True)[0]
print('Movie ID {} has {} ratings.'.format(movie_id_most,n_rating_most))

X_raw = np.delete(data, movie_id_mapping[movie_id_most], axis=1)
Y_raw = data[:, movie_id_mapping[movie_id_most]]
X = X_raw[Y_raw > 0]
Y = Y_raw[Y_raw > 0]
print('Shape of X:', X.shape)
print('Shape of Y:', Y.shape)
display_distribution(Y)
recommend = 3
Y[Y <= recommend] = 0
Y[Y > recommend] = 1
n_pos = (Y == 1).sum()
n_neg = (Y == 0).sum()
print('{} positive samples and {} negative samples.'.format(n_pos, n_neg))
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(len(Y_train), len(Y_test))
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[0:10])
prediction = clf.predict(X_test)
print(prediction[:10])
accuracy = clf.score(X_test, Y_test)
print('The accuracy is: {}'.format(int(accuracy*1000)/10))

Number of rating 0:21384032
Number of rating 1:56174
Number of rating 2:107557
Number of rating 3:261197
Number of rating 4:348971
Number of rating 5:226309
Movie ID 2858 has 3428 ratings.
Shape of X: (3428, 3705)
Shape of Y: (3428,)
Number of rating 1:83
Number of rating 2:134
Number of rating 3:358
Number of rating 4:890
Number of rating 5:1963
2853 positive samples and 575 negative samples.
2742 686
[[7.50487439e-23 1.00000000e+00]
 [1.01806208e-01 8.98193792e-01]
 [3.57740570e-10 1.00000000e+00]
 [1.00000000e+00 2.94095407e-16]
 [1.00000000e+00 2.49760836e-25]
 [7.62630220e-01 2.37369780e-01]
 [3.47479627e-05 9.99965252e-01]
 [2.66075292e-11 1.00000000e+00]
 [5.88493563e-10 9.99999999e-01]
 [9.71326867e-09 9.99999990e-01]]
[1. 1. 1. 0. 0. 0. 1. 1. 1. 1.]
The accuracy is: 71.5


##Exercise 1

In [14]:
import pandas as pd
data_ratings_path = 'ml-1m/ratings.dat'
data_movies_path = 'ml-1m/movies.dat'
data_users_path = 'ml-1m/users.dat'
n_users = 6040
n_movies = 3706

data_ratings = pd.read_csv(data_ratings_path, delimiter = "::", names=['userId','movieId','rating','timestamp'], encoding = "ISO-8859-1")
data_movies = pd.read_csv(data_movies_path, delimiter = "::", names=['movieId','title','genres'], encoding = "ISO-8859-1")
data_users = pd.read_csv(data_users_path, delimiter = "::", names=['userId','gender','age','occupation', 'zipCode'], encoding = "ISO-8859-1")
data = pd.merge(pd.merge(data_ratings, data_movies),data_users)

data.drop(columns = ['timestamp', 'zipCode', 'title'], inplace=True)
mean_values = data[['gender', 'age', 'occupation', 'movieId', 'rating']].groupby(['gender', 'age', 'occupation', 'movieId']).mean()
gender_list = set(data_users['gender'].tolist())
age_list = set(data_users['age'].tolist())
occ_list = set(data_users['occupation'].tolist())
users = data_users.groupby(['gender', 'age', 'occupation', 'userId']).first().index

fi = 1
for gl in gender_list:
  for al in age_list:
    for ol in occ_list:
      temp_list = [item[3] for item in users if item[0] == gl and item[1] == al and item[2] == ol]
      if len(temp_list):
        temp_df = mean_values.loc[gl, al, ol]
        #print('{},{},{}:{},{}'.format(gl,al,ol,len(temp_list), len(temp_df.index)))
        tuples = [(u, m) for m in temp_df.index for u in temp_list]
        index = pd.MultiIndex.from_tuples(tuples, names=["userId", "movieId"])
        if fi:
          s = pd.Series(temp_df['rating'].tolist()*len(temp_list), index=index)
          fi = 0
        else:
          s = pd.concat([s, pd.Series(temp_df['rating'].tolist()*len(temp_list), index=index)])
print(s)

data.drop(columns = ['genres', 'gender', 'age', 'occupation'], inplace=True)
data.set_index(['userId', 'movieId'], inplace=True)
df = s.rename('rate').to_frame()
df['rate'] = df['rate'].apply(lambda x:int(round(x)))
del s
del data_ratings
del data_movies
del data_users

df = pd.concat([data, df], axis=1)
del data
df.rating.fillna(df.rate, inplace=True)
del df['rate']
df

  return func(*args, **kwargs)


userId  movieId
1187    1          4.250000
1926    1          2.500000
2045    1          3.000000
3234    1          5.000000
5296    1          4.000000
                     ...   
5787    3952       3.521739
5815    3952       3.857143
5886    3952       3.000000
5888    3952       3.000000
5897    3952       3.600000
Length: 12219191, dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
1,1,5.0
1,2,4.0
1,3,5.0
1,4,4.0
1,5,1.0
...,...,...
6040,3945,2.0
6040,3947,4.0
6040,3948,4.0
6040,3949,4.0


In [15]:
# sort values by index
A = df.sort_index()

# fill na  
for idx in A.index.names:  
  A = A.unstack(idx).fillna(0).stack(1)

# create a tuple with the rights dimensions
reshape_size = tuple([len(x) for x in A.index.levels])

# reshape
data_input = np.reshape(A.values, reshape_size )#.swapaxes(0,1)

In [20]:
#movie_id_most, n_rating_most = sorted(movie_n_rating.items(), key=lambda d: d[1], reverse=True)[0]
movie_id_most = np.count_nonzero(data_input, axis=1).argmax()
#print('Movie ID {} has {} ratings.'.format(movie_id_most,n_rating_most))

X_raw = np.delete(data_input, movie_id_most, axis=1)
Y_raw = data_input[:, movie_id_most]
X = X_raw[Y_raw > 0]
Y = Y_raw[Y_raw > 0]
print('Shape of X:', X.shape)
print('Shape of Y:', Y.shape)
recommend = 3
Y[Y <= recommend] = 0
Y[Y > recommend] = 1
n_pos = (Y == 1).sum()
n_neg = (Y == 0).sum()
print('{} positive samples and {} negative samples.'.format(n_pos, n_neg))
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(len(Y_train), len(Y_test))
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)
prediction_prob = clf.predict_proba(X_test)
print(prediction_prob[0:10])
prediction = clf.predict(X_test)
print(prediction[:10])
accuracy = clf.score(X_test, Y_test)
print('The accuracy is: {}'.format(int(accuracy*1000)/10))

Shape of X: (5773, 3705)
Shape of Y: (5773,)
3057 positive samples and 2716 negative samples.
4618 1155
[[1.00000000e+000 1.35774388e-081]
 [1.04655665e-041 1.00000000e+000]
 [6.46917742e-051 1.00000000e+000]
 [4.41963009e-033 1.00000000e+000]
 [1.00000000e+000 1.15984346e-024]
 [1.00000000e+000 1.13015189e-055]
 [1.00000000e+000 1.49898297e-064]
 [1.00000000e+000 9.61929491e-138]
 [3.52069634e-055 1.00000000e+000]
 [7.16812260e-060 1.00000000e+000]]
[0. 1. 1. 1. 0. 0. 0. 0. 1. 1.]
The accuracy is: 56.3
