<a href="https://colab.research.google.com/github/rodolfoarruda/MachineLearning/blob/main/recom_sys_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Rodolfo Arruda - 6381848

### **SCC5966 – Sistemas de Recomendação**

## **Setup**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import numpy as np
np.set_printoptions(suppress=True)

import pandas as pd
import matplotlib.pyplot as plt

# calculate sparsity
from numpy import array
from numpy import count_nonzero

# calculate similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

# Split train test
from sklearn.model_selection import train_test_split

## **1 - Data Preparation**

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/train_data.csv',sep=',')
df.head()

In [None]:
plt.bar(df['rating'].value_counts().index,df['rating'].value_counts())
plt.title('Ratings Distribution')
plt.xlabel('Rating')
plt.ylabel('# Evaluations');

In [None]:
df['rating'].mean()

#### **1.1 - Dummy submission - by average movie**

In [None]:
# Mean rating per movie as a two-column frame: movie_id, rating.
avg_movie = df.groupby('movie_id')['rating'].mean().reset_index()

In [None]:
avg_movie.head()

In [None]:
df_test = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/test_data.csv',sep=',')
df_test.head()

In [None]:
df_test.count()

In [None]:
# The left merge keeps every test row; movies without a training average get NaN.
# fillna(4) then replaces every NaN in the merged frame (in any column) with the constant 4.
# NOTE(review): 4 looks like a hand-picked default rating for unseen movies - confirm.
pred_dummy_movie = pd.merge(df_test, avg_movie, on="movie_id",how="left").fillna(4)

In [None]:
pred_dummy_movie.head()

In [None]:
pred = pred_dummy_movie[['id','rating']]
pred.head()

In [None]:
pred.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/s1.csv',index=False)

#### **1.2 - Dummy submission - by average user**

In [None]:
  avg_user= pd.DataFrame(df['rating'].groupby(df['user_id']).mean())
  avg_user.reset_index(inplace=True)

In [None]:
avg_user.head()

In [None]:
# The left merge keeps every test row; users without a training average get NaN,
# and every NaN in the merged frame (in any column) is replaced by the constant below.
# NOTE(review): 3.603814223642363 is presumably the global mean rating of the
# training data - confirm, and consider computing it instead of hard-coding it.
pred_dummy_user = pd.merge(df_test, avg_user, on="user_id",how="left").fillna(3.603814223642363)

In [None]:
pred_dummy_user.count()

In [None]:
pred = pred_dummy_user[['id','rating']]
pred.head()

In [None]:
pred.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/s2.csv',index=False)

## **2 - Collaborative Filtering based on movie**


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/train_data.csv',sep=',')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1160,5,974769817
1,1,1129,3,974769817
2,1,3328,4,974769817
3,1,2659,2,974769817
4,1,980,3,974769817


#### **2.1 - Data Normalization**

In [None]:
def sub_mean(df):
  """Attach per-movie and per-user mean ratings plus the mean-centred ratings.

  Parameters
  ----------
  df : pandas.DataFrame
      Ratings table containing the columns ``user_id``, ``movie_id`` and ``rating``.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is left untouched) with four extra columns:
      ``avg_movie``, ``rating_avgr_movie`` (rating minus the movie mean),
      ``avg_user`` and ``rating_avgr_user`` (rating minus the user mean).
  """
  # Lookup table: one row per movie with its mean rating.
  movie_means = (
      df.groupby('movie_id')['rating']
        .mean()
        .rename('avg_movie')
        .reset_index()
  )
  centred = df.merge(movie_means, on='movie_id')
  centred['rating_avgr_movie'] = centred['rating'] - centred['avg_movie']

  # Lookup table: one row per user with their mean rating.
  user_means = (
      df.groupby('user_id')['rating']
        .mean()
        .rename('avg_user')
        .reset_index()
  )
  centred = centred.merge(user_means, on='user_id')
  centred['rating_avgr_user'] = centred['rating'] - centred['avg_user']

  return centred

In [None]:
df_norm = sub_mean(df)

In [None]:
df_norm.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,rating_avgr_movie,avg_user,rating_avgr_user
0,1,1160,5,974769817,3.937198,1.062802,3.769231,1.230769
1,1,1129,3,974769817,3.99332,-0.99332,3.769231,-0.769231
2,1,3328,4,974769817,3.662202,0.337798,3.769231,0.230769
3,1,2659,2,974769817,3.688333,-1.688333,3.769231,-1.769231
4,1,980,3,974769817,3.927287,-0.927287,3.769231,-0.769231


#### **2.2 - user x item matrix**


In [None]:
A = df_norm.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

In [None]:
A

movie_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3525,3526,3527,3528,3529,3530,3531,3532,3533,3534,3535,3536,3537,3538,3539,3540,3541,3542,3543,3544,3545,3546,3547,3548,3549,3550,3551,3552,3553,3554,3555,3556,3557,3558,3559,3560,3561,3562,3563,3564
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### **2.3 - Sparsity evaluation**
##### The sparsity of a matrix can be quantified with a score, which is the number of zero values in the matrix divided by the total number of elements in the matrix.

In [None]:
sparsity = 1.0 - count_nonzero(A) /A.size
print(sparsity)

0.9619391144037263


##### A dense matrix stored in a NumPy array can be converted into a sparse matrix using the CSR representation by calling the csr_matrix() function.

In [None]:
from scipy import sparse

In [None]:
train = sparse.csr_matrix((df_norm.rating_avgr_movie, (df_norm.user_id, df_norm.movie_id)))

#### **2.4 - Compute similar movies**

##### A similarity matrix is critical to measure and calculate the similarity between user profiles and movies in order to generate recommendations. To remove movie and user bias, we need to rescale the ratings based on their averages.

In [None]:
similarity = cosine_similarity(train.T, dense_output = False)

In [None]:
print(similarity)

In [None]:
# Reference rating parameters
avg_movie = pd.DataFrame(df_norm['rating'].groupby(df_norm['movie_id']).mean())
avg_movie.reset_index(inplace=True)

# Reference movies
moviex=df_norm['movie_id'].unique()

In [None]:
#moviex = [1160, 1129, 3328]

#### **2.5 - Compute top similar movies**

In [None]:
def sim_knearb(movie,k,similarity):
  """Return the ``k`` strongest neighbours of ``movie`` in a similarity matrix.

  Parameters
  ----------
  movie : int
      Row/column position of the reference item in ``similarity``.
  k : int
      Maximum number of neighbours to return.
  similarity : scipy sparse matrix
      Pairwise similarity matrix; only its non-zero entries are considered.

  Returns
  -------
  pandas.DataFrame
      Columns ``similar`` (row position), ``base`` (column position) and ``w``
      (similarity weight), restricted to ``base == movie`` with the diagonal
      entry removed, sorted by ``w`` descending and truncated to ``k`` rows.
      The index labels are positions in the full non-zero pair table.
  """
  rows, cols, weights = sparse.find(similarity)
  # column_stack builds the same (n, 3) numeric table that np.matrix(...).T
  # produced, without relying on np.matrix, which NumPy no longer recommends.
  pairs = pd.DataFrame(np.column_stack((rows, cols, weights)),
                       columns=['similar','base','w'])

  # Drop self-similarity, then keep only the pairs anchored on the requested item.
  is_neighbour = (pairs['base'] != pairs['similar']) & (pairs['base'] == movie)

  return pairs[is_neighbour].sort_values(by='w',ascending=False).head(k)

In [None]:
def avg_knearb(moviex,k,similarity):
  """Collect the top-``k`` neighbours of every item in ``moviex`` and rank them.

  For each item the selection rule is the one used by ``sim_knearb``: take the
  non-zero, off-diagonal similarity pairs whose ``base`` equals the item, sort
  them by weight descending and keep the first ``k``.

  Parameters
  ----------
  moviex : iterable of int
      Item positions (row/column positions in ``similarity``) to process.
  k : int
      Maximum number of neighbours kept per item.
  similarity : scipy sparse matrix
      Pairwise similarity matrix.

  Returns
  -------
  pandas.DataFrame
      Columns ``similar``, ``base``, ``w`` and ``sub_group_rank`` (rank of ``w``
      within each ``base``, 1 = strongest). The first row is an all-zero seed
      row (``base == 0``) that callers have always received; it is kept for
      backward compatibility.
  """
  # The pair table does not depend on the item, so build it once instead of
  # once per item.
  rows, cols, weights = sparse.find(similarity)
  pairs = pd.DataFrame(np.column_stack((rows, cols, weights)),
                       columns=['similar','base','w'])
  pairs = pairs[pairs['base'] != pairs['similar']]

  # Seed row preserved from the original implementation (see Returns).
  seed = pd.DataFrame({'similar': [0.0], 'base': [0], 'w':[0.0]},
                      columns = ['similar','base','w'])
  frames = [seed]

  for j, i in enumerate(moviex, start=1):
    top = pairs[pairs['base'] == i].sort_values(by='w',ascending=False).head(k)
    frames.append(top)
    print('Iteração #:', j)

  # One concat at the end avoids the quadratic cost of growing the frame in the loop.
  base = pd.concat(frames)
  base['sub_group_rank'] = base.groupby('base')['w'].rank(ascending=False)

  return base

In [None]:
base = avg_knearb(moviex,10,similarity)

In [None]:
base.head()

Unnamed: 0,similar,base,w,sub_group_rank
0,0.0,0.0,0.0,1.0
2970687,1112.0,1160.0,0.293818,1.0
2971797,2368.0,1160.0,0.267087,2.0
2971799,2370.0,1160.0,0.214225,3.0
2971802,2373.0,1160.0,0.211034,4.0


In [None]:
base.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/pto_checagem_sim2.csv',index=False)

#### **2.6 - Predictions**

In [None]:
base = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/pto_checagem_sim2.csv')

In [None]:
base.head()

Unnamed: 0,similar,base,w,sub_group_rank
0,0.0,0.0,0.0,1.0
1,1112.0,1160.0,0.293818,1.0
2,2368.0,1160.0,0.267087,2.0
3,2370.0,1160.0,0.214225,3.0
4,2373.0,1160.0,0.211034,4.0


In [None]:
base.count()

similar           34021
base              34021
w                 34021
sub_group_rank    34021
dtype: int64

In [None]:
 base = base[base['sub_group_rank'] <=6]

In [None]:
df_norm.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,rating_avgr_movie,avg_user,rating_avgr_user
0,1,1160,5,974769817,3.937198,1.062802,3.769231,1.230769
1,1,1129,3,974769817,3.99332,-0.99332,3.769231,-0.769231
2,1,3328,4,974769817,3.662202,0.337798,3.769231,0.230769
3,1,2659,2,974769817,3.688333,-1.688333,3.769231,-1.769231
4,1,980,3,974769817,3.927287,-0.927287,3.769231,-0.769231


In [None]:
x_train, x_test,= train_test_split(df_norm, test_size=0.3, random_state=0)

In [None]:
x_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,rating_avgr_movie,avg_user,rating_avgr_user
71608,227,34,5,974738104,3.884694,1.115306,3.357553,1.642447
276128,2979,171,2,966267076,3.577406,-1.577406,3.503378,-1.503378
270463,2920,967,4,965277713,4.065617,-0.065617,3.645,0.355
263232,2784,1866,3,965340485,3.051471,-0.051471,3.320099,-0.320099
234148,2457,2007,4,965941989,3.571429,0.428571,3.951027,0.048973


In [None]:
# Per-movie mean rating computed from the training split only.
# Grouping on x_train's own movie_id column gives the same table as grouping
# x_train['rating'] by df_norm['movie_id'], but no longer depends on the two
# frames sharing an index.
avg_movie = x_train.groupby('movie_id')['rating'].mean().reset_index()

In [None]:
#x_test_min = x_test[['user_id','movie_id','rating']]

x_test_min = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/test_data.csv')

In [None]:
x_test_min.head()

Unnamed: 0,id,user_id,movie_id,timestamp
0,0,5,2962,974769784
1,1,5,3177,974769768
2,2,5,3153,974769768
3,3,5,501,974769768
4,4,5,3159,974769768


In [None]:
pred_movie_avg = pd.merge(x_test_min,avg_movie, on="movie_id",how="left").fillna(3.603814223642363)

In [None]:
pred_movie_avg.head()


Unnamed: 0,id,user_id,movie_id,timestamp,rating
0,0,5,2962,974769784,3.637931
1,1,5,3177,974769768,3.637931
2,2,5,3153,974769768,3.637931
3,3,5,501,974769768,3.637931
4,4,5,3159,974769768,3.637931


In [None]:
pred_movie_avg2 = pd.merge(pred_movie_avg,base, how='left',left_on=['movie_id'],right_on=['base'])

In [None]:
pred_movie_avg2.head()

Unnamed: 0,id,user_id,movie_id,timestamp,rating,similar,base,w,sub_group_rank
0,0,5,2962,974769784,3.637931,104.0,5.0,0.51903,1.0
1,0,5,2962,974769784,3.637931,725.0,5.0,0.497589,2.0
2,0,5,2962,974769784,3.637931,112.0,5.0,0.478498,3.0
3,0,5,2962,974769784,3.637931,611.0,5.0,0.460403,4.0
4,0,5,2962,974769784,3.637931,360.0,5.0,0.439991,5.0


In [None]:
df_norm[['rating_avgr_movie','user_id','movie_id']].dtypes

rating_avgr_movie    float64
user_id                int64
movie_id               int64
dtype: object

id                  int64
user_id             int64
movie_id            int64
timestamp           int64
rating            float64
similar             int64
base              float64
w                 float64
sub_group_rank    float64
dtype: object

In [None]:
# For every (test row, neighbour movie) pair, look up the movie-centred rating
# that the same user gave to the neighbour movie (`similar`). Pairs the user
# never rated come back as NaN from the left merge; fillna(0) then replaces
# every NaN in the merged frame with 0.
pred_movie_avg3 = pd.merge(
    pred_movie_avg2,
    df_norm[['rating_avgr_movie','user_id','movie_id']],
    how='left',
    left_on=['user_id','similar'],
    right_on=['user_id','movie_id'],
).fillna(0)

In [None]:
pred_movie_avg3.head()

Unnamed: 0,id,user_id_x,movie_id,timestamp,rating,similar,base,w,sub_group_rank,avg_user,user_id_y
0,0,5,2962,974769784,3.637931,104,5.0,0.51903,1.0,0.0,0.0
1,0,5,2962,974769784,3.637931,725,5.0,0.497589,2.0,0.0,0.0
2,0,5,2962,974769784,3.637931,112,5.0,0.478498,3.0,3.111111,112.0
3,0,5,2962,974769784,3.637931,611,5.0,0.460403,4.0,0.0,0.0
4,0,5,2962,974769784,3.637931,360,5.0,0.439991,5.0,0.0,0.0


In [None]:
pred_movie_avg3['avg_pond'] = pred_movie_avg3['w']* pred_movie_avg3['rating_avgr_movie']

In [None]:
pred_movie_avg3.head()

Unnamed: 0,id,user_id_x,movie_id,timestamp,rating,similar,base,w,sub_group_rank,avg_user,user_id_y,avg_pond
0,0,5,2962,974769784,3.637931,104,5.0,0.51903,1.0,0.0,0.0,0.0
1,0,5,2962,974769784,3.637931,725,5.0,0.497589,2.0,0.0,0.0,0.0
2,0,5,2962,974769784,3.637931,112,5.0,0.478498,3.0,3.111111,112.0,1.48866
3,0,5,2962,974769784,3.637931,611,5.0,0.460403,4.0,0.0,0.0,0.0
4,0,5,2962,974769784,3.637931,360,5.0,0.439991,5.0,0.0,0.0,0.0


In [None]:
pred_movie_avg3.count()

id                23250
user_id_x         23250
movie_id          23250
timestamp         23250
rating            23250
similar           23250
base              23250
w                 23250
sub_group_rank    23250
avg_user          23250
user_id_y         23250
avg_pond          23250
dtype: int64

In [None]:
pred_movie_avg3.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/escoragem_v3.csv',index=False)

## **3 - Collaborative filtering based on user**


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/train_data.csv',sep=',')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1160,5,974769817
1,1,1129,3,974769817
2,1,3328,4,974769817
3,1,2659,2,974769817
4,1,980,3,974769817


#### **3.1 - Data Normalization**

In [None]:
def sub_mean(df):
  """Attach per-movie and per-user mean ratings plus the mean-centred ratings.

  Parameters
  ----------
  df : pandas.DataFrame
      Ratings table containing the columns ``user_id``, ``movie_id`` and ``rating``.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is left untouched) with four extra columns:
      ``avg_movie``, ``rating_avgr_movie`` (rating minus the movie mean),
      ``avg_user`` and ``rating_avgr_user`` (rating minus the user mean).
  """
  # Lookup table: one row per movie with its mean rating.
  movie_means = (
      df.groupby('movie_id')['rating']
        .mean()
        .rename('avg_movie')
        .reset_index()
  )
  centred = df.merge(movie_means, on='movie_id')
  centred['rating_avgr_movie'] = centred['rating'] - centred['avg_movie']

  # Lookup table: one row per user with their mean rating.
  user_means = (
      df.groupby('user_id')['rating']
        .mean()
        .rename('avg_user')
        .reset_index()
  )
  centred = centred.merge(user_means, on='user_id')
  centred['rating_avgr_user'] = centred['rating'] - centred['avg_user']

  return centred

In [None]:
df_norm = sub_mean(df)

In [None]:
df_norm.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,rating_avgr_movie,avg_user,rating_avgr_user
0,1,1160,5,974769817,3.937198,1.062802,3.769231,1.230769
1,1,1129,3,974769817,3.99332,-0.99332,3.769231,-0.769231
2,1,3328,4,974769817,3.662202,0.337798,3.769231,0.230769
3,1,2659,2,974769817,3.688333,-1.688333,3.769231,-1.769231
4,1,980,3,974769817,3.927287,-0.927287,3.769231,-0.769231


#### **3.2 - item x user matrix**


In [None]:
A = df_norm.pivot(index='movie_id', columns='user_id', values='rating').fillna(0)

In [None]:
A

user_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3934,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952,3953,3954,3955,3956,3957,3958,3959,3960,3961,3962,3963,3964,3965,3966,3967,3968,3969,3970,3971,3972,3973,3974
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3560,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3561,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### **3.3 - Sparsity evaluation**
##### The sparsity of a matrix can be quantified with a score, which is the number of zero values in the matrix divided by the total number of elements in the matrix.

In [None]:
sparsity = 1.0 - count_nonzero(A) /A.size
print(sparsity)

0.9619391144037263


##### A dense matrix stored in a NumPy array can be converted into a sparse matrix using the CSR representation by calling the csr_matrix() function.

In [None]:
from scipy import sparse

In [None]:
# Sparse matrix of ratings with movie_id as the row index and user_id as the
# column index.
# NOTE(review): the item-based section builds its matrix from the mean-centred
# column `rating_avgr_movie`, whereas this one uses the raw `rating`; confirm
# this is intentional (a centred column such as `rating_avgr_user` may have
# been intended).
train = sparse.csr_matrix((df_norm.rating, (df_norm.movie_id,df_norm.user_id)))

#### **3.4 - Compute similar user**

##### A similarity matrix is critical to measure and calculate the similarity between user profiles and movies in order to generate recommendations. To remove movie and user bias, we need to rescale the ratings based on their averages.

In [None]:
similarity = cosine_similarity(train.T, dense_output = False)

In [None]:
print(similarity)

In [None]:
# Reference rating parameters
# NOTE: despite its name, `avg_movie` holds the mean rating per *user* here
# (the ratings are grouped by user_id).
avg_movie = pd.DataFrame(df_norm['rating'].groupby(df_norm['user_id']).mean())
avg_movie.reset_index(inplace=True)

# Reference users: every distinct user_id present in df_norm
userx=df_norm['user_id'].unique()

In [None]:
userx

array([   1,   32,  107, ..., 3943, 3851, 3933])

In [None]:
#userx = [3943, 3851, 3933]

#### **3.5 - Compute top similar users**

In [None]:
def sim_knearb(user,k,similarity):
  """Return the ``k`` strongest neighbours of ``user`` in a similarity matrix.

  Parameters
  ----------
  user : int
      Row/column position of the reference user in ``similarity``.
  k : int
      Maximum number of neighbours to return.
  similarity : scipy sparse matrix
      Pairwise similarity matrix; only its non-zero entries are considered.

  Returns
  -------
  pandas.DataFrame
      Columns ``similar`` (row position), ``base`` (column position) and ``w``
      (similarity weight), restricted to ``base == user`` with the diagonal
      entry removed, sorted by ``w`` descending and truncated to ``k`` rows.
      The index labels are positions in the full non-zero pair table.
  """
  rows, cols, weights = sparse.find(similarity)
  # column_stack builds the same (n, 3) numeric table that np.matrix(...).T
  # produced, without relying on np.matrix, which NumPy no longer recommends.
  pairs = pd.DataFrame(np.column_stack((rows, cols, weights)),
                       columns=['similar','base','w'])

  # Drop self-similarity, then keep only the pairs anchored on the requested user.
  is_neighbour = (pairs['base'] != pairs['similar']) & (pairs['base'] == user)

  return pairs[is_neighbour].sort_values(by='w',ascending=False).head(k)

In [None]:
def avg_knearb(moviex,k,similarity):
  """Collect the top-``k`` neighbours of every id in ``moviex`` and rank them.

  The ids are taken from the ``moviex`` argument itself (in this section the
  caller passes user ids); the function no longer reads the notebook-level
  ``userx`` variable. For each id the selection rule is the one used by
  ``sim_knearb``: take the non-zero, off-diagonal similarity pairs whose
  ``base`` equals the id, sort them by weight descending and keep the first ``k``.

  Parameters
  ----------
  moviex : iterable of int
      Positions (row/column positions in ``similarity``) to process. The name
      is kept for interface compatibility.
  k : int
      Maximum number of neighbours kept per id.
  similarity : scipy sparse matrix
      Pairwise similarity matrix.

  Returns
  -------
  pandas.DataFrame
      Columns ``similar``, ``base``, ``w`` and ``sub_group_rank`` (rank of ``w``
      within each ``base``, 1 = strongest). The first row is an all-zero seed
      row (``base == 0``) that callers have always received; it is kept for
      backward compatibility.
  """
  # The pair table does not depend on the id, so build it once instead of
  # once per id.
  rows, cols, weights = sparse.find(similarity)
  pairs = pd.DataFrame(np.column_stack((rows, cols, weights)),
                       columns=['similar','base','w'])
  pairs = pairs[pairs['base'] != pairs['similar']]

  # Seed row preserved from the original implementation (see Returns).
  seed = pd.DataFrame({'similar': [0.0], 'base': [0], 'w':[0.0]},
                      columns = ['similar','base','w'])
  frames = [seed]

  # Iterate over the argument, not over a global, so the function honours its input.
  for j, i in enumerate(moviex, start=1):
    top = pairs[pairs['base'] == i].sort_values(by='w',ascending=False).head(k)
    frames.append(top)
    print('Iteração #:', j)

  # One concat at the end avoids the quadratic cost of growing the frame in the loop.
  base = pd.concat(frames)
  base['sub_group_rank'] = base.groupby('base')['w'].rank(ascending=False)

  return base

In [None]:
base = avg_knearb(userx,10,similarity)

In [None]:
base.head()

Unnamed: 0,similar,base,w,sub_group_rank
0,0.0,0.0,0.0,1.0
1669,1988.0,1.0,0.65691,1.0
2340,2743.0,1.0,0.550155,2.0
3184,3722.0,1.0,0.549234,3.0
2420,2834.0,1.0,0.511894,4.0


In [None]:
base.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/pto_checagem_user.csv',index=False)

#### **3.6 - Predictions**

In [None]:
base = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/pto_checagem_user.csv')

In [None]:
base.head()

Unnamed: 0,similar,base,w,sub_group_rank
0,0.0,0.0,0.0,1.0
1,1988.0,1.0,0.65691,1.0
2,2743.0,1.0,0.550155,2.0
3,3722.0,1.0,0.549234,3.0
4,2834.0,1.0,0.511894,4.0


In [None]:
base.count()

similar           39521
base              39521
w                 39521
sub_group_rank    39521
dtype: int64

In [None]:
 base = base[base['sub_group_rank'] <=6]

In [None]:
df_norm.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,rating_avgr_movie,avg_user,rating_avgr_user
0,1,1160,5,974769817,3.937198,1.062802,3.769231,1.230769
1,1,1129,3,974769817,3.99332,-0.99332,3.769231,-0.769231
2,1,3328,4,974769817,3.662202,0.337798,3.769231,0.230769
3,1,2659,2,974769817,3.688333,-1.688333,3.769231,-1.769231
4,1,980,3,974769817,3.927287,-0.927287,3.769231,-0.769231


In [None]:
x_train, x_test,= train_test_split(df_norm, test_size=0.3, random_state=0)

In [None]:
x_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,rating_avgr_movie,avg_user,rating_avgr_user
71608,227,34,5,974738104,3.884694,1.115306,3.357553,1.642447
276128,2979,171,2,966267076,3.577406,-1.577406,3.503378,-1.503378
270463,2920,967,4,965277713,4.065617,-0.065617,3.645,0.355
263232,2784,1866,3,965340485,3.051471,-0.051471,3.320099,-0.320099
234148,2457,2007,4,965941989,3.571429,0.428571,3.951027,0.048973


In [None]:
avg_movie = pd.DataFrame(df_norm['rating'].groupby(df_norm['user_id']).mean())
avg_movie.reset_index(inplace=True)

In [None]:
avg_movie.head()

Unnamed: 0,user_id,rating
0,1,3.769231
1,2,3.428571
2,3,3.818182
3,4,4.375
4,5,3.637931


In [None]:
#x_test_min = x_test[['user_id','movie_id','rating']]

x_test_min = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/test_data.csv')

In [None]:
x_test_min.head()

Unnamed: 0,id,user_id,movie_id,timestamp
0,0,5,2962,974769784
1,1,5,3177,974769768
2,2,5,3153,974769768
3,3,5,501,974769768
4,4,5,3159,974769768


In [None]:
pred_movie_avg = pd.merge(x_test_min,avg_movie, on="user_id",how="left").fillna(3.603814223642363)

In [None]:
pred_movie_avg.head()


Unnamed: 0,id,user_id,movie_id,timestamp,rating
0,0,5,2962,974769784,3.637931
1,1,5,3177,974769768,3.637931
2,2,5,3153,974769768,3.637931
3,3,5,501,974769768,3.637931
4,4,5,3159,974769768,3.637931


In [None]:
pred_movie_avg2 = pd.merge(pred_movie_avg,base, how='left',left_on=['user_id'],right_on=['base'])

In [None]:
pred_movie_avg2.head()

Unnamed: 0,id,user_id,movie_id,timestamp,rating,similar,base,w,sub_group_rank
0,0,5,2962,974769784,3.637931,104.0,5.0,0.51903,1.0
1,0,5,2962,974769784,3.637931,725.0,5.0,0.497589,2.0
2,0,5,2962,974769784,3.637931,112.0,5.0,0.478498,3.0
3,0,5,2962,974769784,3.637931,611.0,5.0,0.460403,4.0
4,0,5,2962,974769784,3.637931,360.0,5.0,0.439991,5.0


In [None]:
df_norm[['rating_avgr_movie','user_id','movie_id']].dtypes

rating_avgr_movie    float64
user_id                int64
movie_id               int64
dtype: object

In [None]:
pred_movie_avg2['similar'] = pred_movie_avg2['similar'].fillna(0).astype(int)
pred_movie_avg2.dtypes

id                  int64
user_id             int64
movie_id            int64
timestamp           int64
rating            float64
similar             int64
base              float64
w                 float64
sub_group_rank    float64
dtype: object

In [None]:
pred_movie_avg3 = pd.merge(pred_movie_avg2,df_norm[['rating_avgr_user','user_id','movie_id']],\
                           how='left',left_on=['movie_id','similar'],right_on=['movie_id','user_id']).fillna(0)

In [None]:
pred_movie_avg3.head()

Unnamed: 0,id,user_id_x,movie_id,timestamp,rating,similar,base,w,sub_group_rank,rating_avgr_user,user_id_y
0,0,5,2962,974769784,3.637931,104,5.0,0.51903,1.0,0.0,0.0
1,0,5,2962,974769784,3.637931,725,5.0,0.497589,2.0,0.0,0.0
2,0,5,2962,974769784,3.637931,112,5.0,0.478498,3.0,-2.111111,112.0
3,0,5,2962,974769784,3.637931,611,5.0,0.460403,4.0,0.0,0.0
4,0,5,2962,974769784,3.637931,360,5.0,0.439991,5.0,0.0,0.0


In [None]:
pred_movie_avg3['avg_pond'] = pred_movie_avg3['w']* pred_movie_avg3['rating_avgr_user']

In [None]:
pred_movie_avg3.head()

Unnamed: 0,id,user_id_x,movie_id,timestamp,rating,similar,base,w,sub_group_rank,rating_avgr_user,user_id_y,avg_pond
0,0,5,2962,974769784,3.637931,104,5.0,0.51903,1.0,0.0,0.0,0.0
1,0,5,2962,974769784,3.637931,725,5.0,0.497589,2.0,0.0,0.0,0.0
2,0,5,2962,974769784,3.637931,112,5.0,0.478498,3.0,-2.111111,112.0,-1.010162
3,0,5,2962,974769784,3.637931,611,5.0,0.460403,4.0,0.0,0.0,0.0
4,0,5,2962,974769784,3.637931,360,5.0,0.439991,5.0,0.0,0.0,0.0


In [None]:
pred_movie_avg3.count()

id                  23250
user_id_x           23250
movie_id            23250
timestamp           23250
rating              23250
similar             23250
base                23250
w                   23250
sub_group_rank      23250
rating_avgr_user    23250
user_id_y           23250
avg_pond            23250
dtype: int64

In [None]:
pred_movie_avg3.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/escoragem_v4.csv',index=False)

## **4 - Baseline**

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/train_data.csv',sep=',')
df.head()

x_train, x_test,= train_test_split(df, test_size=0.3, random_state=0)

In [3]:
x_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
71608,697,2348,1,974754467
276128,2214,1351,4,967388817
270463,2180,1134,1,967022501
263232,2143,742,3,967234398
234148,1953,1708,4,967919937


In [4]:
global_mean = x_train['rating'].mean()
global_mean

3.6036080715001813

In [20]:
def sub_mean(df):
  """Compute the global mean rating and each movie's average offset from it.

  Parameters
  ----------
  df : pandas.DataFrame
      Ratings table containing the columns ``movie_id`` and ``rating``.

  Returns
  -------
  tuple
      ``(global_mean, avg_movie)`` where ``global_mean`` is the mean of
      ``df['rating']`` and ``avg_movie`` is a frame with the columns
      ``movie_id`` and ``avg_movie`` (mean of ``rating - global_mean`` per movie).
  """
  overall = df['rating'].mean()

  # Per-movie mean of the ratings after removing the global mean.
  offsets = df['rating'] - overall
  movie_bias = (
      offsets.groupby(df['movie_id'])
             .mean()
             .rename('avg_movie')
             .reset_index()
  )

  return overall, movie_bias

def sub_mean2(df,global_mean):
  """Compute each user's mean residual after removing global and movie effects.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide the columns ``rating``, ``avg_movie`` and ``user_id``.
  global_mean : float
      Value subtracted from every rating before the movie offset.

  Returns
  -------
  pandas.DataFrame
      One row per ``user_id`` with an ``avg_user`` column holding the mean of
      ``rating - global_mean - avg_movie`` over that user's rows.
  """
  # What is left of each rating once the global mean and movie offset are removed.
  residual = df['rating'] - global_mean - df['avg_movie']

  per_user = residual.groupby(df['user_id']).mean()
  avg_user = per_user.rename('avg_user').reset_index()

  return avg_user

In [21]:
# Fit the global mean and the per-movie offsets on the training split only.
global_mean, avg_movie = sub_mean(x_train)

In [22]:
# Attach each movie's offset to its training rows (default inner join on movie_id).
x_train1 = x_train.merge(avg_movie, on="movie_id")

In [23]:
# Preview the training rows with the `avg_movie` column attached.
x_train1.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie
0,697,2348,1,974754467,-1.237629
1,1801,2348,1,968814334,-1.237629
2,2402,2348,3,966068330,-1.237629
3,2921,2348,2,965277359,-1.237629
4,648,2348,2,974677635,-1.237629


In [24]:
# Per-user offsets, computed from the training rows after removing global_mean and avg_movie.
avg_user = sub_mean2(x_train1,global_mean)

In [26]:
# Preview the per-user offsets.
avg_user.head()

Unnamed: 0,user_id,avg_user
0,1,-0.149574
1,2,-0.182651
2,3,0.237893
3,4,0.346358
4,5,-0.00886


In [29]:
# Attach each user's offset to the training rows (default inner join on user_id).
x_train2 = x_train1.merge(avg_user, on="user_id")

In [31]:
# Baseline prediction for each training row: global mean + user offset + movie offset.
x_train2['predict'] = global_mean + x_train2['avg_user'] + x_train2['avg_movie']

In [32]:
# Preview ratings next to their baseline predictions.
x_train2.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,avg_user,predict
0,697,2348,1,974754467,-1.237629,-0.087227,2.278753
1,697,2924,4,974756147,0.102836,-0.087227,3.619217
2,697,2286,5,974755277,0.742121,-0.087227,4.258502
3,697,1690,2,974751475,-0.436941,-0.087227,3.07944
4,697,562,5,974755555,0.30397,-0.087227,3.820352


In [34]:
# RMSE of the baseline on the training rows it was fitted on.
x_train2['rating'].sub(x_train2['predict']).pow(2).mean() ** .5

0.8892404516947805

In [53]:
# Display the fitted global mean.
global_mean

3.6036080715001813

In [52]:
# Estimativas
#global_mean = 3.6036080715001813
avg_user.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/baseline_avg_user.csv',index=False)
avg_movie.to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/baseline_avg_movie.csv',index=False)

#### **4.1 - Predictions**

In [44]:
# Unweighted mean of the per-user offsets (one value per user, not per rating).
avg_user['avg_user'].mean()

0.025793103889861925

In [47]:
# Attach the fitted movie and user offsets to the hold-out rows. The left joins
# keep rows whose movie or user is absent from the fitted tables; only the
# offset column is imputed (with the mean offset), so a NaN in any other column
# is never overwritten by a bias value.
df_test1 = pd.merge(x_test, avg_movie, on="movie_id",how='left').fillna({'avg_movie': avg_movie['avg_movie'].mean()})
df_test2 = pd.merge(df_test1, avg_user, on="user_id",how='left').fillna({'avg_user': avg_user['avg_user'].mean()})
# Predictions
df_test2['predict'] = global_mean + df_test2['avg_user'] + df_test2['avg_movie']
df_test2.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_movie,avg_user,predict
0,3142,2842,3,964983390,0.172511,-0.31159,3.464529
1,316,3531,2,974710002,-1.300578,0.442251,2.745282
2,1425,1675,5,972958612,0.648728,0.291974,4.54431
3,3022,2027,5,965165835,0.156857,0.3441,4.104565
4,2218,1775,5,966694023,0.73741,-0.268923,4.072095


In [48]:
# RMSE of the baseline on the hold-out rows.
df_test2['rating'].sub(df_test2['predict']).pow(2).mean() ** .5

0.9088475951084722

In [40]:
# Load test_data.csv from Drive as df_valid, the rows to be scored for submission.
df_valid = pd.read_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/test_data.csv',sep=',')
# Non-null count per column.
df_valid.count()

id           3970
user_id      3970
movie_id     3970
timestamp    3970
dtype: int64

In [49]:
# Attach the fitted movie and user offsets to the rows to be scored. The left
# joins keep rows whose movie or user is absent from the fitted tables; only the
# offset column is imputed (with the mean offset), so a NaN in any other column
# is never overwritten by a bias value.
df_valid1 = pd.merge(df_valid, avg_movie, on="movie_id",how='left').fillna({'avg_movie': avg_movie['avg_movie'].mean()})
df_valid2 = pd.merge(df_valid1, avg_user, on="user_id",how='left').fillna({'avg_user': avg_user['avg_user'].mean()})
# Predictions
df_valid2['predict'] = global_mean + df_valid2['avg_user'] + df_valid2['avg_movie']
df_valid2.head()

Unnamed: 0,id,user_id,movie_id,timestamp,avg_movie,avg_user,predict
0,0,5,2962,974769784,-0.152741,-0.00886,3.442007
1,1,5,3177,974769768,-0.298675,-0.00886,3.296073
2,2,5,3153,974769768,-0.736941,-0.00886,2.857807
3,3,5,501,974769768,0.018014,-0.00886,3.612761
4,4,5,3159,974769768,-0.567894,-0.00886,3.026854


In [50]:
# Write only the `id` and `predict` columns to Drive as CSV (row index omitted).
df_valid2[['id','predict']].to_csv('/content/drive/MyDrive/Doutorado/disciplinas/recom_sys/scc5966/baseline_v2.csv',index=False)

## **5 - Gradiente**

### **10 - Results**

##### Movie-based collaborative filtering with K = 4: RMSE 1.04849
##### Movie-based collaborative filtering with K = 6: RMSE 1.04461
##### Average of movie-based collaborative filtering (K = 4) and the dummy per-user average: RMSE 1.00042
##### Average of movie-based collaborative filtering (K = 6) and the dummy per-user average: RMSE 0.99707
##### User-based collaborative filtering with K = 6: RMSE 1.12605
##### User-based collaborative filtering with K = 3: RMSE 1.14487
##### Baseline: RMSE 0.95653



## **References**

##### https://machinelearningmastery.com/sparse-matrices-for-machine-learning/

##### https://pub.towardsai.net/recommendation-system-in-depth-tutorial-with-python-for-netflix-using-collaborative-filtering-533ff8a0e444