# ALS [Alternating Least Squares]

## Building Recommendation system using ALS on MovieLens Dataset

Import the required libraries.

In [1]:
import pandas as pd

In [2]:

# Machine-specific location of the ratings CSV; adjust when running elsewhere.
ratings_csv_path = r"C:\Users\I324158\OneDrive - SAP SE\Desktop\ML\IIITB\Dataset\Capstone\Recommendation_ex\User-based Collaborative Filtering\ratings.csv"
ratings = pd.read_csv(ratings_csv_path)

In [3]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
ratings.shape

(100000, 4)

In [5]:
# Drop the timestamp column; only user_id / movie_id are used to build the
# interaction matrix further down.
# Rebinding `ratings` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
ratings = ratings.drop(columns=['unix_timestamp'], errors='ignore')

In [6]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [7]:
ratings.describe()

Unnamed: 0,user_id,movie_id,rating
count,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986
std,266.61442,330.798356,1.125674
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


In [8]:
# ratings.drop(columns=["unix_timestamp"],inplace=True)

In [9]:
ratings['user_id'].nunique()

943

In [10]:
ratings['movie_id'].nunique()

1682

## Create Sparse User-Item Matrix

In [11]:
from scipy.sparse import csr_matrix

In [12]:
# Constant weight stored for every observed (user, movie) pair when the sparse
# user-item matrix is built from `ratings`.
# NOTE(review): presumably the implicit-feedback confidence scaling factor --
# confirm that 40 is the intended value.
alpha = 40

In [13]:
ratings.shape[0]

100000

In [14]:
# Build the user x movie interaction matrix from (data, (rows, cols)) triples.
# Every rating row contributes the same constant `alpha`; the actual `rating`
# value is not used. Row index = user_id, column index = movie_id.
interaction_weights = [alpha] * ratings.shape[0]
interaction_coords = (ratings['user_id'], ratings['movie_id'])
sparse_user_item = csr_matrix((interaction_weights, interaction_coords))

In [15]:
sparse_user_item

<944x1683 sparse matrix of type '<class 'numpy.intc'>'
	with 100000 stored elements in Compressed Sparse Row format>

### Shape: 944 x 1683. The IDs are 1-based (users 1–943, movies 1–1682), so `csr_matrix` sizes the matrix as max ID + 1 and row 0 / column 0 stay empty.

### Convert to Array

In [16]:
csr_user_array = sparse_user_item.toarray()

In [17]:
csr_user_array

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 40, 40, ...,  0,  0,  0],
       [ 0, 40,  0, ...,  0,  0,  0],
       ...,
       [ 0, 40,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0, 40, ...,  0,  0,  0]], dtype=int32)

In [18]:
csr_user_array.shape

(944, 1683)

In [19]:
len(csr_user_array), len(csr_user_array[0]), csr_user_array[1][1]

(944, 1683, 40)

In [20]:
# Largest stored value in user 1's row. The vectorised ndarray method avoids
# iterating the NumPy row element by element with the Python builtin max().
csr_user_array[1].max()

40

### A CSR (Compressed Sparse Row) matrix stores only the non-zero entries — here, the cells whose value is 40.

In [21]:
print(sparse_user_item)

  (1, 1)	40
  (1, 2)	40
  (1, 3)	40
  (1, 4)	40
  (1, 5)	40
  (1, 6)	40
  (1, 7)	40
  (1, 8)	40
  (1, 9)	40
  (1, 10)	40
  (1, 11)	40
  (1, 12)	40
  (1, 13)	40
  (1, 14)	40
  (1, 15)	40
  (1, 16)	40
  (1, 17)	40
  (1, 18)	40
  (1, 19)	40
  (1, 20)	40
  (1, 21)	40
  (1, 22)	40
  (1, 23)	40
  (1, 24)	40
  (1, 25)	40
  :	:
  (943, 739)	40
  (943, 756)	40
  (943, 763)	40
  (943, 765)	40
  (943, 785)	40
  (943, 794)	40
  (943, 796)	40
  (943, 808)	40
  (943, 816)	40
  (943, 824)	40
  (943, 825)	40
  (943, 831)	40
  (943, 840)	40
  (943, 928)	40
  (943, 941)	40
  (943, 943)	40
  (943, 1011)	40
  (943, 1028)	40
  (943, 1044)	40
  (943, 1047)	40
  (943, 1067)	40
  (943, 1074)	40
  (943, 1188)	40
  (943, 1228)	40
  (943, 1330)	40


### Create item-user sparse matrix

In [22]:
# Movie x user view of the same interactions: transpose the user x movie
# matrix and convert the result back to CSR format.
sparse_item_user = sparse_user_item.transpose().tocsr()

In [23]:
sparse_item_user

<1683x944 sparse matrix of type '<class 'numpy.intc'>'
	with 100000 stored elements in Compressed Sparse Row format>

### Shape: 1683 x 944 — the transpose of the user-item matrix (movies 1–1682 and users 1–943, plus the unused index 0 on each axis).

In [24]:
csr_item_array = sparse_item_user.toarray()

In [25]:
csr_item_array

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 40, 40, ..., 40,  0,  0],
       [ 0, 40,  0, ...,  0,  0, 40],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int32)

In [26]:
len(csr_item_array), len(csr_item_array[0]), csr_item_array[1][1]

(1683, 944, 40)

In [27]:
print(sparse_item_user)

  (1, 1)	40
  (1, 2)	40
  (1, 5)	40
  (1, 6)	40
  (1, 10)	40
  (1, 13)	40
  (1, 15)	40
  (1, 16)	40
  (1, 17)	40
  (1, 18)	40
  (1, 20)	40
  (1, 21)	40
  (1, 23)	40
  (1, 25)	40
  (1, 26)	40
  (1, 38)	40
  (1, 41)	40
  (1, 42)	40
  (1, 43)	40
  (1, 44)	40
  (1, 45)	40
  (1, 49)	40
  (1, 54)	40
  (1, 56)	40
  (1, 57)	40
  :	:
  (1662, 782)	40
  (1663, 782)	40
  (1664, 782)	40
  (1664, 839)	40
  (1664, 870)	40
  (1664, 880)	40
  (1665, 782)	40
  (1666, 782)	40
  (1667, 782)	40
  (1668, 782)	40
  (1669, 782)	40
  (1670, 782)	40
  (1671, 787)	40
  (1672, 828)	40
  (1672, 896)	40
  (1673, 835)	40
  (1674, 840)	40
  (1675, 851)	40
  (1676, 851)	40
  (1677, 854)	40
  (1678, 863)	40
  (1679, 863)	40
  (1680, 863)	40
  (1681, 896)	40
  (1682, 916)	40


## Create train, test data

In [28]:
# !pip install implicit



In [29]:
from implicit.evaluation import train_test_split

In [30]:

# Randomly split the item x user interactions into roughly 80% train / 20% test.
# A fixed random_state makes the split reproducible across kernel restarts;
# without it every run trains and evaluates on a different partition.
# After a re-run, the stored-element counts displayed for `train` / `test`
# will reflect this seed.
train, test = train_test_split(sparse_item_user, train_percentage=0.8, random_state=42)

In [31]:
train

<1683x944 sparse matrix of type '<class 'numpy.intc'>'
	with 80153 stored elements in Compressed Sparse Row format>

In [32]:
test

<1683x944 sparse matrix of type '<class 'numpy.intc'>'
	with 19847 stored elements in Compressed Sparse Row format>

## Building ALS Model

In [33]:
import implicit

In [34]:
#! pip install implicit

In [60]:

# Configure the ALS matrix-factorisation model from the `implicit` library.
# random_state pins the random initialisation of the factors so that repeated
# runs start from the same point and yield comparable recommendations.
model = implicit.als.AlternatingLeastSquares(
    factors=100,                     # number of latent factors
    regularization=0.1,
    iterations=20,                   # matches the 0/20 progress bar shown by fit()
    calculate_training_loss=False,
    random_state=42,
)


In [61]:
model

<implicit.cpu.als.AlternatingLeastSquares at 0x1e01db4c748>

### Train the Model

In [63]:
train

<1683x944 sparse matrix of type '<class 'numpy.intc'>'
	with 80153 stored elements in Compressed Sparse Row format>

In [62]:
model.fit(train)

  0%|          | 0/20 [00:00<?, ?it/s]

## Generating recommendations for a user_id

In [38]:
user_id = 117

In [39]:
model.recommend(user_id, sparse_user_item)

[(147, 1.0728588),
 (79, 1.0258682),
 (118, 0.94741225),
 (125, 0.8830537),
 (100, 0.8824407),
 (323, 0.8654951),
 (89, 0.82259876),
 (245, 0.8086411),
 (204, 0.8049158),
 (95, 0.75922775)]

In [40]:
model.recommend(user_id, sparse_user_item, N=30)

[(147, 1.0728588),
 (79, 1.0258682),
 (118, 0.94741225),
 (125, 0.8830537),
 (100, 0.8824407),
 (323, 0.8654951),
 (89, 0.82259876),
 (245, 0.8086411),
 (204, 0.8049158),
 (95, 0.75922775),
 (324, 0.7466264),
 (180, 0.74484277),
 (228, 0.7423659),
 (255, 0.7364071),
 (246, 0.7361276),
 (820, 0.72876585),
 (455, 0.72262967),
 (123, 0.71611434),
 (568, 0.7074597),
 (425, 0.7064353),
 (69, 0.68466544),
 (28, 0.6795821),
 (871, 0.67897975),
 (3, 0.66700685),
 (42, 0.66180074),
 (347, 0.66068524),
 (685, 0.6512251),
 (97, 0.6510793),
 (182, 0.6320306),
 (269, 0.62892735)]

In [41]:
output = model.recommend(user_id, sparse_user_item)

In [42]:
output

[(147, 1.0728588),
 (79, 1.0258682),
 (118, 0.94741225),
 (125, 0.8830537),
 (100, 0.8824407),
 (323, 0.8654951),
 (89, 0.82259876),
 (245, 0.8086411),
 (204, 0.8049158),
 (95, 0.75922775)]

In [43]:
output_df = pd.DataFrame(output, columns=['movie_id', 'als_score'])

In [44]:
output_df

Unnamed: 0,movie_id,als_score
0,147,1.072859
1,79,1.025868
2,118,0.947412
3,125,0.883054
4,100,0.882441
5,323,0.865495
6,89,0.822599
7,245,0.808641
8,204,0.804916
9,95,0.759228


# Load Movies Data

In [45]:
# Machine-specific location of the movie metadata CSV; adjust when running elsewhere.
movies_csv_path = r'C:\Users\I324158\OneDrive - SAP SE\Desktop\ML\IIITB\Dataset\Capstone\Recommendation_ex\User-based Collaborative Filtering\movie_genres_final.csv'
movies = pd.read_csv(movies_csv_path)

In [46]:
movies.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,The Shawshank Redemption,01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [47]:
movies = movies[['movie_id', 'movie title']]

In [48]:
movies.head()

Unnamed: 0,movie_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,The Shawshank Redemption
3,4,Get Shorty (1995)
4,5,Copycat (1995)


# Merge recommendation output with Movies Data

In [49]:
# Attach movie titles to the recommended movie_ids. A left join keeps every
# recommendation row even if its movie_id has no match in `movies`.
merged = output_df.merge(movies, how='left', on='movie_id')

In [50]:
merged

Unnamed: 0,movie_id,als_score,movie title
0,147,1.072859,"Long Kiss Goodnight, The (1996)"
1,79,1.025868,Fight Club
2,118,0.947412,Twister (1996)
3,125,0.883054,Phenomenon (1996)
4,100,0.882441,Fargo (1996)
5,323,0.865495,Dante's Peak (1997)
6,89,0.822599,Blade Runner (1982)
7,245,0.808641,"Devil's Own, The (1997)"
8,204,0.804916,Back to the Future (1985)
9,95,0.759228,Aladdin (1992)


## Finding similar movies for a movie_id

In [51]:

item_id = 11
n_similar = 10

In [52]:

# Look up the movies closest to `item_id` in the model's item-factor space.
# NOTE: as the output below shows, the first hit is the query movie itself
# (score ~1.0), so only n_similar - 1 of the returned rows are other movies.
similar = model.similar_items(item_id, n_similar)


In [53]:
similar

[(11, 1.0000002),
 (22, 0.44697392),
 (12, 0.3858227),
 (92, 0.33545306),
 (96, 0.31688902),
 (317, 0.29801872),
 (172, 0.28533348),
 (55, 0.2770901),
 (79, 0.275666),
 (1429, 0.27245665)]

In [54]:
type(similar)

list

In [55]:
similar[0]

(11, 1.0000002)

In [56]:
similar_df = pd.DataFrame(similar, columns=['movie_id', 'score'])

In [57]:
similar_df

Unnamed: 0,movie_id,score
0,11,1.0
1,22,0.446974
2,12,0.385823
3,92,0.335453
4,96,0.316889
5,317,0.298019
6,172,0.285333
7,55,0.27709
8,79,0.275666
9,1429,0.272457


# Merge similar-movies output with Movies Data

In [58]:
# Attach movie titles to the similar-movie ids. A left join keeps every
# row of `similar_df` even if its movie_id has no match in `movies`.
merged_similar = similar_df.merge(movies, how='left', on='movie_id')

In [59]:
merged_similar

Unnamed: 0,movie_id,score,movie title
0,11,1.0,Seven (Se7en) (1995)
1,22,0.446974,Braveheart (1995)
2,12,0.385823,"Usual Suspects, The (1995)"
3,92,0.335453,True Romance (1993)
4,96,0.316889,Terminator 2: Judgment Day (1991)
5,317,0.298019,In the Name of the Father (1993)
6,172,0.285333,"Empire Strikes Back, The (1980)"
7,55,0.27709,"Professional, The (1994)"
8,79,0.275666,Fight Club
9,1429,0.272457,Sliding Doors (1998)
