## Libraries

In [1]:
from surprise.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from surprise import Dataset
from surprise import Reader
import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv('similar users.csv')

In [3]:
ratings.head(10)

Unnamed: 0,ITEM,ProductId,Rating,UserId
0,toothpaste,6175005570,5,A39HTATAQ9V7YF
1,whole wheat pasta,1304488608,2,A3JM6GV9MNOF9X
2,white wine,6167061580,4,A1Z513UWSAAO0F
3,grated cheese,1304651104,3,A1WMRR494NWEWV
4,whole weat flour,1304139212,4,A3IAAVS479H7M7
5,eggplant,5357955867,2,AKJHHD5VEH7VG
6,burger sauce,5357955454,2,A1BG8QW55XHN6U
7,french wine,1412759676,5,A22VW0P4VZHDE3
8,cauliflower,6165151329,1,A3V3RE4132GKRO
9,yams,1304139220,4,A327B0I7CYTEJC


### No of Uers and Products

In [4]:
# Total
n_users = ratings.UserId.shape[0]
n_items = ratings.ProductId.shape[0]

print(n_users,n_items)

26375 26375


In [5]:
# Unique
n_users = ratings.UserId.unique().shape[0]
n_items = ratings.ProductId.unique().shape[0]

print(n_users,n_items)

1498 236


### Users who have rated more than 20 times

In [6]:
x = ratings['UserId'].value_counts()>20
x

A2P8AUMTM6O605     True
A11OEJ4R1PDTLN     True
A1Z03R982SX8V2     True
AZ4FR14IPKML7      True
A35XCJ5P6ZKRE9     True
                  ...  
A1JYB44V9JVXS7    False
A2FBSNHDOGYASX    False
A3U0K46TK0WT3M    False
A2OQVECX74KQJQ    False
A18MCI81UN5SA0    False
Name: UserId, Length: 1498, dtype: bool

In [7]:
y = x[x].index
y

Index(['A2P8AUMTM6O605', 'A11OEJ4R1PDTLN', 'A1Z03R982SX8V2', 'AZ4FR14IPKML7',
       'A35XCJ5P6ZKRE9', 'ATPHIBSTV1NY5', 'A1VK4ALI1QOF5U', 'A1ZY0K46FYOTB6',
       'A1MLJHYS35C4X6', 'A3MAATZ091QZGE',
       ...
       'A2RNK4TW1FPXEF', 'A3EB7LZY4SO8S4', 'A38JQUXZTKKHQP', 'A1ZOGA1A495OHV',
       'A2P2FTA24RAFBA', 'A2X0OUA7W3HNZI', 'A2YWHJXGR6R00D', 'A3590W7FEAGQY4',
       'A29WP9HRKLQS0D', 'A3JRZEM49Z26XF'],
      dtype='object', length=415)

In [8]:
ratings = ratings[ratings['UserId'].isin(y)]

In [9]:
ratings.head

<bound method NDFrame.head of                    ITEM    ProductId  Rating          UserId
5              eggplant   5357955867       2   AKJHHD5VEH7VG
6          burger sauce   5357955454       2  A1BG8QW55XHN6U
7           french wine   1412759676       5  A22VW0P4VZHDE3
8           cauliflower   6165151329       1  A3V3RE4132GKRO
9                  yams   1304139220       4  A327B0I7CYTEJC
...                 ...          ...     ...             ...
26370  dark forest cake  12324565978       2  A294IJJRMVOBSE
26371  mustard hot dogs  12324565979       2  A3FPJWX0GHJCCG
26372      dog pet food  12324565980       1   A5UQ98GXX7T47
26373   cheese sandwich  12324565981       5  A13G2YWZQ3522K
26374   protein fat bar  12324565982       4  A1Q3W2FX8LKH19

[11731 rows x 4 columns]>

In [10]:
no_of_ratings = ratings.groupby('ITEM')['Rating'].count().reset_index()

In [11]:
no_of_ratings.rename(columns={'Rating':'No. of Ratings'},inplace = True)
no_of_ratings

Unnamed: 0,ITEM,No. of Ratings
0,Parmigiano Reggiano,3
1,african yams,4
2,aged champagne,3
3,aged red wine,3
4,aged white wine,3
...,...,...
224,whole wheat rice,77
225,whole wheat spagetti,4
226,yams,91
227,yogurt cake,130


In [12]:
final_ratings = ratings.merge(no_of_ratings, on='ITEM')
final_ratings

Unnamed: 0,ITEM,ProductId,Rating,UserId,No. of Ratings
0,eggplant,5357955867,2,AKJHHD5VEH7VG,123
1,eggplant,5357955867,5,A9ZNJ2AXPO652,123
2,eggplant,5357955867,4,A3BMQUL5QATQIW,123
3,eggplant,5357955867,3,A1T8YQ4FFMWQPC,123
4,eggplant,5357955867,5,A1ZY0K46FYOTB6,123
...,...,...,...,...,...
11726,whole wheat spagetti,12324565998,4,AC4343QXC1BCQ,4
11727,oat pasta,12324565999,4,A445U3NN5XBYK,4
11728,oat pasta,12324565999,4,A7SVI5Q1WC3UP,4
11729,oat pasta,12324565999,4,A445U3NN5XBYK,4


In [13]:
# IGNORE THIS CELL
# final_ratings = final_ratings[final_ratings['No. of Ratings']>=90]
# final_ratings.shape

In [14]:
final_ratings.drop_duplicates(['ITEM','UserId'],inplace=True)
final_ratings.shape

(6687, 5)

In [15]:
item_pivot = final_ratings.pivot_table(columns = 'UserId', index = 'ITEM', values = 'Rating')
item_pivot.fillna(0, inplace = True)
item_pivot

UserId,A00205921JHJK5X9LNP42,A024581134CV80ZBLIZTZ,A05492663T95KW63BR75K,A100GYE1W4OXZ8,A10REFE1TW3ZVT,A10ZKDOZOSH219,A11AT6B3912DCU,A11OEJ4R1PDTLN,A11PBM6FRXUJ5Q,A11Q1HQ84F8POH,...,AXG1D956QCMHI,AXVLSEEYTC4Z9,AY25UTTDNJ5SX,AYFZQPNEVSFMD,AYHZNXRPOHMO2,AYI1LDJYGG0ZK,AYM7E80UCJX7I,AZ4FR14IPKML7,AZLE2XFH3JUNG,AZWRTJPN7NXT
ITEM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Parmigiano Reggiano,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
african yams,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aged champagne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aged red wine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aged white wine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
whole wheat rice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
whole wheat spagetti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yams,0.0,0.0,0.0,0.0,5.0,0.0,2.0,5.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0
yogurt cake,0.0,0.0,2.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,...,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
item_sparse = csr_matrix(item_pivot)
item_sparse

<229x415 sparse matrix of type '<class 'numpy.float64'>'
	with 6687 stored elements in Compressed Sparse Row format>

### Model

In [17]:
model = NearestNeighbors(algorithm='brute')

In [18]:
model.fit(item_sparse)

NearestNeighbors(algorithm='brute')

In [44]:
distances, suggestions = model.kneighbors(item_pivot.loc['baby corn', : ].values.reshape(1,-1), n_neighbors = 6)

In [45]:
print(type(item_pivot.loc['mango cream', : ]))

<class 'pandas.core.series.Series'>


In [46]:
suggestions

array([[ 13, 142, 199, 101, 184, 126]], dtype=int64)

In [47]:
#suggestions for turkey by similar users
for i in range(len(suggestions)):
    print(item_pivot.index[suggestions[i]])

Index(['baby corn', 'mint powder', 'strawberry sparkling water',
       'frozon vegetables mix', 'seasoned herb & pepper',
       'low fat cooking oil'],
      dtype='object', name='ITEM')


### Saving the model and the pivot table

In [37]:
#Saving Pivot table
import joblib
pkl_file = open("item_pivot.pkl","wb")
joblib.dump(item_pivot,pkl_file)
pkl_file.close()

In [38]:
#Saving Model
import joblib
pkl_file = open("similarUser.pkl","wb")
joblib.dump(model,pkl_file)
pkl_file.close()

In [39]:
#Load Model and pivot table
data = joblib.load('similarUser.pkl')
pt = joblib.load('item_pivot.pkl')

In [42]:
#Try Outs
a = 'mango cream'
distances, suggestions = data.kneighbors(pt.loc[a, : ].values.reshape(1,-1), n_neighbors = 6)

In [43]:
for i in range(len(suggestions)):
    print(pt.index[suggestions[i]])

Index(['mango cream', 'lemon black tea', 'strawberry sparkling water',
       'mint powder', 'salted toothpaste', 'frozon vegetables mix'],
      dtype='object', name='ITEM')
