# **IBM Hack CHallenge**
## *AI Based Personalized Electronic Product Recommendation System*
## ***Notebook-1: Implementing the Machine Learning models for product recommendation***

### Participants:
1. Atyam V V R Manoj
2. Narra Varsha



### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import operator
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from textblob import TextBlob
from sklearn.metrics.pairwise import cosine_similarity
# importing the pickle library
import pickle
import os

# to ignore warnings
import warnings 
# Set action = "ignore" to ignore warnings
warnings.filterwarnings(action= 'ignore')


In [2]:
os.getcwd()

'/content'

### Importing the pre-processed datasets

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
new_Elec = pd.read_csv(r'/content/drive/MyDrive/Recommendation-System/ML Mini Project/Pre_Processed Datasets/final_Elec')

In [5]:
new_Elec.head()

Unnamed: 0,UserID,ProductID,rating,tags
0,A2T7YFEAI0X74W,B000P9JTW6,4.0,nifty It is very convenient to have the chargi...
1,A1Y8DLP78Z778V,B0000DFZ2U,5.0,APC P1T 120V Portable Surge Protector Perfect ...
2,AAF3I3SZ6CXGY,B0000DFZ2U,5.0,Insurance policy! This protector is an excelle...
3,ARA1MZKJQLXGE,B0000DFZ2U,5.0,Works great for appliances I bought one of the...
4,A2P9Q94AA63517,B0000DFZ2U,5.0,Single Surge Protector Just right for travel f...


### Sentiment Analysis

In [None]:
#sentimental analysis of the text
new_Elec['polarity'] = new_Elec['tags'].apply(lambda x: TextBlob(x).sentiment.polarity)
new_Elec['review_len'] = new_Elec['tags'].apply(lambda x:len(x))
new_Elec['word_count'] = new_Elec['tags'].apply(lambda x: len(x.split()))

KeyboardInterrupt: ignored

In [None]:
plt.figure(figsize=(8,6))
sns.distplot(new_Elec['rating']);

In [None]:
plt.figure(figsize=(8,6))
sns.countplot(new_Elec['rating']);

In [None]:
sns.catplot(x='rating', y='polarity', data=new_Elec)

In [None]:
new_Elec

### Manipulating the attributes in order to convert them into required form for the ML Model implementation

In [6]:
print("There are ",pd.DataFrame(new_Elec.ProductID.value_counts()).shape[0]," unique products in the dataset")
print("There are ",pd.DataFrame(new_Elec.UserID.value_counts()).shape[0]," unique users in the dataset")
    
new_Elec.groupby('ProductID').count()
    

There are  30584  unique products in the dataset
There are  449774  unique users in the dataset


Unnamed: 0_level_0,UserID,rating,tags
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0226534650,2,2,2
0312282982,43,43,43
0345480902,8,8,8
0393320073,10,10,10
0465019765,8,8,8
...,...,...,...
B000UVIEZC,6,6,6
B000V01RLK,113,113,113
B000V03K2O,3,3,3
B000V1K4WC,2,2,2


In [7]:
print("Number of Ratings for each product:")
num_rating = new_Elec.groupby('ProductID').count()['rating']
num_rating = pd.DataFrame(num_rating)
num_rating

Number of Ratings for each product:


Unnamed: 0_level_0,rating
ProductID,Unnamed: 1_level_1
0226534650,2
0312282982,43
0345480902,8
0393320073,10
0465019765,8
...,...
B000UVIEZC,6
B000V01RLK,113
B000V03K2O,3
B000V1K4WC,2


In [8]:
num_rating.rename(columns = {'rating':'num_ratings'},inplace = True)
num_rating.sort_values(by = 'num_ratings',ascending = False)

Unnamed: 0_level_0,num_ratings
ProductID,Unnamed: 1_level_1
B0002L5R78,7588
B000LRMS66,5875
B000JMJWV2,3298
B0001FTVEK,3188
B000NWS3SG,2965
...,...
B000293QCI,1
B0009EFMIM,1
B000293P6A,1
B000293OH0,1


In [9]:
#Computing the average rating of the products
avg_rating = new_Elec.groupby('ProductID').mean()['rating']
avg_rating = pd.DataFrame(avg_rating)
avg_rating

Unnamed: 0_level_0,rating
ProductID,Unnamed: 1_level_1
0226534650,3.000000
0312282982,2.906977
0345480902,4.375000
0393320073,3.700000
0465019765,4.000000
...,...
B000UVIEZC,3.666667
B000V01RLK,4.371681
B000V03K2O,2.333333
B000V1K4WC,5.000000


In [10]:
avg_rating.rename(columns = {'rating':'avg_ratings'},inplace = True)
avg_rating.sort_values(by = 'avg_ratings',ascending = False)

Unnamed: 0_level_0,avg_ratings
ProductID,Unnamed: 1_level_1
B000HI9OZW,5.0
B000KL2SB8,5.0
B00009UTNN,5.0
B000KL2PEI,5.0
B00009UTNP,5.0
...,...
B00008SCFZ,1.0
B000J35D1Y,1.0
B000KL4N70,1.0
B0002H0MS6,1.0


In [11]:
popular_df = num_rating.merge(avg_rating ,on = 'ProductID')
popular_df

Unnamed: 0_level_0,num_ratings,avg_ratings
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1
0226534650,2,3.000000
0312282982,43,2.906977
0345480902,8,4.375000
0393320073,10,3.700000
0465019765,8,4.000000
...,...,...
B000UVIEZC,6,3.666667
B000V01RLK,113,4.371681
B000V03K2O,3,2.333333
B000V1K4WC,2,5.000000


In [12]:
# making a limit on the ratings
popular_df[popular_df['num_ratings']>2]
x = new_Elec.groupby('UserID').count()['rating'] > 3  
rated_users = x[x].index
rated_users

Index(['A09600262CLBSRBGU2VTY', 'A1009BUD60IYKK', 'A100UD67AHFODS',
       'A100WO06OQR8BQ', 'A101KT366JGXGV', 'A101OAAMZYWQ3U',
       'A10209912CAX51A47I9AW', 'A1028XZRNI0NRP', 'A102EC3XGCBZ81',
       'A1031R8HD3E4GL',
       ...
       'AZWIMAFNLP8VB', 'AZWOPBY75SGAM', 'AZWPFZ9M5HLO9', 'AZWQEM8GKXQ5Y',
       'AZX7GJRLMWN92', 'AZXIGU9MBPYW', 'AZXS6P5QWNMLC', 'AZYJE40XW6MFG',
       'AZZ8HGLS4A7Y4', 'unknown'],
      dtype='object', name='UserID', length=12981)

In [13]:
filtered_rating = new_Elec[new_Elec['UserID'].isin(rated_users)]
filtered_rating

Unnamed: 0,UserID,ProductID,rating,tags
0,A2T7YFEAI0X74W,B000P9JTW6,4.0,nifty It is very convenient to have the chargi...
6,A29KKDPYJM96I,B0000DFZ2U,1.0,$25K warranty is not included in packaging or ...
12,A242NHKVYIYHHP,B0000DFZ2U,5.0,It is a surge protector What can I say it is a...
14,A3V6Z4RCDGRC44,B0000DFZ2U,5.0,"Easy to Carry When you are traveling, often yo..."
35,ALBD3P1D8H1JI,B0001YXWVO,1.0,Didn't Last 6 months I decided to give this a ...
...,...,...,...,...
597109,unknown,B000051JUJ,5.0,Businessman's viewpoint Got it on Christmas 20...
597111,unknown,B000051JUJ,1.0,"Buy something else! Great idea, extremely poor..."
597112,unknown,B000051JUJ,3.0,unhappy with compaq I love my ipaq but the lac...
597120,A1XOK2E84T1WY,B000PH2YR0,5.0,My Favorite card SanDisk is the card I prefer....


In [14]:
y = filtered_rating.groupby('ProductID').count()['rating']>=0
famous_products = y[y].index
famous_products

Index(['0312282982', '0393320073', '0465019765', '0671793969', '0743400054',
       '1587153440', '1590802985', '9043413585', '9864216155', 'B000000O2F',
       ...
       'B000U0FBJ0', 'B000U0S1ZQ', 'B000U0S304', 'B000U0UZUK', 'B000U67NUY',
       'B000UA0JMO', 'B000UF3FUC', 'B000UFQYFU', 'B000V01RLK', 'B000V4XE0I'],
      dtype='object', name='ProductID', length=14167)

## Creating a Pivot table

In [15]:
final_ratings = filtered_rating[filtered_rating['ProductID'].isin(famous_products)]

In [16]:
pivot_table = final_ratings.pivot_table(index='ProductID',columns='UserID',values='rating')
pivot_table.fillna(0,inplace = True)
pivot_table

UserID,A09600262CLBSRBGU2VTY,A1009BUD60IYKK,A100UD67AHFODS,A100WO06OQR8BQ,A101KT366JGXGV,A101OAAMZYWQ3U,A10209912CAX51A47I9AW,A1028XZRNI0NRP,A102EC3XGCBZ81,A1031R8HD3E4GL,...,AZWIMAFNLP8VB,AZWOPBY75SGAM,AZWPFZ9M5HLO9,AZWQEM8GKXQ5Y,AZX7GJRLMWN92,AZXIGU9MBPYW,AZXS6P5QWNMLC,AZYJE40XW6MFG,AZZ8HGLS4A7Y4,unknown
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0312282982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.428571
0393320073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.000000
0465019765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.500000
0671793969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.000000
0743400054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B000UA0JMO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
B000UF3FUC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
B000UFQYFU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
B000V01RLK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [17]:
# Implementing the Cosine Similarity function

from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(pivot_table)
similarity_scores.shape

(14167, 14167)

In [18]:
sorted(list(enumerate(similarity_scores[70])),key = lambda x:x[1],reverse = True)


[(70, 0.9999999999999998),
 (338, 0.37742567804819854),
 (13040, 0.2213776784841062),
 (12751, 0.21459425109358532),
 (12485, 0.20672455764868072),
 (12412, 0.2011685623271948),
 (1053, 0.19998831366131992),
 (10451, 0.18307834546036322),
 (8119, 0.17132130891822073),
 (5755, 0.16723623980804123),
 (309, 0.1409194494516492),
 (310, 0.1409194494516492),
 (661, 0.1409194494516492),
 (12952, 0.13902304219833353),
 (8937, 0.10315617238032847),
 (0, 0.09245003270420485),
 (1, 0.09245003270420485),
 (2, 0.09245003270420485),
 (4, 0.09245003270420485),
 (5, 0.09245003270420485),
 (6, 0.09245003270420485),
 (25, 0.09245003270420485),
 (26, 0.09245003270420485),
 (31, 0.09245003270420485),
 (32, 0.09245003270420485),
 (37, 0.09245003270420485),
 (44, 0.09245003270420485),
 (49, 0.09245003270420485),
 (50, 0.09245003270420485),
 (51, 0.09245003270420485),
 (53, 0.09245003270420485),
 (55, 0.09245003270420485),
 (62, 0.09245003270420485),
 (65, 0.09245003270420485),
 (67, 0.09245003270420485),
 (

In [19]:
similarity_scores

array([[1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.89442719],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.89442719],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        0.89442719],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.89442719, 0.89442719, 0.89442719, ..., 0.        , 0.        ,
        1.        ]])

### Converting the variables into pickle files, so they can be used for further analysis

In [20]:
with open(r'/content/drive/MyDrive/Recommendation-System/ML Mini Project/Elec_pivot_table.pkl', 'wb') as f:pickle.dump(pivot_table,f)

In [21]:
with open(r'/content/drive/MyDrive/Recommendation-System/ML Mini Project/Elec_products_dict.pkl', 'wb') as f:pickle.dump(final_ratings.to_dict(),f)

In [23]:
with open(r'/content/drive/MyDrive/Recommendation-System/ML Mini Project/Elec_similarity_scores.pkl', 'wb') as f:pickle.dump(similarity_scores,f)