In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os, sys
import re
import seaborn as sns

In [28]:
products = pd.read_csv('newproduct.csv', sep=',', error_bad_lines=False, encoding="latin-1")
products.columns = ['Product_id', 'Product-Title']
users = pd.read_csv('user.csv', sep=',', error_bad_lines=False, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
search = pd.read_csv('user_search.csv', sep=',', error_bad_lines=False, encoding="latin-1")
search.columns = ['userID', 'Product_id', 'search_count']

In [29]:
print (products.shape)
print (users.shape)
print (search.shape)

(2658, 2)
(278859, 3)
(1048575, 3)


In [30]:
#Exploring products dataset
products.head()

Unnamed: 0,Product_id,Product-Title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [42]:
search.head()

Unnamed: 0,userID,Product_id,search_count
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [31]:
users.head()

Unnamed: 0,userID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [32]:
products.dtypes

Product_id       object
Product-Title    object
dtype: object

In [33]:
print( users.shape)
users.head()

(278859, 3)


Unnamed: 0,userID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [34]:
users.dtypes


userID       object
Location     object
Age         float64
dtype: object

In [36]:
print (sorted(users.Age.unique()))

[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0

In [37]:
#Age column has some invalid entries like nan, 0 and very high values like 100 and above
#In my view values below 5 and above 90 do not make much sense for our project...hence replacing these by NaNs
users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan


In [38]:
#replacing NaNs with mean
users.Age = users.Age.fillna(users.Age.mean())

In [39]:
#setting the data type as int
users.Age = users.Age.astype(np.int32)

In [41]:
#rechecking
print (sorted(users.Age.unique()))
#looks good now

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]


In [46]:
#At this point , a simple popularity based recommendation system can be built based on count of user search_counts for different products
ratings_count = pd.DataFrame(search.groupby(['Product_id'])['search_count'].sum())
top10 = ratings_count.sort_values('search_count', ascending = False).head(10)
print( "Following products are recommended" )
top10.merge(products, left_index = True, right_on = 'Product_id')

#Given below are top 10 recommendations based on popularity

Following products are recommended


Unnamed: 0,search_count,Product_id,Product-Title
408,5188,316666343,The Lovely Bones: A Novel
748,3814,385504209,The Da Vinci Code
522,2890,312195516,The Red Tent (Bestselling Backlist)
2143,2546,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...
356,2404,142001740,The Secret Life of Bees
26,2337,971880107,Wild Animus
1105,2292,60928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel
706,2145,446672211,Where the Heart Is (Oprah's Book Club (Paperba...
118,2043,671027360,Angels &amp; Demons
