#  <center> Exploratory Data Analysis </center>

## Importing necessary modules for data analysis

In [33]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pickle

## 1) Importing events dataset

In [3]:
# Reading event datasets 
events = pd.read_csv("../data/events.csv")
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
# Describing quantitative features of events dataset
print ("No. of rows in events dataset : {0}".format(len(events)))
events.describe()

No. of rows in events dataset : 2756101


Unnamed: 0,timestamp,visitorid,itemid,transactionid
count,2756101.0,2756101.0,2756101.0,22457.0
mean,1436424000000.0,701922.9,234922.5,8826.497796
std,3366312000.0,405687.5,134195.4,5098.99629
min,1430622000000.0,0.0,3.0,0.0
25%,1433478000000.0,350566.0,118120.0,4411.0
50%,1436453000000.0,702060.0,236067.0,8813.0
75%,1439225000000.0,1053437.0,350715.0,13224.0
max,1442545000000.0,1407579.0,466867.0,17671.0


In [5]:
# statistics for categorical variables
events.describe(include=[np.object])

Unnamed: 0,event
count,2756101
unique,3
top,view
freq,2664312


## 2) Importing item_properites dataset

In [6]:
item_prop = pd.read_csv("../data/item_properties_part1.csv")
print ("No. of rows in item_prop : {0}".format(len(item_prop)))
item_prop.head()

No. of rows in item_prop : 10999999


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [31]:
def getItemIdPropVector(item_prop, itemidvalue):
    """
    Given   : item_prop (dataframe), itemid
    Returns : get vector-set of 'property' in 'item_prop' DF for which item_prop.itemid == itemidvalue
    """
    
    return set(item_prop.loc[item_prop.itemid == itemidvalue].property.values)

def getListOfItemId(item_prop):
    """
    Given   : item_prop dataframe
    Returns : unique set of item_prop.itemid
    """
    
    return set(item_prop.itemid.drop_duplicates().values)

def getDictOfItemIdVector(item_prop):
    """
    Given   : item_prop as pandas dataframe
    Return  : dictionary of ItemIdVector of all unique items in the product catalog
    """
    dictOfItemIdVector = defaultdict(set)
    for index, row in item_prop.iterrows():
        dictOfItemIdVector[row['itemid']].add(row['property'])
        
    return dictOfItemIdVector
    

itemidVector = getItemIdPropVector(item_prop, 460429)
itemidVector

{'127',
 '202',
 '283',
 '364',
 '6',
 '776',
 '839',
 '884',
 '917',
 'available',
 'categoryid'}

In [32]:
dictOfItemIdVector = getDictOfItemIdVector(item_prop)

In [53]:
dictOfItemIdVector = defaultdict(list, ((k, list(v)) for k, v in dictOfItemIdVector.items()))
dictOfItemIdVector2  = dictOfItemIdVector.copy()

In [60]:
# with open('../output/dictOfItemVector.pickle', 'wb') as handle:
#     pickle.dump(dictOfItemIdVector, handle, protocol=pickle.HIGHEST_PROTOCOL)

def square_rooted(x):
 
    return round(np.sqrt(np.sum(np.dot(x,x))),3)
 
def cosine_similarity(x,y):
 
    numerator = np.sum(a*b for a,b in zip(x,y))
    denominator = square_rooted(x)*square_rooted(y)
    return round(numerator/float(denominator),3)

In [62]:
cosine_similarity(dictOfItemIdVector[0],dictOfItemIdVector[1])

TypeError: can't multiply sequence by non-int of type 'str'

In [63]:
np.dot(dictOfItemIdVector[0],dictOfItemIdVector[0])

ValueError: data type must provide an itemsize

In [73]:
# np.array(dictOfItemIdVector[0],dtype=float)
dictOfItemIdVector[0].remove("available")

In [74]:
dictOfItemIdVector[0]

['888',
 '112',
 '678',
 '1056',
 '776',
 '917',
 '189',
 '225',
 '227',
 '364',
 '283',
 '159',
 '6']