#  <center> Exploratory Data Analysis </center>

## Importing necessary modules for data analysis

In [33]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pickle

## 1) Importing events dataset

In [3]:
# Load the raw event log from its CSV export (relative path).
events = pd.read_csv("../data/events.csv")
# Preview the first rows; the columns shown are timestamp, visitorid, event,
# itemid and transactionid.
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
# Print the row count, then show summary statistics for the events dataset.
# describe() with no arguments summarises the numeric columns only, so the
# string-valued `event` column does not appear in this table.
print ("No. of rows in events dataset : {0}".format(len(events)))
events.describe()

No. of rows in events dataset : 2756101


Unnamed: 0,timestamp,visitorid,itemid,transactionid
count,2756101.0,2756101.0,2756101.0,22457.0
mean,1436424000000.0,701922.9,234922.5,8826.497796
std,3366312000.0,405687.5,134195.4,5098.99629
min,1430622000000.0,0.0,3.0,0.0
25%,1433478000000.0,350566.0,118120.0,4411.0
50%,1436453000000.0,702060.0,236067.0,8813.0
75%,1439225000000.0,1053437.0,350715.0,13224.0
max,1442545000000.0,1407579.0,466867.0,17671.0


In [5]:
# Summary statistics (count / unique / top / freq) for the object-dtype columns.
# `np.object` was only an alias of the builtin `object`; the alias was deprecated
# in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
events.describe(include=[object])

Unnamed: 0,event
count,2756101
unique,3
top,view
freq,2664312


## 2) Importing item_properties dataset

In [89]:
# Load part 1 of the item-properties export and report how many rows it has.
item_prop = pd.read_csv("../data/item_properties_part1.csv")
print ("No. of rows in item_prop : {0}".format(len(item_prop)))
# Preview: `property` mixes names such as "categoryid" with numeric codes.
item_prop.head()

No. of rows in item_prop : 10999999


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


## 3) Formatting property feature to numeric type

In [90]:
# Encode the two named properties as numbers: categoryid -> 0, available -> 1
def preProcessDF(item_prop):
    """
    Make the 'property' column of item_prop fully numeric.

    Given   : item_prop as pandas dataframe with a 'property' column
    Return  : the same dataframe object, with 'property' converted in place

    The labels "categoryid" and "available" are swapped for 0 and 1, after
    which the whole column is parsed with pd.to_numeric.
    """
    label_codes = (("categoryid", 0), ("available", 1))
    for label, code in label_codes:
        item_prop['property'] = item_prop['property'].replace([label], code)

    numeric_property = pd.to_numeric(item_prop['property'].values)
    item_prop['property'] = numeric_property
    return item_prop

# preProcessDF assigns into the frame it is given, so it is handed a copy; the
# name `item_prop` is then rebound to the processed result.
item_prop = preProcessDF(item_prop.copy())
item_prop.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,0,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


## 4) Vectorizing itemid feature from item_prop dataFrame

In [101]:
def getDictOfItemIdVector(item_prop):
    """
    Collect, for every item, the set of property values recorded for it.

    Given   : item_prop as pandas dataframe with 'itemid' and 'property' columns
    Return  : defaultdict(set) mapping each itemid to the set of its
              'property' values, i.e. the ItemIdVector of every unique item
              in the product catalog
    """
    dictOfItemIdVector = defaultdict(set)
    # Walk only the two needed columns in lock-step. DataFrame.iterrows() builds
    # a full Series object for every row, which is far slower on a frame with
    # millions of rows and yields exactly the same (itemid, property) pairs.
    for itemid, prop in zip(item_prop['itemid'], item_prop['property']):
        dictOfItemIdVector[itemid].add(prop)

    return dictOfItemIdVector
    
def writeDictToFile(dictToFile, filename):
    """
    Serialize dictToFile to filename with pickle, using the highest protocol.

    Given   : dictToFile - picklable object to store
              filename   - path of the file to create or overwrite
    Return  : None
    """
    with open(filename, 'wb') as handle:
        # pickle.dump requires the open file object as its second argument; the
        # call previously omitted it and raised TypeError without writing data.
        pickle.dump(dictToFile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
# dictOfItemIdVector = getDictOfItemIdVector(item_prop)
# dictOfItemIdVector = defaultdict(list, ((k, list(v)) for k, v in dictOfItemIdVector.items()))
# dictOfItemIdVector2  = dictOfItemIdVector.copy()
# writeDictToFile(dictOfItemIdVector, "../output/dictOfItemVector2.pickle")

## 5) Compute cosine similarity of 2 vectors

In [111]:
def square_rooted(x):
    """
    Return the Euclidean norm of x, rounded to 3 decimal places.

    Given   : x as a 1-D sequence of numbers (list, tuple or numpy array)
    Return  : sqrt(sum of squares of x), rounded to 3 decimals
    """
    # For a 1-D input np.dot(x, x) is already the sum of squares; np.sum leaves
    # that scalar unchanged.
    return round(np.sqrt(np.sum(np.dot(x, x))), 3)


def cosine_similarity(x, y):
    """
    Return the cosine similarity of vectors x and y, rounded to 3 decimals.

    Given   : x, y as 1-D sequences of numbers
    Return  : dot(x, y) / (|x| * |y|) rounded to 3 decimals, or 0.0 when either
              norm is zero (the similarity is undefined there; previously this
              case divided by zero)

    Note    : zip() pairs elements by position and stops at the shorter input,
              so for inputs of different lengths the dot product only covers
              the common prefix while each norm uses the full vector.
    """
    # Builtin sum: passing a generator to np.sum is deprecated in NumPy and only
    # worked by falling back to this same builtin.
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    if denominator == 0:
        return 0.0
    return round(numerator / float(denominator), 3)

# NOTE(review): `dictOfItemIdVector` must already exist in the kernel; the only
# assignments to it visible in this notebook are commented out in an earlier cell.
cosine_similarity(dictOfItemIdVector[0], dictOfItemIdVector[3])

1.0

## 6) Build userDict from events DF

In [132]:
# Scratch check of the nested structure: outer key -> event name -> list.
# The lambda gives every missing outer key its own fresh defaultdict(list).
userDict = defaultdict(lambda : defaultdict(list))

In [135]:
# Toy inserts to try the structure out. NOTE: this cell is not idempotent;
# every run appends again. The recorded display of userDict shows
# 'view': [1, 1, 2], consistent with an earlier run having appended a 1 already.
userDict[1]['view'].append(1)
userDict[1]['view'].append(2)
userDict[1]['addtocart'].append(100)

In [136]:
# Display the current contents of the scratch userDict.
userDict

defaultdict(<function __main__.<lambda>>,
            {1: defaultdict(list, {'addtocart': [100], 'view': [1, 1, 2]})})

In [142]:
def getDictOfUser(events):
    """
    Build per-visitor lists of viewed and added-to-cart items.

    Given   : events as pandas dataframe with 'visitorid', 'event' and
              'itemid' columns
    Return  : dictionary of all users - This can be treated as userProfiles.
              It is a defaultdict mapping visitorid to a defaultdict(list)
              whose 'view' and 'addtocart' keys hold the item ids of those
              events in row order. Rows with any other event value are
              ignored, so visitors with only such rows are absent.
    """
    tracked_events = ('view', 'addtocart')
    userDict = defaultdict(lambda: defaultdict(list))
    # Iterate the three needed columns directly. DataFrame.iterrows() creates a
    # Series per row and is too slow for an event log with millions of rows.
    for visitorid, event, itemid in zip(events['visitorid'],
                                        events['event'],
                                        events['itemid']):
        if event in tracked_events:
            userDict[visitorid][event].append(itemid)

    # userDict is local to this call, so it is returned as-is; the former
    # shallow .copy() produced an equivalent object at extra cost.
    return userDict

# Build the user profiles for the full events frame (rebinds `userDict`).
# NOTE: the recorded output of this cell is a KeyboardInterrupt, i.e. this run
# was stopped before it finished.
userDict = getDictOfUser(events)

KeyboardInterrupt: 

In [139]:
# Show the first rows of `events` again.
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
