#  <center> Exploratory Data Analysis </center>

## Importing necessary modules for data analysis

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pickle

## 1) Importing events dataset

In [2]:
# Reading event datasets 
events = pd.read_csv("../data/events.csv")
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [3]:
# Describing quantitative features of events dataset
print ("No. of rows in events dataset : {0}".format(len(events)))
events.describe()

No. of rows in events dataset : 2756101


Unnamed: 0,timestamp,visitorid,itemid,transactionid
count,2756101.0,2756101.0,2756101.0,22457.0
mean,1436424000000.0,701922.9,234922.5,8826.497796
std,3366312000.0,405687.5,134195.4,5098.99629
min,1430622000000.0,0.0,3.0,0.0
25%,1433478000000.0,350566.0,118120.0,4411.0
50%,1436453000000.0,702060.0,236067.0,8813.0
75%,1439225000000.0,1053437.0,350715.0,13224.0
max,1442545000000.0,1407579.0,466867.0,17671.0


In [4]:
# statistics for categorical variables
events.describe(include=[np.object])

Unnamed: 0,event
count,2756101
unique,3
top,view
freq,2664312


## 2) Importing item_properites dataset

In [5]:
item_prop = pd.read_csv("../data/item_properties_part1.csv")
print ("No. of rows in item_prop : {0}".format(len(item_prop)))
item_prop.head()

No. of rows in item_prop : 10999999


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


## 3) Formatting property feature to numeric type

In [6]:
# replace categoryid with 0, and available with 1
def preProcessDF(item_prop):
    item_prop['property'] = item_prop['property'].replace(["categoryid"], 0)
    item_prop['property'] = item_prop['property'].replace(["available"], 1)
    item_prop['property'] = pd.to_numeric(item_prop.property.values)
    return item_prop

item_prop = preProcessDF(item_prop.copy())
item_prop.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,0,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


## 4) Vectorizing itemid feature from item_prop dataFrame

In [10]:
def getDictOfItemIdVector(item_prop):
    """
    Given   : item_prop as pandas dataframe
    Return  : dictionary of ItemIdVector of all unique items in the product catalog
    """
    dictOfItemIdVector = defaultdict(set)
    for index, row in item_prop.iterrows():
        dictOfItemIdVector[row['itemid']].add(row['property'])
        
    return dictOfItemIdVector
    
def writeDictToFile(dictToFile, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(dictToFile, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def readDictFromFile(filename):
    with open(filename, 'rb') as handle:
        dictToFile = pickle.load(handle)
    return dictToFile
    
    
# dictOfItemIdVector = getDictOfItemIdVector(item_prop)
# dictOfItemIdVector = defaultdict(list, ((k, list(v)) for k, v in dictOfItemIdVector.items()))
# dictOfItemIdVector2  = dictOfItemIdVector.copy()
# writeDictToFile(dictOfItemIdVector, "../output/dictOfItemVector2.pickle")


# read dict from file
dictOfItemIdVector = readDictFromFile("../output/dictOfItemVector.pickle")

## 5) Compute Cosine similarity of 2 vectors

In [12]:
def square_rooted(x):
 
    return round(np.sqrt(np.sum(np.dot(x,x))),3)
 
def cosine_similarity(x,y):
 
    numerator = np.sum(a*b for a,b in zip(x,y))
    denominator = square_rooted(x)*square_rooted(y)
    return round(numerator/float(denominator),3)

cosine_similarity(dictOfItemIdVector[0], dictOfItemIdVector[0])

1.0

## 6) Build userDict from events DF

In [24]:
def getDictOfUser(events):
    """
    Given   : events as pandas dataframe
    Return  : dictionary of all users- This can be treated as userProfiles
    """
    userDict = defaultdict(lambda : defaultdict(list))
    for index, row in events.iterrows():
        if row['event']=='view':
            userDict[row['visitorid']]['view'].append(row['itemid'])
        if row['event']=='addtocart':
            userDict[row['visitorid']]['addtocart'].append(row['itemid'])
            
    return userDict.copy()

userDict = getDictOfUser(events)

In [36]:
userDict

defaultdict(<function __main__.getDictOfUser.<locals>.<lambda>>,
            {0: defaultdict(list, {'view': [285930, 357564, 67045]}),
             1: defaultdict(list, {'view': [72028]}),
             2: defaultdict(list,
                         {'view': [216305,
                           325215,
                           342816,
                           325215,
                           342816,
                           259884,
                           216305,
                           325215]}),
             3: defaultdict(list, {'view': [385090]}),
             4: defaultdict(list, {'view': [177677]}),
             5: defaultdict(list, {'view': [61396]}),
             6: defaultdict(list,
                         {'addtocart': [65273],
                          'view': [344723, 344723, 253615, 344723, 344723]}),
             7: defaultdict(list, {'view': [164941, 139394, 226353]}),
             8: defaultdict(list, {'view': [434230]}),
             9: defaultdict(list, {'

In [81]:
userDict2 = userDict.copy()
userDictTemp = defaultdict(lambda : defaultdict(list))
for key, value in userDict.items():
    if 'addtocart' in value and 'view' in value:
        userDictTemp[key] = value

userDict = userDictTemp.copy()
userDict      

defaultdict(<function __main__.<lambda>>,
            {786432: defaultdict(list,
                         {'addtocart': [325585],
                          'view': [58086, 325585, 58086, 325585]}),
             1179650: defaultdict(list,
                         {'addtocart': [169203, 169203, 351154], 'view': []}),
             917508: defaultdict(list,
                         {'addtocart': [218033],
                          'view': [218033, 218033, 22839, 218033]}),
             6: defaultdict(list,
                         {'addtocart': [65273],
                          'view': [344723, 344723, 253615, 344723, 344723]}),
             1310734: defaultdict(list,
                         {'addtocart': [435940], 'view': [390399, 443094]}),
             131092: defaultdict(list,
                         {'addtocart': [262826], 'view': [262826]}),
             393237: defaultdict(list,
                         {'addtocart': [218794, 36972, 461686],
                          'view': [369

In [82]:
def calAvgItemViewLen(userDict):
    viewLenList = []
    for key, value in userDict.items():
          viewLenList.append(len(userDict[key]["view"]))
            
    return viewLenList[:]
            
viewLenList = calAvgItemViewLen(userDict)

## Create empty DataFrame

In [92]:
userDF = pd.DataFrame(columns=['userid', 'view1', 'view2', 'view3','addtocart'])
userDF.head()

Unnamed: 0,userid,view1,view2,view3,addtocart


In [98]:
userDict[524440]

defaultdict(list,
            {'addtocart': [394217], 'view': [394217, 394217, 394217, 394217]})

In [249]:
no_of_features = 4
def addEntrytoUserDF(userid, viewList, addToCart):
#     print ("Inside addEntryUserDF. Value of viewList : {0}".format(viewList))
    tempdict = {}
    tempdict['userid'] = userid
    for i in range(no_of_features):
        tempdict["view"+str(i+1)] = viewList[i]
#     tempdict['view1'] = viewList[0]
#     tempdict['view2'] = viewList[1]
#     tempdict['view3'] = viewList[2]
    tempdict['addtocart'] = addToCart
    
    return tempdict.copy()

def addEntrytoUserDFforMultipleAddtoCart(userDF, userid, viewList, addToCartList):
    for addToCart in addToCartList:
        new_entry = addEntrytoUserDF(userid, viewList, addToCart)
        userDF = userDF.append(new_entry, ignore_index=True)
    
    return userDF.copy()

def addEntrytoUserDFforMultipleView(userDF, userid, viewList, addToCartList):
    remainder = int(len(viewList)%no_of_features)
    if not remainder == 0:
        viewList.extend(np.zeros(no_of_features - remainder))
#     print ("ViewList extented : {0}".format(viewList))
    start = 0
    end = no_of_features-1
    while end<=len(viewList):        
        userDF = addEntrytoUserDFforMultipleAddtoCart(userDF, userid, viewList[start:end+1], addToCartList)
#         print ("start : {0}, end : {1}".format(start,end))
        start = end+1
        end = end+no_of_features
    
    return userDF.copy()
        
def buildMatrix(userDict):
    columns=[]
    columns.append("userid")
    for i in range(no_of_features):
        columns.append("view" + str(i+1))
    columns.append("addtocart")
    userDF = pd.DataFrame(columns=columns)
    c= 0
    for key, value in userDict.items():
#         print (userDict[key])
        userDF = addEntrytoUserDFforMultipleView(userDF, key, userDict[key]['view'].copy(), userDict[key]['addtocart'].copy())
#         print (userDF)
    return userDF
        
        
userDF

Unnamed: 0,userid,view1,view2,view3,view4,addtocart


In [None]:
userDF_2 = buildMatrix(userDict)
userDF_2

In [246]:
addEntrytoUserDFforMultipleView(userDF, 1179650, [344723, 344723, 253615, 344723, 1111, 123123], [123])

Unnamed: 0,userid,view1,view2,view3,view4,addtocart
0,1179650.0,344723,344723,253615,344723,123
1,1179650.0,1111,123123,0,0,123


In [187]:
l=[452955, 452955, 186933, 123, 1234, 12345]
print (len(l))
remainder = int(len(l)%3)
print (remainder)
print (np.zeros(remainder))
l.extend(np.zeros(remainder))
l

6
0
[]


[452955, 452955, 186933, 123, 1234, 12345]

In [206]:
userDF

Unnamed: 0,userid,view1,view2,view3,addtocart


In [191]:
l[3:4]

[123]