In [1]:
import pandas as pd
from numpy import array
import datetime as dt

from pyspark.sql import SQLContext
from pyspark.mllib.clustering import KMeans, KMeansModel

import utils
%matplotlib inline

# Step 1: Attribute Selection

Read the contents of `users.csv` and calculate user age based on dob

In [2]:
gameClicksDF = pd.read_csv('./flamingo-data/game-clicks.csv')
gameClicksDF = gameClicksDF.rename(columns=lambda x: x.strip())
# select useful features
selectGameClicksDF = gameClicksDF[['userId', 'userSessionId', 'teamId', 'teamLevel', 'isHit']]
# aggregate to get hit count
hitsPerUserSession = selectGameClicksDF.groupby(['userId','userSessionId', 'teamId', 'teamLevel']).sum()
hitsPerUserSession = hitsPerUserSession.reset_index()
hitsPerUserSession.columns = ['userId','userSessionId', 'teamId', 'teamLevel', 'hitCounts'] #rename the columns

hitsPerUserSession.sort(['userId'])





Unnamed: 0,userId,userSessionId,teamId,teamLevel,hitCounts
0,0,23473,157,1,28
1,0,24943,157,2,35
2,0,28377,157,3,32
3,0,32029,157,4,29
4,0,37439,157,5,19
12,1,34802,99,8,12
11,1,26938,99,7,16
10,1,21014,99,6,11
9,1,16232,99,5,13
7,1,10041,99,3,9


Read the contents of the file `buy-clicks.csv` and removes any extra whitespaces.

In [3]:
buyclicksDF = pd.read_csv('./flamingo-data/buy-clicks.csv')
buyclicksDF = buyclicksDF.rename(columns=lambda x: x.strip()) #removes whitespaces from headers

# select useful fields
selectBuyclicksDF = buyclicksDF[['userId', 'userSessionId', 'team', 'price']]
selectBuyclicksDF = selectBuyclicksDF.rename(columns = {'team':'teamId'})
# selectBuyclicksDF.sort(['userId', 'userSessionId'])

# aggregate to get average buys per user
buysPerUserSession = selectBuyclicksDF.groupby(['userId', 'userSessionId', 'teamId']).mean()
buysPerUserSession = buysPerUserSession.reset_index()
buysPerUserSession.columns = ['userId', 'userSessionId', 'teamId', 'avgPrice']
buysPerUserSession.sort(['userId', 'userSessionId'])




Unnamed: 0,userId,userSessionId,teamId,avgPrice
0,1,10041,99,3.000000
1,1,12713,99,2.000000
2,1,21014,99,2.500000
3,1,26938,99,2.000000
4,1,34802,99,2.000000
5,8,27918,124,11.000000
6,8,33190,124,10.000000
7,9,9187,97,10.000000
8,9,12710,97,10.000000
9,9,16228,97,15.000000


Join buying user age to user session df

In [4]:
combinedDF = pd.merge(hitsPerUserSession, buysPerUserSession, on = ['userId', 'userSessionId', 'teamId'])
combinedDF.sort(['userId', 'userSessionId'])


  from ipykernel import kernelapp as app


Unnamed: 0,userId,userSessionId,teamId,teamLevel,hitCounts,avgPrice
0,1,10041,99,3,9,3.000000
1,1,12713,99,4,14,2.000000
2,1,21014,99,6,11,2.500000
3,1,26938,99,7,16,2.000000
4,1,34802,99,8,12,2.000000
5,8,27918,124,5,23,11.000000
6,8,33190,124,6,14,10.000000
7,9,9187,97,2,1,10.000000
8,9,12710,97,4,11,10.000000
9,9,16228,97,5,8,15.000000


Filter out NaN rows and select features (age, totalAdClicks, totalPrice)

In [5]:
cleanedDF = combinedDF.dropna()
trainingDF = cleanedDF[['teamLevel', 'hitCounts', 'avgPrice']]
trainingDF.head(5)

Unnamed: 0,teamLevel,hitCounts,avgPrice
0,3,9,3.0
1,4,14,2.0
2,6,11,2.5
3,7,16,2.0
4,8,12,2.0


In [6]:
trainingDF.shape

(1749, 3)

In [7]:
sqlContext = SQLContext(sc)
pDF = sqlContext.createDataFrame(trainingDF)
parsedData = pDF.rdd.map(lambda line: array([line[0], line[1], line[2]])) #'age', 'totalAdClicks','totalPrice'

In [8]:
pDF.describe().toPandas()

Unnamed: 0,summary,teamLevel,hitCounts,avgPrice
0,count,1749.0,1749.0,1749.0
1,mean,5.35620354488279,15.580903373356204,7.281170192490949
2,stddev,1.9756940082869676,12.889155193769644,6.512146813285836
3,min,1.0,0.0,1.0
4,max,8.0,100.0,20.0


In [17]:
my_kmmodel = KMeans.train(parsedData, 10, maxIterations=10, runs=10, initializationMode="random", seed = 1234)

  "Support for runs is deprecated in 1.6.0. This param will have no effect in 1.7.0.")


In [18]:
centers = my_kmmodel.centers
centers

[array([  5.44983819,  10.82524272,   2.6639698 ]),
 array([ 5.45508982,  5.45508982,  2.64171657]),
 array([  5.30088496,   5.65486726,  10.86946903]),
 array([  4.2920354 ,  41.98230088,   6.85412979]),
 array([  5.11805556,  26.9375    ,   4.78043981]),
 array([  3.        ,  70.94285714,   6.32857143]),
 array([  6.08152174,  14.01086957,  11.36766304]),
 array([  5.29752066,  21.09917355,  18.42355372]),
 array([  5.41509434,   8.0754717 ,  19.53092243]),
 array([  5.54852321,  16.55696203,   2.91265823])]