In [1]:
import numpy as np
from scipy import spatial
import time
import pandas as pd
import pickle
import collections
from collections import OrderedDict
from operator import itemgetter
from pyspark.sql.functions import udf,col,cos,sin
from pyspark.sql.types import *
from pyspark.sql.dataframe import *
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import monotonically_increasing_id


In [2]:
#show the numpy version loaded in this kernel (recorded for reproducibility)
np.__version__

'1.16.2'

In [3]:
#notebook input: the account number whose PCA row is looked up
#and for which products are recommended further down
cust_account_num = 8773100280335924

In [4]:
#known products and their PCA vectors - the master set the
#nearest-neighbour search is run against
obj_sql = "select * from advanlwork.rgx_knowns"
obj_ds = spark.sql(obj_sql)

In [5]:
#customer dimensions - the PCA values of the account we want recommendations for.
#the columns are aliased to pca1..pca13 so they line up with the known-products columns.
cust_sql = (
    'select pca_device as pca1,pca_viewing as pca2,pca_tickets as pca3,'
    'pca_truckroll as pca4,pca_spectra as pca5,pca_network as pca6,'
    'pca_costs as pca7,pca_ivr as pca8,pca_pscs as pca9,'
    'pca_consumer as pca10,pca_census as pca11,pca_email as pca12,'
    'pca_call as pca13 from jberry003.customer_pca where account='
    + str(cust_account_num)
)
cust_ds = spark.sql(cust_sql)

In [6]:
#collect the customer's PCA row(s) into a pandas frame
cust_ds_pd=cust_ds.toPandas()
#fail fast with a clear message: an empty frame would otherwise only surface
#later as an IndexError when the first query row of the neighbour result is read
if cust_ds_pd.empty:
    raise ValueError("no customer_pca row found for account " + str(cust_account_num))

In [7]:
#per-product weights (decided by the business) used to re-rank the neighbours' products
prod_sql = "select * from advanlwork.rgx_product"
prod_ds = spark.sql(prod_sql)

In [8]:
#collect the product weight table into pandas and preview the first rows
prod_ds_pd = prod_ds.toPandas()
prod_ds_pd.head()

Unnamed: 0,prod,w_stats,w_rev,w_season
0,BOXING EVENT,1.0,135.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,PRO WRESTLING EVENT,1.0,160.0,"[0.0, 0.0, 0.0, 2.4, 2.4, 0.0, 2.4, 0.0, 2.4, ..."
2,HISTORY VAULT,1.0,5.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,GAIAM TV FIT & YOGA,1.0,7.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,UP FAITH & FAMILY,1.0,6.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [9]:
#zero-based index of the current calendar month (January -> 0);
#used below to pick this month's entry from each product's w_season values
index_curr = pd.to_datetime('today').month - 1
index_curr

4

In [10]:
#seasonal weight for the current month: take element index_curr of each w_season entry
prod_ds_pd['w_season1'] = prod_ds_pd['w_season'].apply(itemgetter(index_curr))
#the full seasonal vector is not used again, so remove the column in place
del prod_ds_pd['w_season']
prod_ds_pd.head()

Unnamed: 0,prod,w_stats,w_rev,w_season1
0,BOXING EVENT,1.0,135.0,1.0
1,PRO WRESTLING EVENT,1.0,160.0,2.4
2,HISTORY VAULT,1.0,5.0,1.0
3,GAIAM TV FIT & YOGA,1.0,7.0,1.0
4,UP FAITH & FAMILY,1.0,6.0,1.0


In [11]:
#floor the seasonal weight at 1: anything that is not >= 1 (e.g. 0.0) is replaced by 1
prod_ds_pd['w_season1'] = prod_ds_pd['w_season1'].apply(
    lambda w: 1 if not (w >= 1) else w
)

#decision weight = stats weight * revenue weight * floored seasonal weight
prod_ds_pd['decision_weight'] = (
    prod_ds_pd['w_stats']
    * prod_ds_pd['w_rev']
    * prod_ds_pd['w_season1']
)

In [12]:
#product name -> decision weight lookup.
#zip pairs the two columns row by row, so it does not depend on the frame
#having a default 0..n-1 index the way a label lookup prod_ds_pd['prod'][i] does.
#if a product name appears more than once, the last row wins.
weight_dict = dict(zip(prod_ds_pd['prod'], prod_ds_pd['decision_weight']))


In [13]:
#collect the known-products table into pandas for the neighbour search
obj_ds_pd = obj_ds.toPandas()

In [14]:
#spread each customer_dims entry over the columns pca1..pca13
#(the assignment requires every entry to hold exactly 13 values)
obj_ds_pd[[
    'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
    'pca8', 'pca9', 'pca10', 'pca11', 'pca12', 'pca13'
]] = pd.DataFrame(
    obj_ds_pd.customer_dims.values.tolist(),
    index=obj_ds_pd.index
)

In [15]:
#the packed vector column is redundant once expanded into pca1..pca13; remove it in place
del obj_ds_pd['customer_dims']

In [16]:
#preview the known-products frame after the PCA columns were expanded
obj_ds_pd.head()

Unnamed: 0,account_number,prod,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,pca11,pca12,pca13
0,8993110620320548,BOXING EVENT,0.35336,0.004388,0.006781,0.0,0.240028,0.0,0.001009,0.00123,0.638791,0.0,0.731868,0.001205,0.000225
1,8155100350728428,BOXING EVENT,0.226638,0.224098,0.001349,0.0,0.159882,0.0,0.00322,0.000435,0.107812,0.0,0.389141,0.006321,0.000111
2,8499051000009476,BOXING EVENT,0.0,0.432565,0.004603,0.0,0.0,0.0,0.003728,0.002936,0.235647,0.22016,0.304523,0.0,0.000156
3,8155200510682273,BOXING EVENT,0.142542,0.216289,0.000302,0.0,0.102269,0.0,0.001537,0.000338,0.12614,0.0,0.24502,0.014248,2.8e-05
4,8155600230220983,BOXING EVENT,0.157175,0.392259,0.000194,0.0,0.102438,0.0,0.001557,0.0,0.564833,0.0,0.416608,0.017176,0.0


In [17]:
#row position in obj_ds_pd -> product name, used to translate nearest-neighbour
#results back to products. enumerate yields positions 0..n-1, which is how the
#rows of the array handed to the KD-tree are numbered, so the lookup does not
#depend on the frame's index labels.
d_prod_indexing = dict(enumerate(obj_ds_pd['prod']))

In [18]:
#spot check: product recorded for the first row
d_prod_indexing[0]

u'BOXING EVENT'

In [19]:
#function for frequency weight
def func_freq(inp, divisor=5.0):
    """Scale every count in ``inp`` by ``1 / divisor``.

    Parameters
    ----------
    inp : mapping
        Maps a key to a numeric count (this notebook passes a
        ``collections.Counter`` of product names).
    divisor : number, optional
        Value each count is divided by. Defaults to 5.0, the constant that
        was previously hard-coded in the body.

    Returns
    -------
    dict
        A new dict with the same keys and the values ``count / divisor``.
        ``inp`` itself is not modified.

    Raises
    ------
    ZeroDivisionError
        If ``divisor`` is 0 and ``inp`` is not empty.
    """
    #float() keeps this a true division under Python 2 even when both the
    #count and the divisor are ints
    scale = float(divisor)
    return {key: count / scale for key, count in inp.items()}

#weighted results, sorted and restricted to the top n
def weighted_fetch(dict1,dict2,limit):
    """Rank the keys of ``dict1`` by weighted score and return the best ones.

    The score of a key is ``dict2[key] * dict1[key]``. Keys are returned in
    descending score order; keys with equal scores keep their relative
    iteration order from ``dict1``.

    Parameters
    ----------
    dict1 : mapping
        Key -> value to be weighted (here: product -> frequency weight).
    dict2 : mapping
        Key -> weight. Must contain every key of ``dict1``.
    limit : int
        Maximum number of keys to return.

    Returns
    -------
    list
        At most ``limit`` keys of ``dict1``, highest score first.

    Raises
    ------
    KeyError
        If a key of ``dict1`` is missing from ``dict2``.
    """
    scores = {}
    for key, value in dict1.items():
        scores[key] = dict2[key] * value
    #sorted() is stable, also with reverse=True, so ties keep insertion order
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:limit]

In [20]:
#build the KD-tree over the 13 PCA columns of the known products and time the build
start = time.time()
obj = np.array(obj_ds_pd[[
    'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
    'pca8', 'pca9', 'pca10', 'pca11', 'pca12', 'pca13'
]])
tree = spatial.cKDTree(obj, leafsize=4)
#elapsed time is reported in minutes, rounded to a whole number
print("Done! Overall Took " + str(round((time.time() - start) / 60)) + " minutes")

Done! Overall Took 0.0 minutes


In [21]:
#query the 100 nearest known rows for each customer row;
#the result is a (distances, indices) pair, and the indices are rows of the tree's data
nneib = tree.query(cust_ds_pd, k=100)

In [22]:
#spot check: row index of the first returned neighbour for the first query row
nneib[1][0][0]

12706

In [23]:
#recommendation pipeline for the first (only expected) customer row:
#  1. translate the neighbour row indices to product names and count them
#  2. turn the counts into frequency weights with func_freq
#  3. multiply by the business decision weights and keep at most 100 products, best first
recommended_prods = weighted_fetch(
    func_freq(
        collections.Counter([d_prod_indexing[i] for i in nneib[1][0]])
    ),
    weight_dict,
    100
)
recommended_prods

[u'NHL CENTER ICE',
 u'MLB EXTRA INNINGS',
 u'PRO WRESTLING EVENT',
 u'BOXING EVENT',
 u'XFI PODS',
 u'NBA LEAGUE PASS',
 u'STINGRAY KARAOKE',
 u'HISTORY VAULT',
 u'GAIAM TV FIT & YOGA',
 u'GAIA',
 u'ACORN SVOD',
 u'DOGTV',
 u'UP FAITH & FAMILY',
 u'LIFETIME MOVIE CLUB',
 u'AMC PREMIERE SVOD',
 u'MLS DIRECT KICK',
 u'FX+ SVOD',
 u'DISNEY FAMILY MOVIES',
 u'PANTAYA SVOD',
 u'EROS NOW',
 u'URBAN MOVIE CHANNEL SVOD']