In [None]:
import YahooNewsDataExtraction as tool
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import pickle

In [None]:
# Fraction of the combined records held out as the validation set; passed as
# `test_size` to train_test_split and embedded in every output filename.
train_val_split = 0.5
# Minimum per-(user type, ad type) record count used to filter user types;
# also embedded in every output filename.
threshold = 50

# Training set Processing

In [None]:
# Read in train data (days 1-5), stacked row-wise into one array.
# Col0 - Col4 are features, Col5 is adID, Col6 is click binary indicator.
train_records = np.vstack([np.load('day' + str(day) + '_records.npy') for day in range(1, 6)])

# Read in the per-day ads dictionaries and merge them into one; on duplicate
# keys the later day overwrites the earlier one (dict.update semantics).
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
train_adsDict = dict()
for day in range(1, 6):
    # Context manager guarantees the file handle is closed after loading.
    with open('day' + str(day) + '_adsDict.p', 'rb') as ads_file:
        train_adsDict.update(pickle.load(ads_file))

# Add user type to each user interaction record.
# The fitted user-clustering model is loaded from disk and its labels_ are
# appended as an extra column (column 7), so they must have one entry per row
# of train_records (np.hstack raises otherwise).
filename = 'train_userCluster.p'
with open(filename, 'rb') as cluster_file:
    user_kmeans = pickle.load(cluster_file)
train_recordsDF = pd.DataFrame(
    np.hstack([train_records, user_kmeans.labels_.reshape(-1, 1)])
).rename(columns={5: 'adID', 6: 'click', 7: 'userType'})

In [None]:
# Cluster articles into different types.
train_adsDF = pd.DataFrame(train_adsDict).T
# Fit on exactly the NaN-free rows whose index is zipped with the labels
# below, so each label is guaranteed to belong to the ad ID it is paired with
# (KMeans cannot be fitted on rows containing NaN).
train_adsDF_clean = train_adsDF.dropna()
n_adsClusters = 7
ads_kmeans = KMeans(n_clusters=n_adsClusters, random_state=300)
ads_kmeans.fit(train_adsDF_clean)
# Mapping: ad ID -> ad cluster label.
train_adsType = dict(zip(train_adsDF_clean.index, ads_kmeans.labels_))

# Persist the fitted ad-clustering model; the test-set section reloads it
# from this same filename.
filename = 'train_adsCluster_' + str(train_val_split * 100) + '%' + '_' + str(threshold) + '.p'
with open(filename, 'wb') as model_file:
    pickle.dump(ads_kmeans, model_file)

In [None]:
# Add ad type to each user interaction record.
# Look up each record's adID in the ad ID -> cluster mapping; IDs missing from
# the mapping become NaN and those rows are removed by the final dropna().
ads_type_per_record = train_recordsDF['adID'].map(train_adsType).rename('adsType')
train_validation_recordDFwType = pd.concat(
    [train_recordsDF, ads_type_per_record],
    axis=1,
).dropna()


In [None]:
# Split train data into training and validation sets. train_val_split is the
# fraction of rows assigned to the validation set (passed as test_size).
from sklearn.model_selection import train_test_split
train_recordDFwType, validation_recordDFwType = train_test_split(
    train_validation_recordDFwType,
    test_size=train_val_split,
    random_state=42,
)
# Later cells assign an 'adsType' column on both frames. Take explicit copies
# so those assignments act on independent frames and do not trigger pandas'
# SettingWithCopyWarning.
train_recordDFwType = train_recordDFwType.copy()
validation_recordDFwType = validation_recordDFwType.copy()

In [None]:
# Calculate training set click probability for each pair of user type and
# article type in training set.
# Rebind via assign() rather than setting the column in place: the frame is an
# output of train_test_split, and in-place assignment on it can raise
# SettingWithCopyWarning.
train_recordDFwType = train_recordDFwType.assign(
    adsType=train_recordDFwType['adsType'].astype(int)
)
# Passing a renaming dict to SeriesGroupBy.agg raises SpecificationError on
# pandas >= 1.0; aggregate by function name and rename the resulting columns
# instead, which yields the same 'clickProb' / 'n_obs' columns.
train_Y_clickProb = (
    train_recordDFwType[['click', 'userType', 'adsType']]
    .groupby(['userType', 'adsType'])['click']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'clickProb', 'count': 'n_obs'})
)

In [None]:
# Drop article type 3 and keep only the user types whose click probability for
# every remaining article type was calculated from at least `threshold`
# interaction records. (Why type 3 is excluded is not recorded in this
# notebook.)
#
# fill_value=0 matters: a (user type, article type) pair with no records is
# absent from the grouped result, and a plain unstack() would turn it into NaN,
# which min(axis=1) silently skips - letting a user type with zero records for
# some article type pass the filter.
train_tmp = (
    train_Y_clickProb['n_obs']
    .unstack(fill_value=0)
    .drop([3], axis=1)
    .min(axis=1)
)
train_tmp_index = train_tmp[train_tmp >= threshold].index

In [None]:
# Suffix shared by every output file of this run ('50.0%_50' for the values
# 0.5 and 50).
file_suffix = str(train_val_split * 100) + '%' + '_' + str(threshold)

# Click-probability matrix: one row per retained user type, one column per
# article type except type 3.
filename = 'filtered_train_clickprob_' + file_suffix + '.npy'
np.save(filename, train_Y_clickProb['clickProb'].unstack().loc[train_tmp_index].drop([3], axis=1).values)

# Total number of training records for each retained user type.
filename = 'filtered_train_usernumobserv_' + file_suffix + '.npy'
np.save(filename, train_recordDFwType.groupby('userType').size().loc[train_tmp_index].values)

# Mean of the remaining columns per user type (adID, click and adsType are
# dropped after averaging).
train_X_user = train_recordDFwType.groupby(['userType']).mean().drop(['adID', 'click', 'adsType'], axis=1)
filename = 'filtered_train_userFeat_' + file_suffix + '.npy'
np.save(filename, train_X_user.loc[train_tmp_index].values)

# Mapping user type -> mean feature vector for ALL user types (not only the
# retained ones). NOTE: this file is a pickle despite its '.npy' extension;
# the name is kept because later cells load it by this exact name.
filename = 'filtered_train_featMap_' + file_suffix + '.npy'
with open(filename, 'wb') as featmap_file:
    # Context manager ensures the pickle is flushed and closed before the
    # validation/test cells read it back.
    pickle.dump(dict(zip(list(train_X_user.index), train_X_user.values)), featmap_file)


# Validation set Processing

In [None]:
# Reload the user type -> mean feature vector mapping written by the training
# section (a pickle file, despite the '.npy' extension).
filename = 'filtered_train_featMap_' + str(train_val_split * 100) + '%' + '_' + str(threshold) + '.npy'
with open(filename, 'rb') as featmap_file:
    train_featMap = pickle.load(featmap_file)
# Re-derive the ad type of each validation record from the training ad
# clusters. assign() rebinds to a new frame instead of mutating an output of
# train_test_split in place, which can raise SettingWithCopyWarning.
validation_recordDFwType = validation_recordDFwType.assign(
    adsType=validation_recordDFwType['adID'].map(train_adsType)
)

In [None]:
# Calculate click probability and record count for each pair of user type and
# article type in the validation set.
# Passing a renaming dict to SeriesGroupBy.agg raises SpecificationError on
# pandas >= 1.0; aggregate by function name and rename the resulting columns
# instead, which yields the same 'clickProb' / 'n_obs' columns.
validation_Y_clickProb = (
    validation_recordDFwType[['click', 'userType', 'adsType']]
    .groupby(['userType', 'adsType'])['click']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'clickProb', 'count': 'n_obs'})
)

In [None]:
# Drop article type 3 and keep only the user types whose click probability for
# every remaining article type was calculated from at least `threshold`
# interaction records.
#
# fill_value=0 matters: a (user type, article type) pair with no records would
# otherwise become NaN after unstack(), and min(axis=1) skips NaN - letting a
# user type with zero records for some article type pass the filter.
validation_tmp = (
    validation_Y_clickProb['n_obs']
    .unstack(fill_value=0)
    .drop([3], axis=1)
    .min(axis=1)
)
validation_tmp_index = validation_tmp[validation_tmp >= threshold].index

In [None]:
# Suffix shared by the three validation output files.
file_suffix = str(train_val_split * 100) + '%' + '_' + str(threshold)

# Click-probability matrix: retained user types x article types (type 3 removed).
validation_clickProb_matrix = (
    validation_Y_clickProb['clickProb']
    .unstack()
    .loc[validation_tmp_index]
    .drop([3], axis=1)
)
filename = 'filtered_validation_clickprob_' + file_suffix + '.npy'
np.save(filename, validation_clickProb_matrix.values)

# Number of validation records for each retained user type.
validation_user_counts = validation_recordDFwType.groupby('userType').size()
filename = 'filtered_validation_usernumobserv_' + file_suffix + '.npy'
np.save(filename, validation_user_counts.loc[validation_tmp_index].values)

# User features come from the TRAINING feature map, looked up by user type,
# stacked in the same order as the retained validation user types.
validation_X_user = np.vstack(
    [train_featMap[user_type] for user_type in list(validation_tmp_index)]
)
filename = 'filtered_validation_userFeat_' + file_suffix + '.npy'
np.save(filename, validation_X_user)

# Test set Processing

In [None]:
# Read in test data (days 6-10), stacked row-wise into one array.
# Col0 - Col4 are features, Col5 is adID, Col6 is click binary indicator.
test_records = np.vstack([np.load('day' + str(day) + '_records.npy') for day in range(6, 11)])

# Read in the per-day ads dictionaries and merge them into one; on duplicate
# keys the later day overwrites the earlier one (dict.update semantics).
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
test_adsDict = dict()
for day in range(6, 11):
    # Context manager guarantees the file handle is closed after loading.
    with open('day' + str(day) + '_adsDict.p', 'rb') as ads_file:
        test_adsDict.update(pickle.load(ads_file))

# User-clustering model loaded from disk.
filename = 'train_userCluster.p'
with open(filename, 'rb') as cluster_file:
    user_kmeans = pickle.load(cluster_file)

# Precomputed user-type array for the test records, loaded from disk.
filename = 'test_userTypePredictions.npy'
test_usertype_predictions = np.load(filename)

# Ad-clustering model written by the training section.
filename = 'train_adsCluster_' + str(train_val_split * 100) + '%' + '_' + str(threshold) + '.p'
with open(filename, 'rb') as model_file:
    ads_kmeans = pickle.load(model_file)

# User type -> mean feature vector mapping written by the training section
# (a pickle file, despite the '.npy' extension).
filename = 'filtered_train_featMap_' + str(train_val_split * 100) + '%' + '_' + str(threshold) + '.npy'
with open(filename, 'rb') as featmap_file:
    train_featMap = pickle.load(featmap_file)

In [None]:
test_adsDF = pd.DataFrame(test_adsDict).T
# Mirror the training path, which keys ad types on the NaN-free ads only:
# ads with missing features are excluded before prediction (KMeans.predict
# cannot handle NaN). Records pointing at an excluded ad get a NaN adsType
# from the map() below.
test_adsDF_clean = test_adsDF.dropna()

# Append the user-type array as column 7 of the test records.
test_recordDFwType = pd.DataFrame(
    np.hstack([test_records, test_usertype_predictions.reshape(-1, 1)])
).rename(columns={5: 'adID', 6: 'click', 7: 'userType'})

# Assign each test ad to one of the ad clusters fitted on the training ads.
test_adsType = dict(zip(test_adsDF_clean.index, ads_kmeans.predict(test_adsDF_clean)))
test_recordDFwType['adsType'] = test_recordDFwType['adID'].map(test_adsType)

In [None]:
# Calculate click probability and record count for each pair of user type and
# article type in the test set.
# Passing a renaming dict to SeriesGroupBy.agg raises SpecificationError on
# pandas >= 1.0; aggregate by function name and rename the resulting columns
# instead, which yields the same 'clickProb' / 'n_obs' columns.
test_Y_clickProb = (
    test_recordDFwType[['click', 'userType', 'adsType']]
    .groupby(['userType', 'adsType'])['click']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'clickProb', 'count': 'n_obs'})
)

In [None]:
# Drop article type 3 and keep only the user types that have at least
# `threshold` interaction records for every remaining article type.
#
# fill_value=0 matters: a (user type, article type) pair with no records would
# otherwise become NaN after unstack(), and min(axis=1) skips NaN - letting a
# user type with zero records for some article type pass the filter.
test_tmp = (
    test_Y_clickProb['n_obs']
    .unstack(fill_value=0)
    .drop([3], axis=1)
    .min(axis=1)
)
test_tmp_index = test_tmp[test_tmp >= threshold].index


In [None]:
# Suffix shared by the three test output files.
file_suffix = str(train_val_split * 100) + '%' + '_' + str(threshold)

# Click-probability matrix: retained user types x article types (type 3 removed).
test_clickProb_matrix = (
    test_Y_clickProb['clickProb']
    .unstack()
    .loc[test_tmp_index]
    .drop([3], axis=1)
)
filename = 'filtered_test_clickprob_' + file_suffix + '.npy'
np.save(filename, test_clickProb_matrix.values)

# Number of test records for each retained user type.
test_user_counts = test_recordDFwType.groupby('userType').size()
filename = 'filtered_test_usernumobserv_' + file_suffix + '.npy'
np.save(filename, test_user_counts.loc[test_tmp_index].values)

# User features come from the TRAINING feature map, looked up by user type,
# stacked in the same order as the retained test user types.
test_X_user = np.vstack(
    [train_featMap[user_type] for user_type in list(test_tmp_index)]
)
filename = 'filtered_test_userFeat_' + file_suffix + '.npy'
np.save(filename, test_X_user)