In [1]:
# Notebook-wide setup: imports plus a few session-level side effects
# (warning suppression, NLTK corpus download, Google Drive mount).
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import random
import pickle
from google.cloud import bigquery
import math
import re
#!pip install xgboost
import xgboost as xgb
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
# NOTE(review): not referenced by any cell visible here - presumably used by a later cross-validation step.
num_folds = 4
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt 
import seaborn as sns
#!pip install gensim
from gensim.models import Word2Vec
import nltk
# Fetches the NLTK stop-word corpus (network access required).
nltk.download('stopwords')
# Same import as the `from nltk.corpus import stopwords` line earlier in this cell.
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer 
# Colab-only: mounts Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Turns off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [2]:
def click_model(df_ip,num_pos=[5,10,15], model = 'lr'):
    '''
    Objective:
    * Fit a click model that regresses the total clicks of every
      (imp_path, sku) pair on its impressions at each imp_position.
    * The predicted clicks are used together with the actual clicks to obtain
      a relevance score; sort_and_process_relevance_data computes it as
      CEIL(5 * (ratio / max ratio of the query)), ratio = actual/predicted clicks.

    Input parameters:
    df_ip: Input dataframe with the columns imp_path, sku, imp_position,
           total_impressions and total_clicks
    num_pos: iterable of position cut-offs; one model is fitted per cut-off
             using only the rows with imp_position <= cut-off
             (the default list is only read, never mutated)
    model: type of model  - 'lr' = linear regression, 'rf' = random forest

    Output (tuple):
    * The fitted estimator of the LAST cut-off in num_pos
    * A dataframe containing the in-sample accuracy (RMSE, R2, MAPE) for each cut-off
    * The pivoted training dataframe of the LAST cut-off with an additional
      'predicted' column containing the predicted clicks

    Raises:
    ValueError: if model is neither 'lr' nor 'rf', or if num_pos is empty

    '''

    # Fail early with a clear message; previously an unknown model or an empty
    # num_pos surfaced later as a NameError/UnboundLocalError.
    if model not in ('lr', 'rf'):
        raise ValueError("model must be 'lr' or 'rf', got %r" % (model,))
    cutoffs = list(num_pos)
    if not cutoffs:
        raise ValueError("num_pos must contain at least one position cut-off")

    int_cols = ['imp_position','total_impressions','total_clicks']
    df_results = pd.DataFrame(columns=['imp_positions','train_rmse','train_r2','train_MAPE'])

    for i in cutoffs:
        # reset_index returns a new frame, so the caller's df_ip is never modified.
        df = df_ip[df_ip['imp_position']<=i].reset_index(drop=True)
        df[int_cols] = df[int_cols].astype(int)

        # One row per (imp_path, sku), one column per position holding the
        # impressions at that position (0 when the sku was never shown there).
        df_pivot = df.pivot_table(index=['imp_path','sku'],values = 'total_impressions', columns= 'imp_position' ).fillna(0).reset_index()
        click_by_item_pos = df.groupby(['imp_path','sku'],as_index=False)['total_clicks'].sum()
        df_pivot = df_pivot.merge(click_by_item_pos, on = ['imp_path','sku'],how='left')
        df_pivot = df_pivot.sort_values(['imp_path','total_clicks'], ascending=[True,False]).reset_index(drop=True)

        # The first two columns are the keys (imp_path, sku); everything after
        # them is a position feature or the target.
        df_train = df_pivot[df_pivot.columns[2:]]
        train_X = df_train.drop(columns=['total_clicks'])
        train_Y = df_train['total_clicks']

        if model == 'lr':
            estimator = LinearRegression()
        else:
            estimator = RandomForestRegressor(n_estimators=200, max_depth=10)

        # The metrics below are in-sample: the model is scored on the data it was fitted on.
        estimator.fit(train_X, train_Y)
        predict_train = estimator.predict(train_X)

        df_results.loc[len(df_results)] = [
            i,
            mean_squared_error(train_Y, predict_train)**(0.5),
            r2_score(train_Y, predict_train),
            MAPE(train_Y,predict_train),
        ]

    # df_pivot / predict_train / estimator all belong to the last cut-off.
    relevance_df = pd.concat([df_pivot,pd.DataFrame(predict_train,columns=['predicted'])],axis=1)
    return estimator,df_results,relevance_df


def MAPE(y_act,y_pred):
    '''
    Objective:
    Mean absolute percentage error expressed as a fraction (1.0 == 100%).

    Observations whose actual value is 0 are skipped. Each percentage error is
    rounded to a whole percent before averaging and the mean is rounded to
    two decimals.

    Input parameters:
    y_act: actual values (array-like)
    y_pred: predicted values (array-like with the same number of elements)

    Output:
    The rounded error as a float, or NaN when there is no non-zero actual
    value to evaluate (empty input or all actuals equal to 0).

    Raises:
    ValueError: if y_act and y_pred do not have the same number of elements
    '''
    y_act = np.asarray(y_act).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_act.shape != y_pred.shape:
        raise ValueError("y_act and y_pred must have the same number of elements")

    # The percentage error is undefined where the actual value is 0.
    nonzero = y_act != 0
    if not nonzero.any():
        # Previously this case ended in a ZeroDivisionError.
        return np.nan

    actual = y_act[nonzero]
    pct_errors = np.round((np.abs(actual - y_pred[nonzero]) / actual) * 100)
    return np.round(((pct_errors.sum() / pct_errors.size) / 100), 2)


def click_model_scoring(df_ip,model,num_pos=45):
    '''
    Objective:
    * Score a dataframe with an already trained click model

    The feature matrix is built exactly as in click_model: rows are limited
    to imp_position <= num_pos, the position/impression/click columns are cast
    to int, and the impressions are pivoted into one column per position.

    Input parameters:
    df_ip: Input dataframe with the columns imp_path, sku, imp_position,
           total_impressions and total_clicks
    model: trained click model (any object exposing predict(X))
    num_pos: positions upto which the imp_postions are considered.
             NOTE(review): this presumably has to equal the cut-off the model
             was trained with, otherwise the number of feature columns differs.

    Output:
    * Output dataframe (one row per imp_path/sku, sorted by imp_path and
      descending total_clicks) containing the position columns, total_clicks
      and the predicted clicks in the column 'predicted'

    '''

    # reset_index returns a new frame, so the caller's df_ip is never modified.
    df = df_ip[df_ip['imp_position']<=num_pos].reset_index(drop=True)

    # Same integer cast as in click_model so that the pivoted column labels and
    # values match what the model saw during training.
    int_cols = ['imp_position','total_impressions','total_clicks']
    df[int_cols] = df[int_cols].astype(int)

    df_pivot = df.pivot_table(index=['imp_path','sku'],values = 'total_impressions', columns= 'imp_position' ).fillna(0).reset_index()
    click_by_item_pos = df.groupby(['imp_path','sku'],as_index=False)['total_clicks'].sum()
    df_pivot = df_pivot.merge(click_by_item_pos, on = ['imp_path','sku'],how='left')
    df_pivot = df_pivot.sort_values(['imp_path','total_clicks'], ascending=[True,False]).reset_index(drop=True)

    # Drop the two key columns (imp_path, sku) and the target; what remains
    # are the per-position impression features.
    test_X = df_pivot[df_pivot.columns[2:]].drop(columns=['total_clicks'])

    predict_test  = model.predict(test_X)

    return pd.concat([df_pivot,pd.DataFrame(predict_test,columns=['predicted'])],axis=1)




def _position_from_label(label):
    '''Return the impression position a column label stands for, or None if it is not a position column.'''
    if isinstance(label, (bool, np.bool_)):
        return None
    if isinstance(label, (int, np.integer)):
        return int(label)
    if isinstance(label, (float, np.floating)):
        return int(label) if float(label).is_integer() else None
    # Labels become strings once a frame has been round-tripped through CSV.
    if isinstance(label, str) and label.strip().isdecimal():
        return int(label.strip())
    return None


def sort_and_process_relevance_data(relevance_df,pos=50):

    '''
    objective:
    Process the output of click model so that it is ready to be joined with feature set

    Steps:
    * ratio = total_clicks / predicted (rounded to 3 decimals)
    * total_imps = impressions summed over positions 1..pos
    * total_imps_1_10 ... total_imps_40_50 = impressions summed over the
      position ranges 1-10, 11-20, 21-30, 31-40 and 41-50; a bucket is set to 0
      when pos is smaller than the upper end of its range
    * per imp_path, rows whose ratio exceeds the 95th percentile are dropped
    * relevance_score = CEIL(5 * ratio / max ratio of the imp_path); rows with a
      negative or NaN score are dropped

    Position columns are located by their label (1, 2, ...) rather than by
    their offset in the frame, so missing positions or extra columns cannot
    leak non-impression columns into the sums.

    Input parameters:
    relevance_df: Output of click model (imp_path, sku, one column per
                  position, total_clicks, predicted).
                  Side effect: the columns ratio, total_imps and total_imps_*
                  are added to this dataframe in place.
    pos: highest impression position to take into account

    Output:
    Processed dataframe

    NOTE(review): predicted <= 0 produces infinite or negative ratios; those
    are not treated specially here - confirm whether they need to be clipped.
    '''

    # Map position -> column label before any derived column is added.
    position_columns = {}
    for column in relevance_df.columns:
        position = _position_from_label(column)
        if position is not None and position >= 1:
            position_columns.setdefault(position, column)

    def impressions_between(first, last):
        # Sum of the impressions over the positions first..last (both inclusive).
        selected = [position_columns[p] for p in range(first, last + 1) if p in position_columns]
        return relevance_df[selected].sum(axis=1)

    relevance_df['ratio'] = np.round((relevance_df['total_clicks']/relevance_df['predicted']),3)
    relevance_df['total_imps'] = impressions_between(1, pos)

    # The five buckets partition positions 1..50, so for pos=50 they add up to total_imps.
    buckets = [
        ('total_imps_1_10', 1, 10),
        ('total_imps_10_20', 11, 20),
        ('total_imps_20_30', 21, 30),
        ('total_imps_30_40', 31, 40),
        ('total_imps_40_50', 41, 50),
    ]
    for bucket_name, first, last in buckets:
        if pos >= last:
            relevance_df[bucket_name] = impressions_between(first, last)
        else:
            relevance_df[bucket_name] = 0

    #Sorting the output df based on query and ratio
    output_columns = ['imp_path','sku','total_clicks','total_imps','total_imps_1_10',
                      'total_imps_10_20','total_imps_20_30','total_imps_30_40','total_imps_40_50','predicted','ratio']
    processed = relevance_df.sort_values(['imp_path','ratio'],ascending=[True,False])[output_columns]

    # Outlier removal: keep only rows at or below the 95th percentile ratio of their query.
    OL_detection = processed.groupby(['imp_path'],as_index=False)['ratio'].quantile(0.95).rename({'ratio':'quantile_95_ratio'},axis=1)
    processed = processed.merge(OL_detection,on='imp_path',how='left')
    processed = processed[processed['ratio']<=processed['quantile_95_ratio']].reset_index(drop=True)

    # Scale the ratio by the best remaining ratio of the query.
    max_ratio = processed.groupby(['imp_path'],as_index=False)['ratio'].max().rename({'ratio':'max_ratio'},axis=1)
    processed = processed.merge(max_ratio,on=['imp_path'])
    processed['relevance_score'] = np.ceil(5*(processed['ratio']/processed['max_ratio']))

    # Comparison with >= also removes NaN scores.
    processed = processed[processed['relevance_score']>=0]
    return processed


def relevance_feature_join(df_feature,relevance_df):
    '''
    Objective:
    Attach the product features to the processed click-model output.

    The relevance rows are left-joined to the features (sku == sku_config),
    every row with a missing value in any column is discarded, and only the
    queries (imp_path) that still have at least 5 distinct skus are kept.

    Input parameters:

    df_feature= feature set dataframe, keyed by sku_config
    relevance_df = processed output of click-model, keyed by sku

    Output:
    Combined dataframe with a fresh 0..n-1 index

    '''
    joined = relevance_df.merge(df_feature,left_on='sku',right_on='sku_config',how='left')
    complete_rows = joined.dropna()

    # Number of distinct skus left per query after dropping incomplete rows.
    skus_per_query = complete_rows.groupby('imp_path')['sku'].nunique()
    fin_training_queries = list(skus_per_query.index[skus_per_query >= 5])

    keep_mask = complete_rows['imp_path'].isin(fin_training_queries)
    return complete_rows[keep_mask].reset_index(drop=True)



In [4]:
#Inputs
df_query1 =  pd.read_pickle('drive/MyDrive/Training Data/stage 1/df_query1.pkl')
df_feature = pd.read_csv('drive/MyDrive/Training Data/stage 2/cleaned_prod_feature_set.csv')

#Initial filtering of data before click model
#Top queries with max clicks: rank the queries by total clicks and keep those
#that together make up at most 90% of all clicks (after removing the excluded paths)
temp = df_query1.groupby(['imp_path'],as_index=False)['total_clicks'].sum().sort_values('total_clicks',ascending=False).reset_index(drop=True)
exclusion = ['-','/','cart']
temp = temp[~temp['imp_path'].isin(exclusion)]
temp['cum_sum_clicks'] = temp['total_clicks'].cumsum()
temp['contribution_to_overall_clicks'] = (temp['cum_sum_clicks']/temp['cum_sum_clicks'].max())

queries = list(temp[temp['contribution_to_overall_clicks']<=0.90]['imp_path'].unique())

#Within every kept query: rank the skus by clicks and keep the clicked skus whose
#cumulative share of the query's clicks is still below 90%
temp = df_query1.groupby(['imp_path','sku'],as_index=False)['total_clicks'].sum().sort_values(['imp_path','total_clicks'],ascending=[False,False]).reset_index(drop=True)
temp = temp[temp['imp_path'].isin(queries)]
temp['click_cumsum_query'] = temp.groupby(['imp_path'],as_index=False)['total_clicks'].cumsum()
temp = temp.merge(temp.groupby(['imp_path'],as_index=False)['click_cumsum_query'].max().rename({'click_cumsum_query':'grand_total_click_query'},axis=1),on=['imp_path'])
temp['perc_contribution'] = temp['click_cumsum_query']/temp['grand_total_click_query']
temp = temp[(temp['perc_contribution']<0.9)&(temp['total_clicks'] > 0)][['imp_path','sku']].drop_duplicates()

#Restrict the raw rows to the selected (imp_path, sku) pairs
df1 = df_query1.merge(temp,on=['imp_path','sku'])
df1['imp_position'] = df1['imp_position'].astype(np.int64)

#Running click model
click__through_model,accuracy, relevance_df = click_model(df1,num_pos=[50], model = 'lr')
print(relevance_df.shape)
print(accuracy)

#sort_and_process_relevance_data adds its derived columns (ratio, total_imps, ...)
#to relevance_df in place, so the "unprocessed" file written below contains them too
relevance_df2 = sort_and_process_relevance_data(relevance_df,pos=50)


df2 = relevance_feature_join(df_feature,relevance_df2)

#Outputs
#The context manager closes (and therefore flushes) the file handle once the model is written
with open('drive/MyDrive/Training Data/stage 3/click_through_model.pkl', 'wb') as model_file:
    pickle.dump(click__through_model, model_file)
#NOTE(review): this file is written as CSV although its name ends in .pkl; the path is
#left untouched because downstream readers are not visible here - confirm and align
relevance_df.to_csv('drive/MyDrive/Training Data/stage 3/unprocessed_click_model_op_ts.pkl')
relevance_df2.to_csv('drive/MyDrive/Training Data/stage 3/processed_click_model_op_ts.csv')
df2.to_csv('drive/MyDrive/Training Data/stage 3/relevance_feature_added_ts.csv')

(248539, 54)
   imp_positions  train_rmse  train_r2  train_MAPE
0           50.0   23.522968  0.523779        1.51
