In [1]:
# Notebook setup: imports, global options, NLTK data download and Google Drive mount.
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings raised by pandas/numpy.
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import random
import pickle
from google.cloud import bigquery
import math
import re
#!pip install xgboost
import xgboost as xgb
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
# Not referenced by the feature-engineering cells visible below.
num_folds = 4
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt 
import seaborn as sns
#!pip install gensim
from gensim.models import Word2Vec
import nltk
# Side effect: downloads the NLTK stopwords corpus (needs network access).
nltk.download('stopwords')
# Repeats the `stopwords` import made a few lines above.
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer 
# Disables pandas' SettingWithCopyWarning for chained assignment.
pd.options.mode.chained_assignment = None
# Colab-only: mounts Google Drive at /content/drive; the data cells read and
# write files under this mount.
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [12]:
def common_words(df, col1, col2):
    """
    Collect, row by row, the words shared by col1 and col2 of df.

    Each cell is converted with str() and split on whitespace. The words found
    in both cells are joined with single spaces; they come out in set order,
    so their order within a string is arbitrary.

    Returns a list of strings with one entry per row of df.
    """
    def _tokens(cell):
        # Unique whitespace-separated words of one cell
        return set(str(cell).split())

    return [
        ' '.join(_tokens(record[col1]) & _tokens(record[col2]))
        for _, record in df[[col1, col2]].iterrows()
    ]

def cosine_similarity_sent(sent1, sent2):
    """
    Set-based cosine similarity of two sentences.

    Both inputs are converted with str() and split on whitespace into sets of
    unique words. The score is |A & B| / (sqrt(|A|) * sqrt(|B|)); it is 0.0
    when either sentence has no words.
    """
    words_a = set(str(sent1).split())
    words_b = set(str(sent2).split())

    norm = math.sqrt(len(words_a)) * math.sqrt(len(words_b))
    if norm:
        return len(words_a & words_b) / norm
    # At least one side is empty: similarity is defined as zero
    return 0.0
    
def jacquard_coefficient_sent(sent1, sent2):
    """
    Jaccard coefficient of two sentences treated as sets of words.

    Both inputs are converted with str() and split on whitespace. The score is
    |A & B| / |A | B|; it is 0.0 when neither sentence contains a word.
    """
    words_a, words_b = (set(str(sentence).split()) for sentence in (sent1, sent2))

    union_size = len(words_a | words_b)
    return len(words_a & words_b) / union_size if union_size else 0.0

#Calculating the distance columns

#Common words between a query column and each of the product hierarchy / attribute columns
def common_word_for_colums(df, x, y):
    """
    Add shared-word features between column x and every column listed in y.

    For each col in y two columns are appended to the frame:
      * "<col>_q_common_words"   - the shared words, joined by spaces
      * "<col>_<x>_common_words" - how many words are shared

    Parameters:
        df : DataFrame containing column x and all columns in y
        x  : name of the query-side text column
        y  : iterable of column names to compare against x

    Returns a new DataFrame (the input frame is not extended in place).

    NOTE(review): the text column name does not include x, so calling this
    function for two different x columns yields two columns with the same
    "<col>_q_common_words" name. That naming is kept so existing consumers of
    the output are unaffected.
    """
    for col in y:
        shared = common_words(df, x, col)

        # Carry df's own index so the concat lines up row by row even when the
        # frame does not have a default 0..n-1 index.
        shared_text = pd.Series(shared, index=df.index, name=str(col) + "_q_common_words")
        df = pd.concat([df, shared_text], axis=1)

        # The count is derived from the shared words themselves; deriving it
        # from a zero-filled placeholder column would give 1 for every row.
        df[str(col) + "_" + str(x) + "_common_words"] = [len(str(words).split()) for words in shared]
    return df

def cosine_similarity_for_columns(df, col1, col2):
    """
    Add one set-based cosine-similarity column per entry of col2.

    For every name `other` in col2, the column
    "<other>_<col1>_cosine_similarity" is written into df; it holds
    cosine_similarity_sent(row[col1], row[other]) for each row.

    df is modified in place and also returned.
    """
    for other in col2:
        feature = str(other) + "_" + str(col1) + "_cosine_similarity"
        # Placeholder first, then the per-row scores overwrite it
        df[feature] = 0
        df[feature] = df.apply(
            lambda row: cosine_similarity_sent(row[col1], row[other]),
            axis=1,
        )
    return df

def to_lower_cols(df, cols):
    """
    Lower-case every value of the given columns.

    Each value's own .lower() method is called, so the columns are expected to
    hold strings. df is modified in place and also returned.
    """
    def _lower(value):
        return value.lower()

    for name in cols:
        df[name] = df[name].apply(_lower)
    return df

def remove_spl_chars(df, cols):
    """
    Replace special characters in the given text columns with a space.

    Every character other than ASCII letters, digits, space, newline and "."
    is replaced by a single space. The columns are expected to hold strings.
    df is modified in place and also returned.
    """
    # Raw string: the non-raw form contained the invalid escape "\." which
    # newer Python versions warn about. Inside a character class "." is
    # already literal, so the set of matched characters is unchanged.
    disallowed = re.compile(r'[^a-zA-Z0-9 \n.]')
    for col in cols:
        df[col] = df[col].apply(lambda text: disallowed.sub(' ', text))
    return df


def query_product_similarity_based_features(df):

    '''
    Objective:
    * Derive text features from the query path (imp_path) and from the product
      attribute columns
    * Add, for both query parts, the shared-word and cosine-similarity features
      against every product attribute column

    Reads the columns imp_path, gender_clean, time_since_launch and the
    product attribute columns listed below. The helper columns are written
    into the input frame; the returned frame additionally carries the
    similarity features.
    '''

    def _path_component(position):
        # Lower-cased "/"-separated component of the path at the given position
        return lambda path: path.split("/")[position].lower()

    query_parts = ['first_part_query', 'second_part_query']

    #Extracting features from queries and product description
    df['first_part_query'] = df['imp_path'].apply(_path_component(1))
    df['second_part_query'] = df['imp_path'].apply(_path_component(2))

    df = remove_spl_chars(df, cols=query_parts)

    # Anything that is not exactly 'Female' is mapped to 'men'
    df['gender_query'] = np.where(df['gender_clean'] == 'Female', 'women', 'men')

    attribute_cols = ['gender_query', 'department', 'category_clean', 'subcategory_clean',
                      'brand_clean', 'occasion', 'basic_type', 'product_detail',
                      'merch_team_clean', 'color']
    df = to_lower_cols(df, cols=attribute_cols)

    df['new_arrival'] = np.where(df['time_since_launch'] <= 30, "new arrivals", "old")

    df['prod_hierarychy_actual'] = (
        df['gender_query'] + " " + df['department'] + " "
        + df['category_clean'] + " " + df['subcategory_clean']
    )

    # The combined hierarchy text comes first, followed by each single attribute
    compared_cols = ['prod_hierarychy_actual'] + attribute_cols

    for query_col in query_parts:
        df = common_word_for_colums(df=df, x=query_col, y=compared_cols)
        df = cosine_similarity_for_columns(df=df, col1=query_col, col2=compared_cols)

    return df

def calc_ctr(df, level, name):
    '''
    Objective:
    Calculating the click through rate, total impressions, total clicks at given level

    Parameters:
        df    : DataFrame with the grouping columns plus 'total_impressions_lp'
                and 'total_clicks_lp'
        level : column name(s) to group by
        name  : prefix used for the three new columns

    Returns df left-merged with one aggregated row per group, adding:
        "<name>_impression_lp" - summed impressions of the group
        "<name>clicks_lp"      - summed clicks of the group (the missing
                                 underscore is kept so existing column names
                                 do not change)
        "<name>_ctr"           - clicks / impressions * 100, rounded to 2
                                 decimals, 0 when the group has no impressions
    '''
    metric_cols = ['total_impressions_lp', 'total_clicks_lp']

    # Several columns must be selected from a groupby with a list; selecting
    # with a bare tuple is rejected by pandas 2.x.
    temp = df.groupby(level, as_index=False)[metric_cols].sum()
    temp[metric_cols] = temp[metric_cols].astype(int)

    impressions = temp['total_impressions_lp']
    clicks = temp['total_clicks_lp']
    temp[str(name) + '_ctr'] = np.round(np.where(impressions == 0, 0, clicks / impressions) * 100, 2)

    temp = temp.rename({'total_impressions_lp': str(name) + '_impression_lp',
                        'total_clicks_lp': str(name) + 'clicks_lp'}, axis=1)
    return df.merge(temp, on=level, how='left')

def historical_performance_features(df_sql3, df):

    '''
    Objective:
    To create historical performance based features at different levels

    Parameters:
        df_sql3 : DataFrame of historical traffic with the columns imp_path,
                  sku, imp_position, total_impressions and total_clicks
        df      : DataFrame to enrich; needs imp_path, sku and every column
                  named in `levels` below

    Returns a new DataFrame holding df plus:
        * total_impressions_lp / total_clicks_lp / ctr_lp per (imp_path, sku)
        * clicks and impressions per impression-position bucket
          ("imp_pos_<a>_<b>_clicks" and "imp_pos_<a>_<b>_imps")
        * CTR, impressions and clicks for every grouping in `levels`
          (see calc_ctr for the column naming)
    '''

    raw_metrics = ['total_impressions', 'total_clicks']
    lp_names = {'total_impressions': 'total_impressions_lp', 'total_clicks': 'total_clicks_lp'}

    #Calculating total impressions and click at query and product level
    # (columns are selected with a list; a bare tuple is rejected by pandas 2.x)
    temp = df_sql3.groupby(['imp_path', 'sku'], as_index=False)[raw_metrics].sum().rename(lp_names, axis=1)

    #Aggregation imp_position level
    temp2 = df_sql3.groupby(['imp_path', 'sku', 'imp_position'], as_index=False)[raw_metrics].sum().rename(lp_names, axis=1)

    bucket_labels = ['imp_pos_0_10',
                     'imp_pos_11_20',
                     'imp_pos_21_30',
                     'imp_pos_31_40',
                     'imp_pos_41_50']
    position = temp2['imp_position']
    # Positions above 50 (or missing) fall into the default bucket.
    # NOTE(review): the default label 'ndefined' looks like a typo of
    # 'undefined'; it is kept because it becomes an output column name.
    temp2['imp_bucket'] = np.select([position <= 10,
                                     (position > 10) & (position <= 20),
                                     (position > 20) & (position <= 30),
                                     (position > 30) & (position <= 40),
                                     (position > 40) & (position <= 50)
                                    ],
                                    bucket_labels,
                                    'ndefined'
                                    )

    temp2 = temp2.groupby(['imp_path', 'sku', 'imp_bucket'], as_index=False)[['total_clicks_lp', 'total_impressions_lp']].sum()

    # One column per bucket: clicks ...
    temp3 = temp2.pivot(index=['imp_path', 'sku'], values='total_clicks_lp', columns='imp_bucket')
    temp3 = temp3.rename({label: label + '_clicks' for label in bucket_labels}, axis=1)

    # ... and impressions. The suffix has to be applied to the impressions
    # pivot itself, otherwise these columns keep the bare bucket names.
    temp4 = temp2.pivot(index=['imp_path', 'sku'], values='total_impressions_lp', columns='imp_bucket')
    temp4 = temp4.rename({label: label + '_imps' for label in bucket_labels}, axis=1)

    # The pivots are indexed by (imp_path, sku); merge matches `on` against
    # those index level names. The un-suffixed default bucket column exists in
    # both pivots and therefore receives pandas' _x / _y suffixes.
    df2 = df.merge(temp, on=['imp_path', 'sku'], how='left')
    df2 = df2.merge(temp3, on=['imp_path', 'sku'], how='left')
    df2 = df2.merge(temp4, on=['imp_path', 'sku'], how='left')

    #levels at which to calculate the CTR using calc_ctr
    levels = [ #CTR variables at 3 levels
                 ['imp_path', 'brand_clean', 'department'],
                 ['imp_path', 'brand_clean', 'category_clean'],
                 ['imp_path', 'brand_clean', 'subcategory_clean'],
                 ['imp_path', 'brand_clean', 'gender_clean'],
                 ['imp_path', 'department', 'category_clean'],
                 ['imp_path', 'department', 'subcategory_clean'],
                 ['imp_path', 'department', 'gender_clean'],
                 ['imp_path', 'category_clean', 'subcategory_clean'],
                 ['imp_path', 'category_clean', 'gender_clean'],
                 ['imp_path', 'subcategory_clean', 'gender_clean'],
                 ['brand_clean', 'department', 'category_clean'],
                 ['brand_clean', 'department', 'subcategory_clean'],
                 # NOTE(review): this grouping is paired with the name
                 # 'brand_cat_subcat' below - confirm which of the two is intended.
                 ['department', 'category_clean', 'subcategory_clean'],

                #CTR variables at 2 levels
                  ['imp_path','brand_clean'],['imp_path','department'],
                  ['imp_path','category_clean'],['imp_path','subcategory_clean'],
                  ['imp_path','gender_clean'],['imp_path','color'],
                  ['imp_path','occasion'],['imp_path','inventory_group_clean'],
                  ['imp_path','basic_type'],['imp_path','specialist'],
                  ['imp_path','merch_team_clean'],['imp_path','time_since_launch_bucket'],
                  ['imp_path','time_since_relaunch_bucket'],['imp_path','discount_bucket'],
                  ['brand_clean', 'department'],['brand_clean', 'category_clean'],
                  ['brand_clean', 'subcategory_clean'],['brand_clean', 'gender_clean'],
                  ['department', 'category_clean'],['department', 'subcategory_clean'],
                  ['department', 'gender_clean'],['category_clean', 'subcategory_clean'],
                  ['category_clean', 'gender_clean'],['subcategory_clean', 'gender_clean'],
                  ['time_since_launch_bucket','brand_clean'],['time_since_launch_bucket','category_clean'],
                  ['time_since_launch_bucket','subcategory_clean'],['time_since_launch_bucket','gender_clean'],
                  ['discount_bucket','brand_clean'],['discount_bucket','category_clean'],
                  ['discount_bucket','subcategory_clean'],['discount_bucket','gender_clean'],

              #CTR variables at 1 level
                 ['brand_clean'],['department'],['category_clean'],['subcategory_clean'],
                 ['gender_clean'],['color'],['occasion'],['inventory_group_clean'],
                 ['basic_type'],['specialist'],['merch_team_clean'],['time_since_launch_bucket'],
                 ['time_since_relaunch_bucket'],['discount_bucket']]


    names = ['imp_brand_dept','imp_brand_cat', 'imp_brand_subcat', 'imp_brand_gender','imp_dept_cat','imp_dept_subcat','imp_dept_gender','imp_cat_subcat','imp_cat_gender','imp_subcat_gender',
             'brand_dept_cat','brand_dept_subcat','brand_cat_subcat',

             'imp_brand','imp_dept','imp_cat','imp_subcat','imp_gender','imp_color','imp_occasion','imp_inv_group','imp_basic_type','imp_specialist',
            'imp_merch_team_clean','imp_time_since_launch_bucket','imp_time_since_relaunch_bucket','imp_discount_bucket','brand_dept','brand_cat','brand_subcat','brand_gender',
             'dept_cat','dept_subcat','dept_gender','cat_subcat','cat_gender','subcat_gender','time_launch_brand','time_launch_cat','time_launch_subcat','time_launch_gender',
             'disc_bucket_brand','disc_bucket_cat','disc_bucket_subcat','disc_bucket_gender',

             'brand','dept','cat','subcat','gender','color','occasion','inv_group','basic_type','specialist','merch_team_clean','time_since_launch_bucket',
             'time_since_relaunch_bucket','discount_bucket']

    # zip() stops at the shorter list, which would silently drop features if
    # the two lists ever get out of step.
    if len(levels) != len(names):
        raise ValueError("levels and names must have the same length")

    for cols, name in zip(levels, names):
        df2 = calc_ctr(df2, cols, name)

    # Rows without any history have no totals after the left merge: treat as 0
    lp_cols = ['total_impressions_lp', 'total_clicks_lp']
    df2[lp_cols] = df2[lp_cols].fillna(0)
    df2[lp_cols] = df2[lp_cols].astype(int)
    df2['ctr_lp'] = np.round(np.where(df2['total_impressions_lp'] == 0, 0, df2['total_clicks_lp'] / df2['total_impressions_lp']) * 100, 2)

    return df2

In [13]:
# Driver cell: load the stage 1 / stage 3 artefacts, add the similarity and
# historical-performance features, and write the stage 4 training dataset.

#Inputs
# Paths are relative, so they resolve against the current working directory;
# the setup cell mounts Google Drive at /content/drive.
# NOTE: unpickling can execute arbitrary code - only load pickles from a trusted source.
df_query3 =  pd.read_pickle('drive/MyDrive/Training Data/stage 1/df_query3.pkl')
df = pd.read_csv('drive/MyDrive/Training Data/stage 3/relevance_feature_added_ts.csv')

#Adding query similarity based features
# NOTE: this call also writes helper columns (e.g. first_part_query) into `df` itself.
df2 = query_product_similarity_based_features(df)

# Adding historical impression / click / CTR features computed from df_query3
df3 = historical_performance_features(df_query3, df2)

#Outputs
# The row index is written as an extra unnamed first column (no index=False).
df3.to_csv('drive/MyDrive/Training Data/stage 4/final_training_dataset.csv')