In [5]:
import pandas as pd
import numpy as np
import pickle
from google.colab import drive
# Mount Google Drive at /content/drive so files stored on Drive are reachable from this Colab session.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
def _bucket_days(days):
    '''Map a Series of day counts onto the labels '0-7', '7-14', '14-30' and '30+'.'''
    conditions = [
        days <= 7,
        (days > 7) & (days <= 14),
        (days > 14) & (days <= 30),
        days > 30,
    ]
    labels = ['0-7', '7-14', '14-30', '30+']
    # Anything not matched by a condition falls back to '30+'.
    return np.select(conditions, labels, '30+')


def import_and_clean_feature_set(df_feature_set):
    '''
    Objective:
    Clean the product feature set that is used as independent variables in the model:
    fill missing values, derive launch / relaunch / discount features and drop
    columns that are no longer needed.

    Input parameter:
    df_feature_set - DataFrame holding the raw product features. It must contain the
    columns referenced below (click / impression / discount metrics, launch and
    relaunch timings, availability, material and season attributes).
    The frame is modified in place.

    Output:
    Cleaned feature set (the same DataFrame object that was passed in)

    '''
    # Columns whose missing values can be replaced with zeros.
    # time_since_relaunch is deliberately NOT in this list: a missing value there
    # means "never relaunched" and is filled from time_since_launch further down.
    # Zero-filling it first would make that fallback unreachable.
    zero_fill_cols = ['ttl_clicks_AE', 'ttl_impressions_AE', 'click_through_rate',
                      'discount_pct', 'time_since_launch']
    df_feature_set[zero_fill_cols] = df_feature_set[zero_fill_cols].fillna(0)

    df_feature_set['time_since_launch'] = df_feature_set['time_since_launch'].astype(int)

    # If the product has never been relaunched, time_since_relaunch is set equal to
    # time_since_launch. This has to happen before the integer cast, which cannot
    # represent missing values.
    relaunch_days = np.where(pd.isna(df_feature_set['time_since_relaunch']),
                             df_feature_set['time_since_launch'],
                             df_feature_set['time_since_relaunch'])
    df_feature_set['time_since_relaunch'] = relaunch_days.astype(int)

    # Missing availability: when there was a click we assume the availability was
    # 100%, with zero clicks we assume 0%. Known availability values are kept.
    clicks = df_feature_set['ttl_clicks_AE']
    availability = df_feature_set['availability_dxb_pct']
    availability_missing = pd.isna(availability)
    df_feature_set['availability_dxb_pct'] = np.select(
        [availability_missing & (clicks == 0), availability_missing & (clicks > 0)],
        [0, 100],
        availability,
    )

    # Relaunch flag and recent-launch flags
    df_feature_set['relaunch_flag'] = np.where(pd.isna(df_feature_set['relaunched_at']), 0, 1)
    df_feature_set['launched_last30days'] = np.where(df_feature_set['time_since_launch'] <= 30, 1, 0)
    df_feature_set['launched_last7days'] = np.where(df_feature_set['time_since_launch'] <= 7, 1, 0)

    # Buckets for time since launch / relaunch
    df_feature_set['time_since_launch_bucket'] = _bucket_days(df_feature_set['time_since_launch'])
    df_feature_set['time_since_relaunch_bucket'] = _bucket_days(df_feature_set['time_since_relaunch'])

    # Bucket for discount percentage, stored as the string form of the interval.
    # NOTE(review): pd.cut uses right-closed bins by default, so discount_pct == 0
    # (which includes the zero-filled missing values) is outside (0, 10] and gets no
    # interval. Left unchanged to keep the existing bucket labels stable - confirm
    # whether include_lowest=True is wanted.
    buckets = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101]
    df_feature_set['discount_bucket'] = pd.cut(df_feature_set['discount_pct'], bins=buckets)
    df_feature_set['discount_bucket'] = df_feature_set['discount_bucket'].astype(str)

    # Attach "NA" wherever the following categorical columns are missing
    na_fill_cols = ['world_tag', 'upper_material', 'lining_material', 'sole_material', 'season_grouped_ty']
    df_feature_set[na_fill_cols] = df_feature_set[na_fill_cols].fillna("NA")

    # Dropping unwanted columns
    to_be_dropped = ['season', 'season_number', 'relaunched_at', 'activated_at']
    df_feature_set.drop(to_be_dropped, axis=1, inplace=True)

    return df_feature_set


# Load the raw feature extract from Drive, clean it and persist the result.
raw_features = pd.read_pickle('drive/MyDrive/Training Data/df_query2.pkl')
cleaned_features = import_and_clean_feature_set(raw_features)

# NOTE(review): the output is written in CSV format although the file name ends in
# ".pkl" - confirm how downstream code reads this file before changing either side.
cleaned_features.to_csv('drive/MyDrive/Training Data/cleaned_prod_feature_set.pkl')