<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/infocepts_data_engg_hackathon/notebooks/03_tasks_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Version 1](https://github.com/stiwari-ds/data-science-competitions/blob/main/machinehack/infocepts_data_engg_hackathon/notebooks/02_tasks_v1.ipynb)

# Setup

In [1]:
import re
import numpy as np
import pandas as pd

# Notebook display settings: never truncate columns, show 10 decimal places.
pd.options.display.max_columns = None
pd.options.display.precision = 10

In [2]:
# Base URL of the hackathon's data folder (raw files served from GitHub).
data_path = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/infocepts_data_engg_hackathon/data'

In [3]:
# Read the processed dataset from the data folder into a DataFrame.
processed_csv_url = f'{data_path}/processed/data.csv'
df = pd.read_csv(processed_csv_url)

In [4]:
# Preview the first five records.
df.head(n=5)

Unnamed: 0,start_time_ts,total_duration,total_clicks,total_items,total_cats,day_of_week,is_special_day,prod_views_freqs,prod_buys_freqs
0,"October 07, 2019 Monday, 00:08:28",166.7111748417,7,2,1,1,0,0.0009451492,0.0006900279
1,"October 01, 2019 Tuesday, 15:08:40",0.4341847739,7,2,1,0,0,0.0009458164,0.0006896519
2,"October 14, 2019 Monday, 21:17:24",0.9353204474,2,22,1,4,0,0.0009453911,0.0006900878
3,"October 05, 2019 Saturday, 21:54:13",0.4170550607,2,2,1,0,0,0.000944991,0.0007388218
4,"October 09, 2019 Wednesday, 13:33:22",2.4857195833,5,1,1,5,0,0.000945052,0.0006895188


# Task 1
**Same as version 1**

In [5]:
# Tokenise the first timestamp to inspect which pieces the split produces.
first_timestamp = df.loc[0, 'start_time_ts']
re.split(r'[ ,:]', first_timestamp)

['October', '07', '', '2019', 'Monday', '', '00', '08', '28']

In [6]:
def allot_points(record):
    """Allot time_spec_points from the hour of day in a timestamp string.

    The hour is taken from the first ``HH:MM:SS`` group found in ``record``
    (e.g. ``'October 07, 2019 Monday, 00:08:28'`` -> 0), so the result does
    not depend on how many words or separators precede the time.

    Points are allotted as:
    1. points = hour of day, if the hour is below 16 (before 4 pm)
    2. points = hour of day + 5, if the hour is 16 or above (4 pm onwards)

    Parameters
    ----------
    record : str
        Timestamp text containing a 24-hour ``HH:MM:SS`` time.

    Returns
    -------
    int
        The allotted points.

    Raises
    ------
    ValueError
        If no ``HH:MM:SS`` time can be found in ``record``.
    """
    # Locate the time by its shape instead of by token position, so a change
    # in the date part cannot make another field be read as the hour.
    time_match = re.search(r'\b(\d{1,2}):\d{2}:\d{2}\b', record)
    if time_match is None:
        raise ValueError(f'no HH:MM:SS time found in {record!r}')
    hour = int(time_match.group(1))

    if hour < 16:  # 4 pm -> 16:00 hours
        return hour
    return hour + 5

In [7]:
# Task 1: derive time_spec_points from each record's start timestamp.
start_times = df['start_time_ts']
df['time_spec_points'] = start_times.map(allot_points)

# Task 2
**Same as version 1**

In [8]:
# Task 2: keep total_duration to two decimal places.
df['total_duration'] = df['total_duration'].round(decimals=2)

# Task 3
**Same as version 1**

In [9]:
# Task 3: total inventory = clicks + items + categories for each record.
df['total_inventory'] = (
    df['total_clicks']
    .add(df['total_items'])
    .add(df['total_cats'])
)

If the value is greater than 10, increase it by 100%, i.e., double it.

In [10]:
# Double every inventory total above 10. The base total is recomputed from the
# source columns, so re-running this cell cannot double the values a second time.
inventory_base = df['total_clicks'] + df['total_items'] + df['total_cats']
df['total_inventory'] = inventory_base.mask(inventory_base > 10, inventory_base * 2)

# Task 4
**Same as version 1**

In [11]:
# Task 4: flag (as 0/1) records where is_special_day == 1 and day_of_week == 0.
on_special_day = df['is_special_day'].eq(1)
on_day_zero = df['day_of_week'].eq(0)
df['give_big_discount'] = (on_special_day & on_day_zero).astype('int')

# Task 5
**Same as version 1**

In [12]:
# Task 5: ratio of product view frequency to product buy frequency.
df['prod_views_buys_ratio'] = df['prod_views_freqs'].div(df['prod_buys_freqs'])

# Task 6
**Alternate approach compared to version 1**

Loyalty points depend on whether the user is browsing on a Sunday. There are two ways to determine the day of the week:
* use the 'day_of_week' column, in which 0 can be mapped to Sunday
* extract the day name from the 'start_time_ts' string  

These two methods give different values (e.g., in the preview above, rows 0 and 2 both say "Monday" in 'start_time_ts' but have 'day_of_week' values of 1 and 4).  
In version 1 of the notebook we tried the second approach. **So here we will go with the first approach.**

In [13]:
# Keep total_duration on day_of_week == 0 rows and zero it on all other rows.
day_zero_flag = df['day_of_week'].eq(0)
df['loyalty_points'] = day_zero_flag.mul(df['total_duration'])

In [14]:
# Award 10 points per unit of duration beyond 3, only on day_of_week == 0 rows;
# every other row gets 0. The eligible duration is rebuilt from the source
# columns, so re-running this cell gives the same result instead of applying
# the formula to already-computed points.
eligible_duration = df['total_duration'].where(df['day_of_week'] == 0, 0)
df['loyalty_points'] = ((eligible_duration - 3) * 10).where(eligible_duration > 3, 0)

In [21]:
# Spot-check: loyalty points for a few day_of_week == 0 records.
day_zero_rows = df.loc[df['day_of_week'] == 0, ['total_duration', 'loyalty_points']]
day_zero_rows.sample(n=5, random_state=2)

Unnamed: 0,total_duration,loyalty_points
295782,10.8,78.0
272017,0.37,0.0
319606,8.14,51.4
484697,2.48,0.0
181623,0.53,0.0


In [24]:
# Spot-check: records on any other day should all have zero loyalty points.
other_day_rows = df.loc[df['day_of_week'] != 0, ['total_duration', 'loyalty_points']]
other_day_rows.sample(n=5, random_state=1)

Unnamed: 0,total_duration,loyalty_points
386692,0.74,0.0
141661,7.45,0.0
10807,15.73,0.0
496631,0.04,0.0
288428,0.06,0.0


# Submission file

In [25]:
# Columns written to the submission file, in output order.
sub_columns = [
    'total_duration',
    'is_special_day',
    'prod_views_freqs',
    'prod_buys_freqs',
    'give_big_discount',
    'prod_views_buys_ratio',
    'loyalty_points',
    'total_inventory',
    'time_spec_points',
]

# Select only the submission columns from the full frame.
sub_df = df.loc[:, sub_columns]

In [26]:
# Preview the first five submission rows.
sub_df.head(n=5)

Unnamed: 0,total_duration,is_special_day,prod_views_freqs,prod_buys_freqs,give_big_discount,prod_views_buys_ratio,loyalty_points,total_inventory,time_spec_points
0,166.71,0,0.0009451492,0.0006900279,0,1.3697259554,0.0,10,0
1,0.43,0,0.0009458164,0.0006896519,0,1.3714402807,0.0,10,15
2,0.94,0,0.0009453911,0.0006900878,0,1.3699576946,0.0,50,26
3,0.42,0,0.000944991,0.0007388218,0,1.2790512833,0.0,5,26
4,2.49,0,0.000945052,0.0006895188,0,1.3705964181,0.0,7,13


In [27]:
# Write the submission file without the index column.
sub_df.to_csv(path_or_buf='03_sub.csv', index=False)

In [28]:
!head 03_sub.csv

total_duration,is_special_day,prod_views_freqs,prod_buys_freqs,give_big_discount,prod_views_buys_ratio,loyalty_points,total_inventory,time_spec_points
166.71,0,0.0009451491729694,0.0006900279353354,0,1.3697259553846814,0.0,10,0
0.43,0,0.0009458164456021,0.0006896519366427,0,1.3714402807399286,0.0,10,15
0.94,0,0.0009453910736723,0.0006900877869533,0,1.3699576945799172,0.0,50,26
0.42,0,0.0009449909696147,0.0007388217985755,0,1.2790512833225935,0.0,5,26
2.49,0,0.0009450520425059,0.0006895188328313,0,1.3705964181215042,0.0,7,13
0.44,0,0.0009453624766345,0.000689549194347,0,1.3709862681077514,0.0,7,13
0.61,0,0.0009611731128219,0.0006900535468109,0,1.3928964168997986,0.0,10,27
0.02,0,0.0009453409681643,0.0006894011723935,0,1.3712494350455111,0.0,32,7
2.62,0,0.0009455220279722,0.0007366282136403,0,1.2835810663558214,0.0,7,9
