<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/infocepts_data_engg_hackathon/notebooks/02_tasks_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [64]:
import re
import numpy as np
import pandas as pd

# Render every column (no truncation) and show floats with 10 decimal places.
pd.options.display.max_columns = None
pd.options.display.precision = 10

In [65]:
# Base URL of the competition's data directory; file paths below are appended to it.
data_path = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/infocepts_data_engg_hackathon/data'

In [66]:
# Load the processed dataset from the data directory.
processed_csv = f'{data_path}/processed/data.csv'
df = pd.read_csv(processed_csv)

In [67]:
df.head()

Unnamed: 0,start_time_ts,total_duration,total_clicks,total_items,total_cats,day_of_week,is_special_day,prod_views_freqs,prod_buys_freqs
0,"October 07, 2019 Monday, 00:08:28",166.7111748417,7,2,1,1,0,0.0009451492,0.0006900279
1,"October 01, 2019 Tuesday, 15:08:40",0.4341847739,7,2,1,0,0,0.0009458164,0.0006896519
2,"October 14, 2019 Monday, 21:17:24",0.9353204474,2,22,1,4,0,0.0009453911,0.0006900878
3,"October 05, 2019 Saturday, 21:54:13",0.4170550607,2,2,1,0,0,0.000944991,0.0007388218
4,"October 09, 2019 Wednesday, 13:33:22",2.4857195833,5,1,1,5,0,0.000945052,0.0006895188


# Task 1

In [68]:
# Inspect how the first timestamp tokenises on spaces, commas and colons
# (adjacent delimiters produce empty-string tokens).
first_timestamp = df.loc[0, 'start_time_ts']
re.split(r'[ ,:]', first_timestamp)

['October', '07', '', '2019', 'Monday', '', '00', '08', '28']

In [69]:
def allot_points(record):
    """Compute time_spec_points from a start_time_ts string.

    The hour of day is taken from the ``HH:MM:SS`` portion of the string
    (e.g. ``"October 07, 2019 Monday, 00:08:28"``) and converted to points:

    1. points = hour of day, if the hour is before 16 (4 pm)
    2. points = hour of day + 5, if the hour is 16 or later

    Parameters
    ----------
    record : str
        Timestamp string containing a 24-hour ``HH:MM:SS`` time.

    Returns
    -------
    int
        The points allotted for the session's start hour.

    Raises
    ------
    ValueError
        If no ``HH:MM:SS`` time can be found in ``record``.
    """
    # Match the clock time directly rather than indexing into the result of
    # a split: a positional index depends on the exact number of empty
    # tokens produced by adjacent delimiters and breaks on extra whitespace.
    time_match = re.search(r'(\d{1,2}):\d{2}:\d{2}', record)
    if time_match is None:
        raise ValueError(f'no HH:MM:SS time found in {record!r}')
    hour = int(time_match.group(1))

    if hour < 16:  # 4 pm -> 16:00 hours
        points = hour
    else:
        points = hour + 5

    return points

In [70]:
# Score every session's start time with allot_points.
start_timestamps = df['start_time_ts']
df['time_spec_points'] = start_timestamps.map(allot_points)

In [71]:
df[['start_time_ts', 'time_spec_points']].sample(5, random_state=1)

Unnamed: 0,start_time_ts,time_spec_points
352806,"October 01, 2019 Tuesday, 17:54:30",22
417824,"October 09, 2019 Wednesday, 03:22:18",3
469847,"October 09, 2019 Wednesday, 15:18:10",15
407746,"October 17, 2019 Thursday, 23:44:08",28
469848,"October 07, 2019 Monday, 22:47:06",27


# Task 2

In [72]:
# Keep total_duration to two decimal places.
df['total_duration'] = df['total_duration'].round(2)

In [73]:
df['total_duration'].head()

0    166.71
1      0.43
2      0.94
3      0.42
4      2.49
Name: total_duration, dtype: float64

# Task 3

In [74]:
# total_inventory is the row-wise sum of clicks, items and categories.
df['total_inventory'] = (
    df['total_clicks']
    .add(df['total_items'])
    .add(df['total_cats'])
)

In [75]:
df[['total_clicks', 'total_items', 'total_cats', 'total_inventory']].sample(5, random_state=1)

Unnamed: 0,total_clicks,total_items,total_cats,total_inventory
352806,2,2,1,5
417824,2,3,6,11
469847,8,2,1,11
407746,5,3,1,9
469848,4,1,1,6


If `total_inventory` is greater than 10, increase it by 100%, i.e., double the value.

In [76]:
# Recompute the base sum from the source columns instead of reading the
# current total_inventory: the original cell overwrote its own input, so
# re-running it doubled already-doubled values. This version gives the same
# result no matter how many times the cell is executed.
inventory_base = df['total_clicks'] + df['total_items'] + df['total_cats']

# Double the value wherever the base sum exceeds 10; leave it unchanged otherwise.
df['total_inventory'] = inventory_base.mask(inventory_base > 10, inventory_base * 2)

In [77]:
df[['total_clicks', 'total_items', 'total_cats', 'total_inventory']].sample(5, random_state=1)

Unnamed: 0,total_clicks,total_items,total_cats,total_inventory
352806,2,2,1,5
417824,2,3,6,22
469847,8,2,1,22
407746,5,3,1,9
469848,4,1,1,6


# Task 4

In [78]:
# Flag (as 1/0) rows where is_special_day is 1 and day_of_week is 0.
on_special_day = df['is_special_day'].eq(1)
on_day_zero = df['day_of_week'].eq(0)
df['give_big_discount'] = (on_special_day & on_day_zero).astype('int')

In [79]:
df[['is_special_day', 'day_of_week', 'give_big_discount']].sample(10, random_state=0)

Unnamed: 0,is_special_day,day_of_week,give_big_discount
194257,0,5,0
396319,0,5,0
138738,0,5,0
66570,0,5,0
149424,0,0,0
128165,0,5,0
264495,0,0,0
30371,0,2,0
456998,0,1,0
32001,1,0,1


# Task 5

In [80]:
# Ratio of product view frequency to product buy frequency, row by row.
view_freqs = df['prod_views_freqs']
buy_freqs = df['prod_buys_freqs']
df['prod_views_buys_ratio'] = view_freqs.div(buy_freqs)

In [81]:
df[['prod_views_freqs', 'prod_buys_freqs', 'prod_views_buys_ratio']].sample(5, random_state=0)

Unnamed: 0,prod_views_freqs,prod_buys_freqs,prod_views_buys_ratio
194257,0.0009454488,0.0007392539,1.2789229189
396319,0.0009610268,0.0006896054,1.3935894819
138738,0.000945159,0.0006897377,1.3703166397
66570,0.0009462386,0.0007398585,1.2789453165
149424,0.0009459357,0.0007389214,1.28015735


# Task 6

Loyalty points are based on whether the user is browsing on a Sunday. There are two possible interpretations of the day of the week:
* column 'day_of_week' from which 0 can be mapped to Sunday
* extracting day from 'start_time_ts' string and using it  

These two methods give different values.  
For task-4, column 'day_of_week' was explicitly mentioned and required to be equal to 0. For task-6, the day is mentioned as being 'Sunday'. **So we will go with the second approach.**

In [82]:
def get_day(record):
    """Extract the weekday name from a start_time_ts string.

    The weekday is the alphabetic word that follows the four-digit year,
    e.g. ``"October 07, 2019 Monday, 00:08:28"`` -> ``"Monday"``.

    Parameters
    ----------
    record : str
        Timestamp string in which the weekday name follows the year.

    Returns
    -------
    str
        The weekday name exactly as written in ``record``.

    Raises
    ------
    ValueError
        If no word following a four-digit year can be found in ``record``.
    """
    # Anchor on the year instead of a fixed token index after splitting:
    # a positional index silently returns the wrong token (e.g. the year)
    # when the spacing between fields differs.
    day_match = re.search(r'\d{4}\s+([A-Za-z]+)', record)
    if day_match is None:
        raise ValueError(f'no weekday name found in {record!r}')
    return day_match.group(1)

In [83]:
# Derive the weekday name for every row and store it with pandas' string dtype.
day_names = df['start_time_ts'].map(get_day)
df['day_from_timestamp'] = day_names.astype('string')

In [84]:
df[['start_time_ts', 'day_from_timestamp']].sample(5, random_state=1)

Unnamed: 0,start_time_ts,day_from_timestamp
352806,"October 01, 2019 Tuesday, 17:54:30",Tuesday
417824,"October 09, 2019 Wednesday, 03:22:18",Wednesday
469847,"October 09, 2019 Wednesday, 15:18:10",Wednesday
407746,"October 17, 2019 Thursday, 23:44:08",Thursday
469848,"October 07, 2019 Monday, 22:47:06",Monday


In [85]:
df['day_from_timestamp'].unique()

<StringArray>
['Monday', 'Tuesday', 'Saturday', 'Wednesday', 'Friday', 'Thursday', 'Sunday']
Length: 7, dtype: string

In [86]:
# Keep total_duration for Sunday sessions and zero it out for every other day.
is_sunday = df['day_from_timestamp'] == 'Sunday'
df['loyalty_points'] = is_sunday * df['total_duration']

In [87]:
# Derive the points from the source columns rather than from the current
# loyalty_points value: the original cell transformed its own output, so
# re-running it applied the formula to already-computed points.
# Duration counts only for Sunday sessions (zero otherwise).
sunday_duration = (df['day_from_timestamp'] == 'Sunday') * df['total_duration']

# 10 points per unit of duration beyond the first 3; nothing at or below 3.
df['loyalty_points'] = sunday_duration.apply(lambda x: (x - 3) * 10 if x > 3 else 0)

In [88]:
# Spot-check loyalty points on Sunday rows.
sunday_rows = df['day_from_timestamp'] == 'Sunday'
df.loc[sunday_rows, ['total_duration', 'loyalty_points']].sample(10, random_state=1)

Unnamed: 0,total_duration,loyalty_points
423330,0.13,0.0
306149,23.62,206.2
60846,2.73,0.0
156788,1.16,0.0
218217,0.02,0.0
355565,55.43,524.3
169773,4.27,12.7
301288,0.05,0.0
51077,6.29,32.9
298316,0.51,0.0


In [89]:
# Spot-check loyalty points on rows that are not Sundays.
non_sunday_rows = df['day_from_timestamp'] != 'Sunday'
df.loc[non_sunday_rows, ['total_duration', 'loyalty_points']].sample(5, random_state=1)

Unnamed: 0,total_duration,loyalty_points
117689,14.15,0.0
363187,1.94,0.0
71439,7.5,0.0
272814,16.36,0.0
229359,3.74,0.0


# Submission file

In [90]:
# Columns written to the submission file, in output order.
sub_columns = [
    'total_duration',
    'is_special_day',
    'prod_views_freqs',
    'prod_buys_freqs',
    'give_big_discount',
    'prod_views_buys_ratio',
    'loyalty_points',
    'total_inventory',
    'time_spec_points',
]

# Select only the submission columns, keeping every row.
sub_df = df.loc[:, sub_columns]

In [91]:
sub_df.head()

Unnamed: 0,total_duration,is_special_day,prod_views_freqs,prod_buys_freqs,give_big_discount,prod_views_buys_ratio,loyalty_points,total_inventory,time_spec_points
0,166.71,0,0.0009451492,0.0006900279,0,1.3697259554,0.0,10,0
1,0.43,0,0.0009458164,0.0006896519,0,1.3714402807,0.0,10,15
2,0.94,0,0.0009453911,0.0006900878,0,1.3699576946,0.0,50,26
3,0.42,0,0.000944991,0.0007388218,0,1.2790512833,0.0,5,26
4,2.49,0,0.000945052,0.0006895188,0,1.3705964181,0.0,7,13


In [92]:
# Write the submission to disk without the DataFrame index.
submission_file = '02_sub.csv'
sub_df.to_csv(submission_file, index=False)

In [93]:
!head 02_sub.csv

total_duration,is_special_day,prod_views_freqs,prod_buys_freqs,give_big_discount,prod_views_buys_ratio,loyalty_points,total_inventory,time_spec_points
166.71,0,0.0009451491729694,0.0006900279353354,0,1.3697259553846814,0.0,10,0
0.43,0,0.0009458164456021,0.0006896519366427,0,1.3714402807399286,0.0,10,15
0.94,0,0.0009453910736723,0.0006900877869533,0,1.3699576945799172,0.0,50,26
0.42,0,0.0009449909696147,0.0007388217985755,0,1.2790512833225935,0.0,5,26
2.49,0,0.0009450520425059,0.0006895188328313,0,1.3705964181215042,0.0,7,13
0.44,0,0.0009453624766345,0.000689549194347,0,1.3709862681077514,0.0,7,13
0.61,0,0.0009611731128219,0.0006900535468109,0,1.3928964168997986,0.0,10,27
0.02,0,0.0009453409681643,0.0006894011723935,0,1.3712494350455111,0.0,32,7
2.62,0,0.0009455220279722,0.0007366282136403,0,1.2835810663558214,0.0,7,9
