<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/infocepts_data_engg_hackathon/notebooks/04_tasks_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Version 1](https://github.com/stiwari-ds/data-science-competitions/blob/main/machinehack/infocepts_data_engg_hackathon/notebooks/02_tasks_v1.ipynb)

# [Version 2](https://github.com/stiwari-ds/data-science-competitions/blob/main/machinehack/infocepts_data_engg_hackathon/notebooks/03_tasks_v2.ipynb)

# Setup

In [1]:
import numpy as np
import pandas as pd

# Display settings: show every column and print floats with 10 decimals.
pd.options.display.max_columns = None
pd.options.display.precision = 10

In [2]:
# Base URL of the hackathon's data directory (raw GitHub content); file paths are appended to it.
data_path = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/infocepts_data_engg_hackathon/data'

In [3]:
# Read the processed dataset straight from the remote data directory.
processed_csv_url = f'{data_path}/processed/data.csv'
df = pd.read_csv(processed_csv_url)

In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,start_time_ts,total_duration,total_clicks,total_items,total_cats,day_of_week,is_special_day,prod_views_freqs,prod_buys_freqs
0,"October 07, 2019 Monday, 00:08:28",166.7111748417,7,2,1,1,0,0.0009451492,0.0006900279
1,"October 01, 2019 Tuesday, 15:08:40",0.4341847739,7,2,1,0,0,0.0009458164,0.0006896519
2,"October 14, 2019 Monday, 21:17:24",0.9353204474,2,22,1,4,0,0.0009453911,0.0006900878
3,"October 05, 2019 Saturday, 21:54:13",0.4170550607,2,2,1,0,0,0.000944991,0.0007388218
4,"October 09, 2019 Wednesday, 13:33:22",2.4857195833,5,1,1,5,0,0.000945052,0.0006895188


# Task 1
**Same approach as version 1 and 2**  
Modifications:
* Added an explicit check for sessions that start at exactly 4 pm (16:00:00).
* Used pandas' built-in datetime parsing instead of regex.

In [5]:
# Raw values look like "October 07, 2019 Monday, 00:08:28". Passing the format
# explicitly keeps parsing fast and deterministic instead of relying on
# pandas to infer this unusual layout (which can fall back to slow per-row
# parsing). Re-running the cell on the already-parsed column is a no-op.
df['start_time_ts'] = pd.to_datetime(
    df['start_time_ts'], format='%B %d, %Y %A, %H:%M:%S'
)

In [6]:
# Split the session start timestamp into separate hour/minute/second columns.
for time_part in ('hour', 'minute', 'second'):
    df[time_part] = getattr(df['start_time_ts'].dt, time_part)

In [7]:
df.query('hour == 16 and minute == 0 and second == 0').shape[0] #exactly 4 pm

0

In [8]:
df.query('hour == 16 and minute == 0 and second > 0').shape[0] #first minute after 4 pm

325

In [9]:
def allot_points(record):
    """Return the time_spec_points for one session start time.

    `record` is any object exposing an `.hour` attribute in 24-hour form
    (for example a pandas Timestamp or a datetime).

    Scoring:
    1. hour before 16 (4 pm)  -> points = hour
    2. hour 16 or later       -> points = hour + 5
       (a start at exactly 16:00:00 falls in this branch)
    """
    hour_of_day = record.hour
    late_bonus = 0 if hour_of_day < 16 else 5  # 4 pm -> 16:00 hours
    return hour_of_day + late_bonus

In [10]:
# Score every session start time with allot_points, one timestamp at a time.
df['time_spec_points'] = [allot_points(start_ts) for start_ts in df['start_time_ts']]

# Task 6
**Same approach as version 2, but performed before rounding off total_duration (Task-2)**

In [11]:
# Keep total_duration where day_of_week == 0 and zero it out everywhere else.
df['loyalty_points'] = df['total_duration'] * df['day_of_week'].eq(0)

In [12]:
# Loyalty points: for rows with day_of_week == 0, award 10 points per unit of
# total_duration above 3; every other row gets 0.
# Computed directly from the source columns (not from the previous value of
# loyalty_points) so re-running this cell cannot compound the result, and
# vectorised with np.where instead of a row-wise Python lambda.
df['loyalty_points'] = np.where(
    (df['day_of_week'] == 0) & (df['total_duration'] > 3),
    (df['total_duration'] - 3) * 10,
    0.0,
)

In [13]:
# Spot-check rows with day_of_week == 0: points should be positive only when duration > 3.
df.loc[df['day_of_week'] == 0, ['total_duration', 'loyalty_points']].sample(n=5, random_state=2)

Unnamed: 0,total_duration,loyalty_points
295782,10.8028419406,78.0284194057
272017,0.3732031142,0.0
319606,8.1373858164,51.3738581642
484697,2.4827634629,0.0
181623,0.5334883236,0.0


In [14]:
# Spot-check rows with day_of_week != 0: loyalty_points should always be 0.
df.loc[df['day_of_week'] != 0, ['total_duration', 'loyalty_points']].sample(n=5, random_state=1)

Unnamed: 0,total_duration,loyalty_points
386692,0.7432265055,0.0
141661,7.4486855067,0.0
10807,15.7318145508,0.0
496631,0.0355527736,0.0
288428,0.0639463632,0.0


# Task 2
**Same as version 1 and 2**

In [15]:
# Round total_duration to two decimal places.
df['total_duration'] = df['total_duration'].round(2)

# Task 3
**Same as version 1 and 2**

In [16]:
# total_inventory = clicks + items + categories for each session.
df['total_inventory'] = df['total_clicks'].add(df['total_items']).add(df['total_cats'])

If `total_inventory` is greater than 10, increase it by 100%, i.e., double the value.

In [17]:
# Double every total_inventory value that is strictly greater than 10; leave the rest as-is.
df['total_inventory'] = df['total_inventory'].mask(
    df['total_inventory'] > 10, df['total_inventory'] * 2
)

# Task 4
**Same as version 1 and 2**

In [18]:
# Flag (1/0) rows that are both a special day and have day_of_week == 0.
df['give_big_discount'] = (df['is_special_day'].eq(1) & df['day_of_week'].eq(0)).astype(int)

# Task 5
**Same as version 1 and 2**

In [19]:
# Ratio of product view frequency to product buy frequency.
df['prod_views_buys_ratio'] = df['prod_views_freqs'].div(df['prod_buys_freqs'])

# Submission file

In [20]:
# Columns written to the submission file, in output order.
sub_columns = [
    'total_duration',
    'is_special_day',
    'prod_views_freqs',
    'prod_buys_freqs',
    'give_big_discount',
    'prod_views_buys_ratio',
    'loyalty_points',
    'total_inventory',
    'time_spec_points',
]

sub_df = df.loc[:, sub_columns]

In [21]:
# Preview the first five rows of the submission frame.
sub_df.head(n=5)

Unnamed: 0,total_duration,is_special_day,prod_views_freqs,prod_buys_freqs,give_big_discount,prod_views_buys_ratio,loyalty_points,total_inventory,time_spec_points
0,166.71,0,0.0009451492,0.0006900279,0,1.3697259554,0.0,10,0
1,0.43,0,0.0009458164,0.0006896519,0,1.3714402807,0.0,10,15
2,0.94,0,0.0009453911,0.0006900878,0,1.3699576946,0.0,50,26
3,0.42,0,0.000944991,0.0007388218,0,1.2790512833,0.0,5,26
4,2.49,0,0.000945052,0.0006895188,0,1.3705964181,0.0,7,13


In [22]:
# Write the submission CSV without the DataFrame index column.
sub_df.to_csv(path_or_buf='04_sub.csv', index=False)

In [23]:
# Print the first lines of the written CSV to check the header and value formatting.
!head 04_sub.csv

total_duration,is_special_day,prod_views_freqs,prod_buys_freqs,give_big_discount,prod_views_buys_ratio,loyalty_points,total_inventory,time_spec_points
166.71,0,0.0009451491729694,0.0006900279353354,0,1.3697259553846814,0.0,10,0
0.43,0,0.0009458164456021,0.0006896519366427,0,1.3714402807399286,0.0,10,15
0.94,0,0.0009453910736723,0.0006900877869533,0,1.3699576945799172,0.0,50,26
0.42,0,0.0009449909696147,0.0007388217985755,0,1.2790512833225935,0.0,5,26
2.49,0,0.0009450520425059,0.0006895188328313,0,1.3705964181215042,0.0,7,13
0.44,0,0.0009453624766345,0.000689549194347,0,1.3709862681077514,0.0,7,13
0.61,0,0.0009611731128219,0.0006900535468109,0,1.3928964168997986,0.0,10,27
0.02,0,0.0009453409681643,0.0006894011723935,0,1.3712494350455111,0.0,32,7
2.62,0,0.0009455220279722,0.0007366282136403,0,1.2835810663558214,0.0,7,9
