#### Import libraries

In [1]:
import pandas as pd 
import numpy as np 
from datetime import datetime 
import xmltodict 
import matplotlib.pyplot as plt 
%matplotlib inline 

#### Load file 

In [2]:
# Read the raw XML inside a context manager so the file handle is closed
# deterministically instead of lingering until garbage collection.
with open('infocept_xml_data_.xml', 'r') as xml_file:
    data = xml_file.read()

# xmltodict turns the document into nested dicts; the records are read from
# the <row> entries under the top-level <data> element.
data_dict = xmltodict.parse(data)
data_dict = data_dict['data']['row']

# One DataFrame row per <row> record.
data_df = pd.DataFrame.from_dict(data_dict)

#### Brief evaluation of data 

In [3]:
# Preview the first five rows to sanity-check the columns parsed from the XML.
data_df.head()

Unnamed: 0,index,user_session,total_clicks,total_items,total_cats,max_dwell,mean_dwell,total_duration,click_rate,day_of_week,...,cat_most_viewed,prod_most_viewed_n_times,prod_most_viewed,cat_views_freqs,prod_views_freqs,cat_buys_freqs,prod_buys_freqs,is_purchase,start_time_ts,end_time_ts
0,0,8b69208e-7c64-422d-b670-5b9d82d25606,7,2,1,0.5115078598666931,5.995877926032465,166.71117484170466,0.0725244742626448,1,...,2053013560982667520,1.0,1030297,0.2678021521567366,0.0009451491729694,0.2337402332570538,0.0006900279353354,0,"October 07, 2019 Monday, 00:08:28","October 26, 2019 Saturday, 16:08:29"
1,1,8068596c-a2b8-46ca-8791-e10d70520a1f,7,2,1,7.626575685782095,3.5291647514620803,0.4341847739325721,0.2042478218468011,0,...,2053013602328876544,1.0,36083228,0.1409251198603834,0.0009458164456021,0.4550292185895114,0.0006896519366427,0,"October 01, 2019 Tuesday, 15:08:40","October 15, 2019 Tuesday, 20:03:18"
2,2,4d153217-e06d-4581-8924-702c7f757610,2,22,1,3.0956374607750474,180.97751575631065,0.935320447421298,0.0459822489852758,4,...,2175094838038844672,2.0,1002263,0.0002938263260135,0.0009453910736723,9.102016583495409e-08,0.0006900877869533,0,"October 14, 2019 Monday, 21:17:24","October 23, 2019 Wednesday, 21:56:30"
3,3,e65a6869-fb45-4f34-8482-c80278676d0f,2,2,1,1.0891282494607244,4.33828036473822,0.4170550606952284,0.0284757557656416,0,...,2053013568140556288,3.0,1003257,0.1501068135750563,0.0009449909696147,0.4550279409441771,0.0007388217985755,0,"October 05, 2019 Saturday, 21:54:13","October 17, 2019 Thursday, 13:54:29"
4,4,d8206742-06d7-42a2-8163-d70f9dd6cb18,5,1,1,0.3298140458866487,3.1808911181790744,2.485719583259645,0.0948123574889597,5,...,2053013558410492160,1.0,13728056,0.2677838216567613,0.0009450520425059,0.4550284240027174,0.0006895188328313,0,"October 09, 2019 Wednesday, 13:33:22","October 29, 2019 Tuesday, 11:51:04"


In [4]:
# List each column's dtype. Every column is reported as `object` in the output,
# so numeric fields need an explicit cast before any arithmetic.
data_df.dtypes 

index                       object
user_session                object
total_clicks                object
total_items                 object
total_cats                  object
max_dwell                   object
mean_dwell                  object
total_duration              object
click_rate                  object
day_of_week                 object
is_weekend                  object
is_special_day              object
time_of_day                 object
cat_most_viewed_n_times     object
cat_most_viewed             object
prod_most_viewed_n_times    object
prod_most_viewed            object
cat_views_freqs             object
prod_views_freqs            object
cat_buys_freqs              object
prod_buys_freqs             object
is_purchase                 object
start_time_ts               object
end_time_ts                 object
dtype: object

In [5]:
# Count missing values per column (the captured output shows none).
data_df.isna().sum()

index                       0
user_session                0
total_clicks                0
total_items                 0
total_cats                  0
max_dwell                   0
mean_dwell                  0
total_duration              0
click_rate                  0
day_of_week                 0
is_weekend                  0
is_special_day              0
time_of_day                 0
cat_most_viewed_n_times     0
cat_most_viewed             0
prod_most_viewed_n_times    0
prod_most_viewed            0
cat_views_freqs             0
prod_views_freqs            0
cat_buys_freqs              0
prod_buys_freqs             0
is_purchase                 0
start_time_ts               0
end_time_ts                 0
dtype: int64

#### Function to calculate "time_spec_points"

In [8]:
def cal_time_spec_points(x):
    """Return the time-based points for a single timestamp.

    The base score is the hour of ``x``. A bonus of 5 is added when the hour
    is 16 or later AND both the minute and the second are non-zero.

    Args:
        x: Any object exposing ``hour``, ``minute`` and ``second``
            attributes (e.g. ``datetime`` or ``pandas.Timestamp``).

    Returns:
        ``x.hour`` or ``x.hour + 5``.
    """
    # NOTE(review): requiring minute > 0 and second > 0 means times such as
    # 16:00:30 or 17:00:00 get no bonus. If the intent is "strictly after
    # 16:00:00" this condition is too narrow -- confirm against the spec.
    earns_bonus = x.hour >= 16 and x.minute > 0 and x.second > 0
    if earns_bonus:
        return x.hour + 5
    return x.hour

#### Data transformations 

In [9]:
# Parse the textual start timestamp (e.g. "October 07, 2019 Monday, 00:08:28")
# into real datetimes so the hour/minute/second parts can be read.
data_df['start_time_ts'] = pd.to_datetime(data_df['start_time_ts'], format='%B %d, %Y %A, %H:%M:%S')
# The function is passed directly; wrapping it in a lambda added nothing.
data_df['time_spec_points'] = data_df['start_time_ts'].apply(cal_time_spec_points)

# Capture the unrounded duration exactly once. Without this guard, re-running
# the cell would rebuild the copy from the already-rounded `total_duration`
# and silently change `loyalty_points`.
if 'total_duration_copy' not in data_df.columns:
    data_df['total_duration_copy'] = data_df['total_duration'].astype(float)
# Always derive the rounded column from the raw copy so the cell is idempotent.
data_df['total_duration'] = data_df['total_duration_copy'].round(2)

# total_inventory = clicks + items + categories, doubled when the sum exceeds 10.
data_df['total_inventory'] = data_df['total_clicks'].astype(int) + data_df['total_items'].astype(int) + data_df['total_cats'].astype(int)
data_df['total_inventory'] = np.where(data_df['total_inventory'] > 10, data_df['total_inventory'] * 2, data_df['total_inventory'])

# `day_of_week` and `is_special_day` are still strings at this point, hence
# the comparisons against '0' / '1'. The day mask is shared by two features.
is_day_zero = data_df['day_of_week'] == '0'

# Flag (1/0) rows that fall on day 0 and are marked as a special day.
data_df['give_big_discount'] = np.where(is_day_zero & (data_df['is_special_day'] == '1'), 1, 0)

# Ratio of product view frequency to product buy frequency.
data_df['prod_views_buys_ratio'] = data_df['prod_views_freqs'].astype(float) / data_df['prod_buys_freqs'].astype(float)

# On day 0, award 10 points per unit of (unrounded) duration above 3; otherwise 0.
data_df['loyalty_points'] = np.where(is_day_zero & (data_df['total_duration_copy'] > 3), 10 * (data_df['total_duration_copy'] - 3), 0)

#### Write to output file 

In [None]:
# The sample submission file defines which columns to export and in what order.
submission_df = pd.read_csv(filepath_or_buffer='infocept_submission.csv')

# Select exactly those columns from the transformed frame.
final_submission = data_df.loc[:, submission_df.columns]

# Write the result without the DataFrame index.
final_submission.to_csv(
    path_or_buf='infocepts_transformed_submission.csv',
    index=False,
)