#### Importing libraries and data

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 

In [2]:
# importing website traffic data
# The data folder defaults to the original local location; set the
# ACCESS_LOGS_DIR environment variable to point the notebook at another
# folder (e.g. on a different machine) without editing this cell.
path = os.environ.get(
    'ACCESS_LOGS_DIR',
    r'C:\Users\Richárd\Desktop\CareerFoundry\Data Immersion\Achievement 6\project\Data',
)
data = pd.read_csv(os.path.join(path, 'tokenized_access_logs.csv'))

#### Data Cleaning / Preprocessing

In [3]:
# (rows, columns) of the loaded access-log frame
data.shape

(469977, 8)

In [4]:
# preview the first rows of the raw data
data.head()

Unnamed: 0,Product,Category,Date,Month,Hour,Department,ip,url
0,adidas Brazuca 2017 Official Match Ball,baseball & softball,9/1/2017 6:00,Sep,6,fitness,37.97.182.65,/department/fitness/category/baseball%20&%20so...
1,The North Face Women's Recon Backpack,hunting & shooting,9/1/2017 6:00,Sep,6,fan shop,206.56.112.1,/department/fan%20shop/category/hunting%20&%20...
2,adidas Kids' RG III Mid Football Cleat,featured shops,9/1/2017 6:00,Sep,6,apparel,215.143.180.0,/department/apparel/category/featured%20shops/...
3,Under Armour Men's Compression EV SL Slide,electronics,9/1/2017 6:00,Sep,6,footwear,206.56.112.1,/department/footwear/category/electronics/prod...
4,Pelican Sunstream 100 Kayak,water sports,9/1/2017 6:01,Sep,6,fan shop,136.108.56.242,/department/fan%20shop/category/water%20sports...


In [5]:
# data type of each column
data.dtypes

Product       object
Category      object
Date          object
Month         object
Hour           int64
Department    object
ip            object
url           object
dtype: object

In [6]:
# checking for null values: number of missing entries per column
data.isnull().sum()

Product       0
Category      0
Date          0
Month         0
Hour          0
Department    0
ip            0
url           0
dtype: int64

In [7]:
# Collect every row that is an exact repeat of an earlier row
dups = data.loc[data.duplicated()]


In [8]:
# (rows, columns) of the duplicated rows found above
dups.shape

(3249, 8)

In [9]:
# Share of rows that are duplicates, computed from the frames themselves
# instead of numbers copied by hand from earlier outputs, so the value stays
# correct if the input data changes.
len(dups) / len(data)

0.0069131042582934905

- Even though there are duplicates, upon a closer look it is visible that they might still be valid clicks, so I see no reason to remove them.

In [10]:
# smoothing dates to daily basis
import datetime
# Parse the whole 'Date' column in one vectorized call instead of invoking
# pd.to_datetime once per row through apply, which is very slow on a frame
# of this size.
data['Day'] = pd.to_datetime(data['Date'])

In [11]:
# Reduce each timestamp in 'Day' to its calendar date (datetime.date objects).
# Passing the column through pd.to_datetime first keeps this cell safe to
# re-run after 'Day' has already been converted to date objects.
data['Day'] = pd.to_datetime(data['Day']).dt.date

In [85]:
# preview again to check the newly added 'Day' column
data.head()

Unnamed: 0,Product,Category,Date,Month,Hour,Department,ip,url,Day
0,adidas Brazuca 2017 Official Match Ball,baseball & softball,9/1/2017 6:00,Sep,6,fitness,37.97.182.65,/department/fitness/category/baseball%20&%20so...,2017-09-01
1,The North Face Women's Recon Backpack,hunting & shooting,9/1/2017 6:00,Sep,6,fan shop,206.56.112.1,/department/fan%20shop/category/hunting%20&%20...,2017-09-01
2,adidas Kids' RG III Mid Football Cleat,featured shops,9/1/2017 6:00,Sep,6,apparel,215.143.180.0,/department/apparel/category/featured%20shops/...,2017-09-01
3,Under Armour Men's Compression EV SL Slide,electronics,9/1/2017 6:00,Sep,6,footwear,206.56.112.1,/department/footwear/category/electronics/prod...,2017-09-01
4,Pelican Sunstream 100 Kayak,water sports,9/1/2017 6:01,Sep,6,fan shop,136.108.56.242,/department/fan%20shop/category/water%20sports...,2017-09-01


In [86]:
# Keep only the columns needed for the daily time series
time_series = data.loc[:, ['Day', 'Category', 'ip']]

In [87]:
# Count the rows in every (Day, Category) group; the result feeds the plot
to_plot = time_series.groupby(['Day', 'Category']).count()

In [88]:
# first rows of the per-day, per-category counts
to_plot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ip
Day,Category,Unnamed: 2_level_1
2017-09-01,accessories,50
2017-09-01,as seen on tv!,77
2017-09-01,baseball & softball,78
2017-09-01,basketball,80
2017-09-01,boxing & mma,62


In [89]:
# plotting the data: one line per category showing its daily number of clicks
fig = px.line(
    to_plot.reset_index(),
    x = 'Day',
    y = 'ip',
    color = 'Category',
    # after the groupby count, 'ip' holds the number of rows per group,
    # so the axis is labelled as a click count rather than "ip"
    labels = {'ip': 'Clicks'},
    title = 'Daily clicks per category',
)
fig.show()

 - Something strange appears to have happened on 14 Sep 2017 - could it have been an attack on the site?

In [90]:
# Largest number of clicks that any single ip produced on 14 Sep 2017
from datetime import datetime
spike_day = datetime.strptime('09-14-2017', '%m-%d-%Y').date()
clicks_on_spike_day = data.loc[data['Day'] == spike_day, 'ip']
clicks_on_spike_day.value_counts().max()
# does not seem so

38