In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import os 
import json
from pandas.io.json import json_normalize

In [2]:
os.chdir("GoogleAnalyticsSales/")

In [3]:
JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
# converter applies a converter function while you are loading the data, you are loading it as
# json.loadstring = json.loads() for each of those columns in the list
train_set = pd.read_csv("train.csv", 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'})

In [4]:
train_set.groupby("channelGrouping").size()

channelGrouping
(Other)              120
Affiliates         16403
Direct            143026
Display             6262
Organic Search    381561
Paid Search        25326
Referral          104838
Social            226117
dtype: int64

In [5]:
train_set.groupby("socialEngagementType").size()

socialEngagementType
Not Socially Engaged    903653
dtype: int64

In [6]:
train_set.groupby("visitNumber").size()

visitNumber
1      703060
2       92548
3       35843
4       19157
5       11615
6        7677
7        5413
8        4031
9        3084
10       2415
11       1936
12       1573
13       1292
14       1092
15        928
16        809
17        699
18        611
19        541
20        497
21        446
22        399
23        355
24        340
25        304
26        272
27        245
28        223
29        203
30        188
        ...  
358         1
359         1
360         1
361         1
362         1
363         1
364         1
369         1
370         1
371         1
372         1
373         1
374         1
375         1
376         1
377         1
378         1
379         1
383         1
384         1
385         1
386         1
387         1
388         1
389         1
390         1
391         1
393         1
394         1
395         1
Length: 384, dtype: int64

In [4]:
# drop it since it has just one value for all the columns
train_set.drop(["socialEngagementType"], axis=1, inplace=True)

In [5]:
# for each column, since you are already loaded it as JSON
for column in JSON_COLUMNS:
        # normalize them, that is whether all the rows has this variable or not, create a seperate column
        # for each of them and give NaN is that attribute is not present for the row
        column_as_df = json_normalize(list(train_set[column]))
        # convert each json key in that column to column.subcolumn, for example, device.isMobile.
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # drop the main column, for example "device" and merge all the columns you just created like "device.browser"
        train_set = train_set.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
print(train_set.shape, "\n", train_set.columns)

(903653, 54) 
 Index(['channelGrouping', 'date', 'fullVisitorId', 'sessionId', 'visitId',
       'visitNumber', 'visitStartTime', 'device.browser', 'device.browserSize',
       'device.browserVersion', 'device.deviceCategory', 'device.flashVersion',
       'device.isMobile', 'device.language', 'device.mobileDeviceBranding',
       'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName',
       'device.mobileDeviceModel', 'device.mobileInputSelector',
       'device.operatingSystem', 'device.operatingSystemVersion',
       'device.screenColors', 'device.screenResolution', 'geoNetwork.city',
       'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro',
       'geoNetwork.networkDomain', 'geoNetwork.networkLocation',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'totals.transactionRevenue', 'totals.visit

In [6]:
# go through all the columns in the train_set
for column in train_set.columns:
    # whichever column was a JSON and you split it
    if "." in column:
        print(train_set.groupby(column).size())
        # check how many different values you have in them, if its just one value throughout the dataset, drop it
        if (len(train_set.groupby(column).size()) == 1):
            train_set.drop(column, axis=1, inplace=True)

device.browser
(not set)                                           8
0                                                   7
ADM                                                 1
Amazon Silk                                       561
Android Browser                                   553
Android Runtime                                     2
Android Webview                                  7865
Apple-iPhone7C2                                     9
BlackBerry                                        184
CSM Click                                           1
Changa 99695759                                     1
Chrome                                         620364
Coc Coc                                           727
DASH_JR_3G                                          4
DoCoMo                                              1
Edge                                            10205
Firefox                                         37069
HTC802t_TD                                          1
Hisense M20-M

geoNetwork.latitude
not available in demo dataset    903653
dtype: int64
geoNetwork.longitude
not available in demo dataset    903653
dtype: int64
geoNetwork.metro
(not set)                                    201766
Abilene-Sweetwater TX                            10
Albany-Schenectady-Troy NY                       19
Atlanta GA                                     2463
Augusta GA                                        7
Austin TX                                      3790
Baltimore MD                                     32
Boise ID                                          3
Boston MA-Manchester NH                        2628
Butte-Bozeman MT                                  3
Central Scotland                                 65
Charleston SC                                    52
Charlotte NC                                    525
Charlottesville VA                               13
Chattanooga TN                                    6
Chicago IL                                     7585
Chic

geoNetwork.subContinent
(not set)               1468
Australasia            14893
Caribbean               2406
Central America        15583
Central Asia            1215
Eastern Africa          1927
Eastern Asia           46919
Eastern Europe         45249
Melanesia                 81
Micronesian Region        55
Middle Africa            393
Northern Africa         7683
Northern America      390657
Northern Europe        58168
Polynesia                 25
South America          41731
Southeast Asia         77800
Southern Africa         2169
Southern Asia          59321
Southern Europe        35780
Western Africa          2573
Western Asia           38443
Western Europe         59114
dtype: int64
totals.bounces
1    450630
dtype: int64
totals.hits
1      446754
10      10640
100        39
101        35
102        27
103        27
104        28
105        28
106        30
107        21
108        27
109        27
11       9264
110        28
111        21
112        20
113        16
114   

trafficSource.adwordsClickInfo.page
1     21362
14        1
2        73
3        10
4         2
5         7
7         3
9         2
dtype: int64
trafficSource.adwordsClickInfo.slot
RHS      504
Top    20956
dtype: int64
trafficSource.campaign
(not set)                                          865347
AW - Accessories                                     7070
AW - Apparel                                           46
AW - Dynamic Search Ads Whole Site                  14244
AW - Electronics                                       96
All Products                                            4
Data Share                                              1
Data Share Promo                                    16403
Retail (DO NOT EDIT owners nophakun and tianyu)        50
test-liyuhz                                           392
dtype: int64
trafficSource.campaignCode
11251kjhkvahf    1
dtype: int64
trafficSource.isTrueDirect
True    274005
dtype: int64
trafficSource.keyword
"google store" refund cancel

In [7]:
# you are left with 31 columns
train_set.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'sessionId', 'visitId',
       'visitNumber', 'visitStartTime', 'device.browser',
       'device.deviceCategory', 'device.isMobile', 'device.operatingSystem',
       'geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
       'geoNetwork.subContinent', 'totals.hits', 'totals.pageviews',
       'totals.transactionRevenue', 'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.keyword', 'trafficSource.medium',
       'trafficSource.referralPath', 'trafficSource.source'],
      dtype='object')

In [8]:
# convert the int64 datatype of date to meaningful dates of typedatetime64
train_set.date = pd.to_datetime(train_set.date,format='%Y%m%d')

In [9]:
train_set.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,...,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,2016-09-02,1.13166e+18,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,...,,,,,,(not set),(not provided),organic,,google
1,Organic Search,2016-09-02,3.77306e+17,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,...,,,,,,(not set),(not provided),organic,,google
2,Organic Search,2016-09-02,3.89555e+18,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,...,,,,,,(not set),(not provided),organic,,google
3,Organic Search,2016-09-02,4.76345e+18,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,...,,,,,,(not set),google + online,organic,,google
4,Organic Search,2016-09-02,2.72944e+16,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,...,,,,,,(not set),(not provided),organic,,google


In [10]:
train_set.dtypes


channelGrouping                                         object
date                                            datetime64[ns]
fullVisitorId                                           object
sessionId                                               object
visitId                                                  int64
visitNumber                                              int64
visitStartTime                                           int64
device.browser                                          object
device.deviceCategory                                   object
device.isMobile                                           bool
device.operatingSystem                                  object
geoNetwork.city                                         object
geoNetwork.continent                                    object
geoNetwork.country                                      object
geoNetwork.metro                                        object
geoNetwork.networkDomain                               

In [14]:
train_set.date.dtype, train_set.visitId.dtype

(dtype('<M8[ns]'), dtype('int64'))

In [24]:
# DATE_COLUMNS = [weekofyear, is_month_start, is_month_end, is_quarter_start, is_quarter_end, is_year_start, is_year_end, year, month, day]
# pd.get_dummies(train_set.date.dt.year)
# for column in DATE_COLUMNS:
#     frame = pd.DataFrame()
#     train_set = pd.concat([train_set, frame], axis=1)

frame = pd.DataFrame(train_set.date.dt.dayofyear)
frame = frame.rename(index=str, columns={"date": "date_dayofyear"})
frame

Unnamed: 0,date_dayofyear
0,246
1,246
2,246
3,246
4,246
5,246
6,246
7,246
8,246
9,246


In [25]:
train_set = pd.concat([train_set, frame], axis=1)

  return self._int64index.union(other)


In [26]:
train_set.shape

(1807306, 33)

In [28]:
train_set.groupby("date_dayofyear").size()

date_dayofyear
1.0      1364
2.0      1620
3.0      2403
4.0      2390
5.0      2193
6.0      2210
7.0      1615
8.0      1637
9.0      2308
10.0     2268
11.0     2185
12.0     2203
13.0     1967
14.0     1526
15.0     1576
16.0     1906
17.0     2443
18.0     2245
19.0     2083
20.0     2074
21.0     1685
22.0     1700
23.0     2262
24.0     3618
25.0     2986
26.0     2209
27.0     1969
28.0     1614
29.0     1814
30.0     2392
         ... 
337.0    3751
338.0    3044
339.0    3220
340.0    4265
341.0    3021
342.0    2840
343.0    3013
344.0    2830
345.0    2207
346.0    2208
347.0    3433
348.0    3166
349.0    3040
350.0    2937
351.0    2956
352.0    2324
353.0    2128
354.0    3222
355.0    3110
356.0    2594
357.0    2100
358.0    1604
359.0    1231
360.0    1386
361.0    1586
362.0    1855
363.0    1763
364.0    1647
365.0    1232
366.0    1211
Length: 366, dtype: int64

In [49]:
train_set.drop(train_set.date, axis=1, inplace=True)


In [22]:
train_set.drop(train_set["date.dayofyear"], axis=1, inplace=True)

KeyError: '[nan nan nan ... nan nan nan] not found in axis'