In [1]:
import pandas as pd
import numpy as np
import re
from pandas.io.json import json_normalize
from pandas.api.types import is_string_dtype

In [2]:
PATH = '../data/'

###### todo:

1. <s> Figure out how to get stuff out of the JSON columns </s>
2. Run Pandas profiling on the data (separate notebook)
3. Prepare the dependent variable so it predicts what it needs to 
4. <s> Make sure categorical variables are correctly encoded </s>
5. <s> one-hot encode variables with cardinality <7 </s>
6. <s> Get variables out of the date variable </s>
7. Deal with missing values
7. Run a baseline model

minor:
  - get nbwidgets so I can be more organised with chapters


In [3]:
# 1. Load the data

In [4]:
f'{PATH}train.csv'

'../data/train.csv'

In [5]:
# Get data hidden inside json columns out into separate columns 
# credit: julian3833 https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields

In [6]:
JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

In [7]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

def load_df(csv_path=f'{PATH}train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [8]:
# df = load_df()

In [9]:
df = load_df(nrows=2000)


Loaded train.csv. Shape: (2000, 54)


In [11]:
def add_datepart(df, fldname, drop=True, time=False):
    
    fld = df[fldname]
    fld_dtype = fld.dtype

    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
        
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: 
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: 
        df[targ_pre + n] = getattr(fld.dt, n.lower())
        
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    
    if drop: 
        df.drop(fldname, axis=1, inplace=True)

In [12]:
add_datepart(df, 'date')


In [13]:
#Converting variables with string values into categories.

In [14]:
cat_vars = []
for n,c in df.items():
    if is_string_dtype(c): 
        cat_vars.append(n)
        df[n] = c.astype('category').cat.as_ordered()


In [17]:
# A quick look at the missing values

In [18]:
df.isnull().sum().sort_index()/len(df)

Day                                                  0.0000
Dayofweek                                            0.0000
Dayofyear                                            0.0000
Elapsed                                              0.0000
Is_month_end                                         0.0000
Is_month_start                                       0.0000
Is_quarter_end                                       0.0000
Is_quarter_start                                     0.0000
Is_year_end                                          0.0000
Is_year_start                                        0.0000
Month                                                0.0000
Week                                                 0.0000
Year                                                 0.0000
channelGrouping                                      0.0000
device.browser                                       0.0000
device.browserSize                                   0.0000
device.browserVersion                   

In [19]:
# one-hot encoding categorical variables with cardinality <= 15.

In [24]:
def dummify(df, min_cardinality=0, max_cardinality=15):

    to_dummify = []
    
    for n,c in df.items():
        
        if str(df.dtypes[n]) == 'category':
            if ((len(df[n].cat.categories)) > min_cardinality) \
            & ((len(df[n].cat.categories)) <= max_cardinality):
                to_dummify.append(n)
                
    dummified = pd.get_dummies(df[to_dummify], dummy_na=True)
    dummified = pd.concat([dummified, df], axis=1)
    dummified_df = dummified.drop(to_dummify, axis=1)
    
    return to_dummify, dummified_df

In [25]:
dumd_vars, df = dummify(df)

['channelGrouping', 'socialEngagementType', 'device.browser', 'device.browserSize', 'device.browserVersion', 'device.deviceCategory', 'device.flashVersion', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystem', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.continent', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.metro', 'geoNetwork.networkLocation', 'totals.bounces', 'totals.newVisits', 'totals.visits', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.medium']


In [33]:
#These are the remaining, not dummified categorical variables.
#Let's convert them to ints (otherwise sklearn's random forest is grumpy)

remaining_cat_vars = (list(set(cat_vars) - set(dumd_vars))) 

for var in remaining_cat_vars:
    df[var] = df[var].cat.codes

In [34]:
#this is the final list of our columns
[col for col in df.columns]

['channelGrouping_Affiliates',
 'channelGrouping_Direct',
 'channelGrouping_Display',
 'channelGrouping_Organic Search',
 'channelGrouping_Paid Search',
 'channelGrouping_Referral',
 'channelGrouping_Social',
 'channelGrouping_nan',
 'socialEngagementType_Not Socially Engaged',
 'socialEngagementType_nan',
 'device.browser_ADM',
 'device.browser_BlackBerry',
 'device.browser_Chrome',
 'device.browser_Coc Coc',
 'device.browser_Edge',
 'device.browser_Firefox',
 'device.browser_Internet Explorer',
 'device.browser_Mozilla Compatible Agent',
 'device.browser_Opera',
 'device.browser_Opera Mini',
 'device.browser_Safari',
 'device.browser_Safari (in-app)',
 'device.browser_UC Browser',
 'device.browser_nan',
 'device.browserSize_not available in demo dataset',
 'device.browserSize_nan',
 'device.browserVersion_not available in demo dataset',
 'device.browserVersion_nan',
 'device.deviceCategory_desktop',
 'device.deviceCategory_mobile',
 'device.deviceCategory_tablet',
 'device.deviceCate

In [None]:
Use the last 3 months of the training set as validation

In [36]:
df.year

AttributeError: 'DataFrame' object has no attribute 'year'

In [None]:
df, y = 

In [37]:
df.Elapsed

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
1970    0
1971    0
1972    0
1973    0
1974    0
1975    0
1976    0
1977    0
1978    0
1979    0
1980    0
1981    0
1982    0
1983    0
1984    0
1985    0
1986    0
1987    0
1988    0
1989    0
1990    0
1991    0
1992    0
1993    0
1994    0
1995    0
1996    0
1997    0
1998    0
1999    0
Name: Elapsed, Length: 2000, dtype: int64