In [1]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
from IPython.display import Image, display
from pandas.tools.plotting import scatter_matrix 
%matplotlib inline


import category_encoders as ce

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


hits-00000.csv written (9412 cumulative rows)


The total size of the training dataset is ~24 GB, so let's load it in small chunks and perform data cleanup.

Some of the columns are stored as JSON. We need to flatten every JSON field in the file into regular CSV-style columns and build a dataframe without exceeding the available memory.

In [None]:
# Rows per chunk handed back by the CSV reader.
chunksize = 5000
counter = 0
csv_path = "D:\\tempd\\train_v2.csv"

# Columns whose cells hold JSON text; they are parsed into dicts while reading.
JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

chunt_itr = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     chunksize=chunksize)

# Keep only the 2016 rows of every chunk. The filter is applied inline so this
# cell runs on a fresh kernel without relying on a helper defined further down,
# and the pieces are concatenated once instead of re-copying the accumulated
# frame for every chunk.
filtered_chunks = []
for chunk in chunt_itr:
    rows_2016 = chunk[(chunk.date > 20160000) & (chunk.date < 20170000)]
    if not rows_2016.empty:
        filtered_chunks.append(rows_2016)

# pd.concat raises on an empty list, so fall back to an empty frame.
if filtered_chunks:
    raw_data = pd.concat(filtered_chunks, ignore_index=True)
else:
    raw_data = pd.DataFrame()
print('done chunking')

In [None]:
# Process each chunk to filter data for only 2016 and append to a dataframe
def process(chunk):
    """Append the 2016 rows of ``chunk`` to the module-level ``raw_data`` frame.

    Parameters
    ----------
    chunk : pandas.DataFrame
        A slice of the input file with a ``date`` column that compares
        against integers of the form YYYYMMDD.

    Side effects
    ------------
    Rebinds the global ``raw_data`` to a frame holding the previously
    accumulated rows followed by the matching rows of ``chunk``, with a
    fresh 0..n-1 index.
    """
    global raw_data
    filtered_chunk = chunk[(chunk.date > 20160000) & (chunk.date < 20170000)]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. Empty operands are skipped so they cannot influence the
    # dtypes of the combined frame.
    if raw_data.empty:
        raw_data = filtered_chunk.reset_index(drop=True)
    elif not filtered_chunk.empty:
        raw_data = pd.concat([raw_data, filtered_chunk], ignore_index=True)

In [None]:
# Replace each JSON column with one flat column per key, named "<parent>.<key>".
for column in JSON_COLUMNS:
    flattened = json_normalize(raw_data[column])
    prefixed_names = [f"{column}.{subcolumn}" for subcolumn in flattened.columns]
    flattened.columns = prefixed_names
    # Index-on-index merge lines the flattened rows up with their source rows.
    raw_data = (
        raw_data
        .drop(column, axis=1)
        .merge(flattened, left_index=True, right_index=True)
    )

In [None]:
# Persist the flattened frame (without the index) to train16.csv.
raw_data.to_csv(path_or_buf="train16.csv", index=False)

In [None]:
# Optional shortcut: uncomment to reload the saved extract instead of re-running
# the chunked load. fullVisitorId is read as a string, as in the original load.
#raw_data = pd.read_csv('train16.csv', dtype={'fullVisitorId': 'str'})

# The cells that follow operate on train_df, so bind it here; otherwise they
# raise NameError on a fresh kernel. This is the same object as raw_data, not a copy.
train_df = raw_data

In [None]:
# Cast revenue to float so it can be summed and compared numerically.
# NOTE(review): presumably the column holds strings after JSON flattening — confirm.
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
# Total revenue per visitor.
gdf = train_df.groupby("fullVisitorId")["totals.transactionRevenue"].sum().reset_index()

#Plot the distribution

#plt.figure(figsize=(8,6))
#plt.scatter(range(gdf.shape[0]), np.sort(np.log1p(gdf["totals.transactionRevenue"].values)))
#plt.xlabel('index', fontsize=12)
#plt.ylabel('TransactionRevenue', fontsize=12)
#plt.show()

# Count rows with revenue strictly above zero. NaN compares False, so missing
# revenue is excluded, and the count stays correct even if the column has
# already been filled with zeros.
nzi = (train_df["totals.transactionRevenue"] > 0).sum()
nzr = (gdf["totals.transactionRevenue"]>0).sum()
print("Number of instances in train set with non-zero revenue : ", nzi, " and ratio is : ", nzi / train_df.shape[0])
print("Number of unique customers with non-zero revenue : ", nzr, "and the ratio is : ", nzr / gdf.shape[0])

Columns with constant values:

It looks like quite a few features have a constant value in the train set. Let us get the list of these features: columns that hold only a single constant (or only null) value do not help with the prediction.

In [None]:
# Collect every column that holds exactly one distinct value (NaN counts as a value).
const_cols = []
for candidate in train_df.columns:
    distinct_values = train_df[candidate].nunique(dropna=False)
    if distinct_values == 1:
        const_cols.append(candidate)
const_cols

In [None]:
# Print the shape first so the preview is the cell's last expression and is
# actually rendered (a bare head() in the middle of a cell is discarded).
print(train_df.shape)
train_df.head()

In [None]:
# Drop the constant columns plus trafficSource.campaignCode.
cols_to_drop = const_cols
# errors="ignore" keeps the cell re-runnable and tolerates the hard-coded
# column being absent from the filtered data instead of raising KeyError.
train_df = train_df.drop(cols_to_drop + ["trafficSource.campaignCode"], axis=1, errors="ignore")

In [None]:
# Treat missing revenue as zero. Assign the result back explicitly: an in-place
# fillna on a column selection may operate on a temporary copy and leave
# train_df unchanged.
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].fillna(0)

In [None]:
# Print a concise summary of train_df (columns, dtypes, non-null counts).
train_df.info()

There are a few categorical columns that we need to encode as numbers to help with the prediction.

In [None]:
# Categorical columns that get encoded, grouped by their source record.
cat_cols = [
    "channelGrouping",
    # device
    "device.browser",
    "device.deviceCategory",
    "device.operatingSystem",
    # geoNetwork
    "geoNetwork.city",
    "geoNetwork.continent",
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    # trafficSource
    "trafficSource.adContent",
    "trafficSource.adwordsClickInfo.adNetworkType",
    "trafficSource.adwordsClickInfo.gclId",
    "trafficSource.adwordsClickInfo.page",
    "trafficSource.adwordsClickInfo.slot",
    "trafficSource.campaign",
    "trafficSource.keyword",
    "trafficSource.medium",
    "trafficSource.referralPath",
    "trafficSource.source",
    "trafficSource.adwordsClickInfo.isVideoAd",
    "trafficSource.isTrueDirect",
]

In [None]:
### Feature selection and encoding
# Take the target column from train_df before the frame is replaced by its encoded version.
y = train_df['totals.transactionRevenue']

# Encoder from category_encoders, restricted to the columns listed in cat_cols.
# NOTE(review): the accepted values of handle_unknown appear to differ between
# category_encoders releases — confirm 'impute' against the installed version.
ce_ord = ce.OrdinalEncoder(cols = cat_cols, handle_unknown='impute')

# Fit on the full frame and rebind train_df to the encoder's output.
# NOTE(review): train_df still contains the target column at this point — confirm that is intended.
train_df = ce_ord.fit_transform(train_df, y)

In [None]:
# Print the column/dtype/non-null summary again to inspect the encoded frame.
train_df.info()

Some columns do not have the right data type yet; let's fix this:

In [None]:
# Columns that should be stored as floats.
num_cols = [
    "totals.hits",
    "totals.pageviews",
    "visitNumber",
    "visitStartTime",
    'totals.bounces',
    'totals.newVisits',
]
# Cast one column at a time, writing each result back into the frame.
for col in num_cols:
    as_float = train_df[col].astype("float64")
    train_df[col] = as_float

In [None]:
# Write the cleaned, encoded frame (without the index) to disk.
train_df.to_csv(path_or_buf="train-flattened-drop.csv", index=False)

In [None]:
# Read the saved file back in, parsing fullVisitorId as a string column.
raw_data = pd.read_csv(
    'train-flattened-drop.csv',
    dtype={'fullVisitorId': 'str'},
)

In [None]:
# Point train_df at the frame held by raw_data. This binds the same object (no copy),
# so in-place changes made through train_df are visible through raw_data as well.
train_df = raw_data

### TODO
The `customDimensions` and `hits` columns do not seem to add much information, so let's drop them for now; we can come back to them later.

In [None]:
# Remove the customDimensions column from train_df in place.
train_df.drop(columns=['customDimensions'], inplace=True)

In [None]:
# Show the first rows of train_df.
train_df.head()

In [None]:
# Remove the hits column from train_df in place.
train_df.drop(columns=['hits'], inplace=True)

In [None]:
# Print the column/dtype/non-null summary of train_df before it is saved.
train_df.info()

### Save the file

In [None]:
# Write the final frame (without the index) to clean16.csv.
train_df.to_csv(path_or_buf="clean16.csv", index=False)