In [1]:
# Notebook-wide Google Cloud settings.
# PROJECT_ID and REGION are not referenced by the cells shown below;
# BUCKET_NAME is the Cloud Storage bucket the train/test CSVs are uploaded to.
PROJECT_ID = 'formazione-riccardo-zanella'
REGION = "us-central1"
BUCKET_NAME = 'bbs-2021-opml4b-explainability'

In [2]:
# copy data to local
! gsutil cp 'gs://explanations_sample_data/bike-data.csv' ../../data/tabular_data

Copying gs://explanations_sample_data/bike-data.csv...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").


Operation completed over 1 objects/265.3 MiB.                                    


In [3]:
import pandas as pd

In [4]:
# Value of `dewp` that marks rows to discard
# (NOTE(review): looks like a missing-reading placeholder — confirm against the data source)
MISSING_DEW_POINT = 9999.9
# Trips of two hours or longer are treated as outliers (raw duration is in seconds)
MAX_DURATION_SECONDS = 120 * 60

data = pd.read_csv('../../data/tabular_data/bike-data.csv')

# Shuffle the rows; the fixed seed keeps the order reproducible across runs
data = data.sample(frac=1, random_state=2)

data = (
    # Drop rows with placeholder dew point values / overly long trips
    data[(data['dewp'] != MISSING_DEW_POINT) & (data['duration'] < MAX_DURATION_SECONDS)]
    # Rename some columns for readability
    .rename(columns={'day_of_week': 'weekday', 'max': 'max_temp', 'dewp': 'dew_point'})
    # Drop columns you won't use to train this model
    .drop(columns=['prcp', 'wdsp', 'fog', 'rain_drizzle', 'start_station_name', 'end_station_name', 'bike_id', 'snow_ice_pellets'])
)

# Convert trip duration from seconds to minutes so it's easier to understand.
# Vectorised division yields the same float values as a per-row lambda,
# without one Python call per row.
data['duration'] = data['duration'] / 60

Let's take a look at the first five rows of your data in the pandas DataFrame.

In [5]:
# Display the top five rows of the cleaned frame
data.head(n=5)

Unnamed: 0,start_hr,weekday,euclidean,temp,dew_point,max_temp,duration
2257298,15,1,1213.565222,66.6,60.0,70.2,18.0
1559391,21,4,582.72488,54.7,46.2,56.3,22.0
1881386,16,2,2358.922742,57.1,48.2,64.9,12.0
703461,23,1,766.066253,69.2,51.6,80.4,12.0
831873,21,3,731.937287,53.2,47.4,58.3,4.0


Next, you will separate the data into features ('data') and labels ('labels')

### Split data into train and test sets

You'll split your data using an 80 / 20 train / test split: the first 80% of the (already shuffled) rows become the training set and the remaining 20% become the test set.

In [6]:
# The first 80% of the rows form the training set; the remainder is held out for testing
train_size = int(len(data) * .8)
print(f"Train size: {train_size:d}")
print(f"Test size: {len(data) - train_size:d}")

# Cut the frame by row position at the train_size boundary
train_data = data.iloc[:train_size]

test_data = data.iloc[train_size:]

Train size: 1781701
Test size: 445426


In [7]:
# Show the number of rows assigned to the training split
print(train_size)

1781701


In [8]:
# Write each split to its own UTF-8 CSV file, leaving out the DataFrame index
split_outputs = (
    ('../../data/tabular_data/train.csv', train_data),
    ('../../data/tabular_data/test.csv', test_data),
)
for csv_path, split_frame in split_outputs:
    split_frame.to_csv(csv_path, index=False, encoding='utf-8')

In [9]:
# Upload the train and test CSV files to the data/tabular_data path of the Cloud Storage bucket.
# NOTE(review): $BUCKET_NAME is presumably expanded by IPython from the BUCKET_NAME
# variable assigned in the first cell — confirm that cell has been run beforehand.
! gsutil -m cp ../../data/tabular_data/train.csv ../../data/tabular_data/test.csv "gs://$BUCKET_NAME/data/tabular_data"

Copying file://../../data/tabular_data/train.csv [Content-Type=text/csv]...
Copying file://../../data/tabular_data/test.csv [Content-Type=text/csv]...

Operation completed over 2 objects/89.9 MiB.                                     
