# Set up the CSVs

In [22]:
import os

# Create a list of all CSVs in the directory to iterate over and make
# dataframes from.

dir_name = 'C:/Users/riley/Documents/Coding/DSC/datas/data_to_iterate/'

# List of all .csv filenames to be read into dataframes
csv_list = os.listdir(dir_name)

# Build a list of dataframe names by removing .csv
df_names = []

for csv in csv_list:
    df_name = csv.replace('.csv', '')
    df_names.append(df_name)

# Put them in a dict and concatenate

In [23]:
# Names of DFs and the filenames
df_dict_names = dict(zip(df_names, csv_list))
df_dict_names

{'201801-fordgobike-tripdata': '201801-fordgobike-tripdata.csv',
 '201802-fordgobike-tripdata': '201802-fordgobike-tripdata.csv',
 '201803-fordgobike-tripdata': '201803-fordgobike-tripdata.csv',
 '201804-fordgobike-tripdata': '201804-fordgobike-tripdata.csv',
 '201805-fordgobike-tripdata': '201805-fordgobike-tripdata.csv',
 '201806-fordgobike-tripdata': '201806-fordgobike-tripdata.csv',
 '201807-fordgobike-tripdata': '201807-fordgobike-tripdata.csv',
 '201808-fordgobike-tripdata': '201808-fordgobike-tripdata.csv',
 '201809-fordgobike-tripdata': '201809-fordgobike-tripdata.csv',
 '201810-fordgobike-tripdata': '201810-fordgobike-tripdata.csv',
 '201811-fordgobike-tripdata': '201811-fordgobike-tripdata.csv',
 '201812-fordgobike-tripdata': '201812-fordgobike-tripdata.csv'}

In [25]:
import pandas as pd
import random
p = 0.1

# Create a dictionary of dataframes to then concatenate into one dataframe

dict_of_dataframes = {}
for i in range(len(df_names)):
    dict_of_dataframes[df_names[i]] = pd.read_csv(dir_name+csv_list[i], skiprows=lambda i: i>0 and random.random() > p)

In [26]:
master_df = pd.concat(dict_of_dataframes.values())

# Data cleaning

In [27]:
import numpy as np

# Get min from seconds column
master_df['duration_min'] = np.round(master_df['duration_sec'] / 60, 0)

In [28]:
# Use pd.to_datetime to convert the time strings to datetime objects
# for easier time series analyses

master_df['start_time'] = pd.to_datetime(master_df['start_time'])
master_df['end_time'] = pd.to_datetime(master_df['end_time'])

In [29]:
# Encode the different categorical genders to integers
genders = {'Male': 0, 'Female':1, 'Other':2}

# Use replace to replace each key in genders dict with corresponding value
master_df['member_gender'].replace(genders, inplace=True);

In [30]:
import datetime

# Will now interpolate the member_birth_year with the mean birth year
# Also, I'm gonna get their age by subtracting now() from their birth year
master_df['member_birth_year'].fillna(np.mean(master_df['member_birth_year']), inplace=True)

# That got years as floats so I'm changing it to int:
master_df['member_birth_year'] = master_df['member_birth_year'].apply(np.int64)

# Now let's get the current year and subtract that from birth year to get age.
now = datetime.datetime.now()
master_df['age'] = now.year - master_df['member_birth_year']

In [38]:
# Apply the price calculator to separate dfs, one for each customer type
# this is pretty fast computationally

import math as m


customers = master_df[master_df['user_type'] == 'Customer']

cost_calc = lambda minute: 2 + m.ceil(max((minute - 30), 0) / 15) * 3

customers['cost_to_ride'] = customers['duration_min'].apply(cost_calc)

subscribers = master_df[master_df['user_type'] == 'Subscriber']

cost_calc = lambda minute: m.ceil(max((minute - 45), 0) / 15) * 3

subscribers['cost_to_ride'] = subscribers['duration_min'].apply(cost_calc)

master_df = pd.concat([subscribers, customers])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [39]:
master_df.head()

Unnamed: 0,duration_sec,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,member_birth_year,member_gender,bike_share_for_all_trip,duration_min,age,cost_to_ride
0,686,2018-01-31 23:07:15.313,2018-01-31 23:18:41.558,312.0,San Jose Diridon Station,37.329732,-121.901782,317.0,San Salvador St at 9th St,37.333955,-121.877349,1886,Subscriber,1997,1.0,No,11.0,22,0
2,2219,2018-01-31 22:24:39.943,2018-01-31 23:01:39.571,30.0,San Francisco Caltrain (Townsend St at 4th St),37.776598,-122.395282,81.0,Berry St at 4th St,37.77588,-122.39317,1757,Subscriber,1991,1.0,No,37.0,28,0
3,307,2018-01-31 22:37:37.815,2018-01-31 22:42:44.948,13.0,Commercial St at Montgomery St,37.794231,-122.402923,9.0,Broadway at Battery St,37.798572,-122.400869,3111,Subscriber,1985,0.0,No,5.0,34,0
4,976,2018-01-31 22:25:48.938,2018-01-31 22:42:05.135,106.0,Sanchez St at 17th St,37.763242,-122.430675,80.0,Townsend St at 5th St,37.775306,-122.39738,2014,Subscriber,1966,0.0,No,16.0,53,0
5,520,2018-01-31 22:32:52.933,2018-01-31 22:41:33.147,146.0,30th St at San Jose Ave,37.742314,-122.42318,98.0,Valencia St at 16th St,37.765052,-122.421866,1278,Subscriber,1992,0.0,No,9.0,27,0


In [40]:
cols_to_drop = ['member_birth_year',
               'start_station_name',
               'end_station_name',
               'duration_sec',
               'bike_share_for_all_trip']

master_df.drop(cols_to_drop, axis=1, inplace=True)

### Write to disc

In [41]:
# move out to the data folder
os.chdir('../datas')

master_df.to_csv('C:/Users/riley/Documents/Coding/DSC/datas/last_step_data_cleaning/master_df.csv')