In [1]:
# Import packages
import math
import scipy.stats as stats
import numpy as np
import pandas as pd
import glob

# Set display option for floats in Pandas
pd.set_option('display.float_format', lambda x: '%.2f' % x)

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("ticks")
sns.set(color_codes=True, font_scale=1.25)

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.tile_providers import CARTODBPOSITRON, get_provider
from bokeh.models import ColumnDataSource, HoverTool

# Setup Bokeh to output directly to the notebook
output_notebook(resources=None, verbose=False, hide_banner=True, load_timeout=5000, notebook_type='jupyter')

In [2]:
# # Read in the data and concatentate into a single df
# filenames = glob.glob('../data/' + "*.csv")
# dfs = []
# for filename in filenames:
#     dfs.append(pd.read_csv(filename))
# df = pd.concat(dfs, ignore_index=True)

In [3]:
# Read in the rentals data and review
df = pd.read_csv('../data/201901-citibike-tripdata.csv')
print("Rental data - rows, columns:" + str(df.shape))

Rental data - rows, columns:(967287, 15)


In [4]:
df.isna().sum()

tripduration                0
starttime                   0
stoptime                    0
start station id           18
start station name         18
start station latitude      0
start station longitude     0
end station id             18
end station name           18
end station latitude        0
end station longitude       0
bikeid                      0
usertype                    0
birth year                  0
gender                      0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
print("Rental data - rows, columns:" + str(df.shape))

Rental data - rows, columns:(967269, 15)


In [7]:
len(df['start station id'].unique())

767

In [8]:
df['start station id'] = df['start station id'].astype(int)
df['end station id'] = df['end station id'].astype(int)

In [9]:
len(df['start station id'].unique())

767

In [10]:
# df.astype({'start station id': 'int64', 'end station id': 'int64',
#            'bikeid': 'int16', 'birth year': 'int16', 'gender': 'int8'}).dtypes

In [11]:
# Create a df to store station related info
cols = [['id', 'name', 'lat', 'lon']]
start_stations = df[['start station id', 'start station name', 'start station latitude', 'start station longitude']]
start_stations.columns = cols
end_stations = df[['end station id', 'end station name', 'end station latitude', 'end station longitude']]
end_stations.columns = cols


In [12]:
start_stations.isna().sum()

id      0
name    0
lat     0
lon     0
dtype: int64

In [13]:
start_stations = start_stations.drop_duplicates()
start_stations.shape

(767, 4)

In [14]:
end_stations = end_stations.drop_duplicates()
end_stations.shape

(773, 4)

In [15]:
stations = pd.concat([start_stations, end_stations], ignore_index=True)
stations.dtypes

id        int64
name     object
lat     float64
lon     float64
dtype: object

In [16]:
stations = stations.drop_duplicates()

In [17]:
stations.shape

(773, 4)

In [18]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967269 entries, 0 to 967286
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tripduration             967269 non-null  int64  
 1   starttime                967269 non-null  object 
 2   stoptime                 967269 non-null  object 
 3   start station id         967269 non-null  int64  
 4   start station name       967269 non-null  object 
 5   start station latitude   967269 non-null  float64
 6   start station longitude  967269 non-null  float64
 7   end station id           967269 non-null  int64  
 8   end station name         967269 non-null  object 
 9   end station latitude     967269 non-null  float64
 10  end station longitude    967269 non-null  float64
 11  bikeid                   967269 non-null  int64  
 12  usertype                 967269 non-null  object 
 13  birth year               967269 non-null  int64  
 14  gend

In [19]:
df = df.drop(['start station name', 'start station latitude', 'start station longitude',
              'end station name', 'end station latitude', 'end station longitude'], axis=1)

In [28]:
df.astype({'bikeid': 'int16', 'birth year': 'int16', 'gender': 'int8', 'usertype': 'category'}).dtypes

tripduration                 int64
starttime           datetime64[ns]
stoptime            datetime64[ns]
start station id             int64
end station id               int64
bikeid                       int16
usertype                  category
birth year                   int16
gender                        int8
dtype: object

In [29]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967269 entries, 0 to 967286
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   tripduration      967269 non-null  int64         
 1   starttime         967269 non-null  datetime64[ns]
 2   stoptime          967269 non-null  datetime64[ns]
 3   start station id  967269 non-null  int64         
 4   end station id    967269 non-null  int64         
 5   bikeid            967269 non-null  int64         
 6   usertype          967269 non-null  object        
 7   birth year        967269 non-null  int64         
 8   gender            967269 non-null  int64         
dtypes: datetime64[ns](2), int64(6), object(1)
memory usage: 128.1 MB


In [30]:
# Convert start_date and end_date to datetime format
df.starttime = pd.to_datetime(df.starttime, infer_datetime_format=True)
df.stoptime = pd.to_datetime(df.stoptime, infer_datetime_format=True)

In [31]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
    if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)                    
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

df = reduce_mem_usage(df)

Memory usage of dataframe is 73.80 MB
Memory usage after optimization is: 67.34 MB
Decreased by 8.8%
