# Data cleanig
### Bike rides for the past 12 months


In [1]:
import pandas as pd
import os
import sqlite3
import geopy.distance

### Merge raw data

Merge all the tables into a single dataframe

Each table corresponds to a month of the past year

In [2]:
# ride_data is the folder where all the data is stored
cd = os.getcwd()
directory = "{}\\ride_data".format(cd)

In [3]:
# Creates the DF
df = pd.read_csv(directory+"\\divvy_trip (1).csv")

In [4]:
# Loop through the last 11 months of Ride Data and append to the existing df
for file in os.listdir(directory):
    if not directory+"\\divvy_trip (1).csv" == directory+"\\"+file:
        df = df.append(pd.DataFrame(pd.read_csv(directory+"\\"+file)), ignore_index=True)
print('done')

done


In [5]:
df.dropna(inplace=True)

In [6]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4641395 entries, 0 to 5723481
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 3.0 GB


In [7]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,47EC0A7F82E65D52,classic_bike,2022-03-21 13:45:01,2022-03-21 13:51:18,Wabash Ave & Wacker Pl,TA1307000131,Kingsbury St & Kinzie St,KA1503000043,41.886875,-87.62603,41.889177,-87.638506,member
1,8494861979B0F477,electric_bike,2022-03-16 09:37:16,2022-03-16 09:43:34,Michigan Ave & Oak St,13042,Orleans St & Chestnut St (NEXT Apts),620,41.900998,-87.623752,41.898203,-87.637536,member
2,EFE527AF80B66109,classic_bike,2022-03-23 19:52:02,2022-03-23 19:54:48,Broadway & Berwyn Ave,13109,Broadway & Ridge Ave,15578,41.978353,-87.659753,41.984045,-87.660274,member
3,9F446FD9DEE3F389,classic_bike,2022-03-01 19:12:26,2022-03-01 19:22:14,Wabash Ave & Wacker Pl,TA1307000131,Franklin St & Jackson Blvd,TA1305000025,41.886875,-87.62603,41.877708,-87.635321,member
4,431128AD9AFFEDC0,classic_bike,2022-03-21 18:37:01,2022-03-21 19:19:11,DuSable Lake Shore Dr & North Blvd,LF-005,Loomis St & Jackson Blvd,13206,41.911722,-87.626804,41.877945,-87.662007,member


### Date convertion
* Converting dates into seconds
* New column: weekday >> 0 is Monday, 6 is Sunday
* New column: duration_ns >> end - start in nanoseconds
* Remove ride durations < 1min and > 24hrs

In [8]:
df.loc[ : , "started_at"] = pd.to_datetime(df.loc[ : , "started_at"], yearfirst=True)
df.loc[ : , "ended_at"] = pd.to_datetime(df.loc[ : , "ended_at"], yearfirst=True)
df['weekday'] = df.started_at.dt.dayofweek

In [9]:
df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds()

In [10]:
df = df[df['duration'] >= 60]        # 1 minute
df = df[df['duration'] <= 24*60*60]  # 24 hours

In [11]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4579009 entries, 0 to 5723481
Data columns (total 15 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  weekday             int64         
 14  duration            float64       
dtypes: datetime64[ns](2), float64(5), int64(1), object(7)
memory usage: 2.4 GB


In [12]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,weekday,duration
0,47EC0A7F82E65D52,classic_bike,2022-03-21 13:45:01,2022-03-21 13:51:18,Wabash Ave & Wacker Pl,TA1307000131,Kingsbury St & Kinzie St,KA1503000043,41.886875,-87.62603,41.889177,-87.638506,member,0,377.0
1,8494861979B0F477,electric_bike,2022-03-16 09:37:16,2022-03-16 09:43:34,Michigan Ave & Oak St,13042,Orleans St & Chestnut St (NEXT Apts),620,41.900998,-87.623752,41.898203,-87.637536,member,2,378.0
2,EFE527AF80B66109,classic_bike,2022-03-23 19:52:02,2022-03-23 19:54:48,Broadway & Berwyn Ave,13109,Broadway & Ridge Ave,15578,41.978353,-87.659753,41.984045,-87.660274,member,2,166.0
3,9F446FD9DEE3F389,classic_bike,2022-03-01 19:12:26,2022-03-01 19:22:14,Wabash Ave & Wacker Pl,TA1307000131,Franklin St & Jackson Blvd,TA1305000025,41.886875,-87.62603,41.877708,-87.635321,member,1,588.0
4,431128AD9AFFEDC0,classic_bike,2022-03-21 18:37:01,2022-03-21 19:19:11,DuSable Lake Shore Dr & North Blvd,LF-005,Loomis St & Jackson Blvd,13206,41.911722,-87.626804,41.877945,-87.662007,member,0,2530.0


### Get unique stations

Filter only unique stations


In [13]:
all_stations = pd.DataFrame(columns=['station_name', 'lat', 'lng'])

In [14]:
started_stations = df[['start_station_name', 'start_lat', 'start_lng']]
started_stations.rename(inplace=True, columns={'start_station_name': 'station_name', 
                                               'start_lat': 'lat', 'start_lng': 'lng'} )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [15]:
ended_stations = df[['end_station_name', 'end_lat', 'end_lng']]
ended_stations.rename(inplace=True, columns={'end_station_name': 'station_name', 
                                             'end_lat': 'lat', 'end_lng': 'lng'} )

In [16]:
all_stations = all_stations.append(started_stations)
all_stations = all_stations.append(ended_stations)

In [17]:
all_stations.drop_duplicates(inplace=True, subset='station_name')

In [18]:
all_stations

Unnamed: 0,station_name,lat,lng
0,Wabash Ave & Wacker Pl,41.886875,-87.626030
1,Michigan Ave & Oak St,41.900998,-87.623752
2,Broadway & Berwyn Ave,41.978353,-87.659753
4,DuSable Lake Shore Dr & North Blvd,41.911722,-87.626804
5,Bissell St & Armitage Ave,41.918018,-87.652182
...,...,...,...
2995292,Indiana Ave & 133rd St,41.650000,-87.620000
4177511,351,41.930000,-87.780000
4906316,WEST CHI-WATSON,41.894702,-87.730884
4954339,DIVVY CASSETTE REPAIR MOBILE STATION,41.880958,-87.616743


### Stations' distance
Calculate distance using geografical coordanates

In [19]:
# Calculate the distance between stations

def station_dist(list_coord):
    # list_coord: start lat, start lng, end lat, end lng
    
    # To float
    coord = [float(coord) for coord in list_coord]
    
    # start and end coord must be a tuple of (lat, long)
    dist = geopy.distance.distance((coord[0], coord[1]), (coord[2], coord[3])).m
    return int(round(dist, 0))

In [20]:
df['stations_distance_m'] = pd.read_csv("stations_distance.csv", header=None)

## Dataframe creation steps-by-step
#### We are going to separate this data into mulitple tables

* rides
* ride_duration  
* stations 

In [21]:
df_rides = df[['ride_id', 'rideable_type', 'start_station_name',
               'end_station_name', 'stations_distance_m', 'member_casual']]
df_rides

Unnamed: 0,ride_id,rideable_type,start_station_name,end_station_name,stations_distance_m,member_casual
0,47EC0A7F82E65D52,classic_bike,Wabash Ave & Wacker Pl,Kingsbury St & Kinzie St,0.0,member
1,8494861979B0F477,electric_bike,Michigan Ave & Oak St,Orleans St & Chestnut St (NEXT Apts),1067.0,member
2,EFE527AF80B66109,classic_bike,Broadway & Berwyn Ave,Broadway & Ridge Ave,1185.0,member
3,9F446FD9DEE3F389,classic_bike,Wabash Ave & Wacker Pl,Franklin St & Jackson Blvd,634.0,member
4,431128AD9AFFEDC0,classic_bike,DuSable Lake Shore Dr & North Blvd,Loomis St & Jackson Blvd,1277.0,member
...,...,...,...,...,...,...
5723378,1B88F66E86C094DB,classic_bike,Clark St & Leland Ave,Clark St & Leland Ave,,member
5723400,E73A038DA647AAFF,docked_bike,Michigan Ave & Oak St,Michigan Ave & Oak St,,casual
5723421,4AD181F39CCB99ED,classic_bike,Kingsbury St & Kinzie St,Desplaines St & Kinzie St,,member
5723462,D6AE7BEA1D494E4B,classic_bike,Michigan Ave & Oak St,Michigan Ave & Oak St,,member


In [22]:
df_ride_duration = df[['ride_id', 'member_casual', 'started_at', 'ended_at', 'duration', 'weekday']]
df_ride_duration

Unnamed: 0,ride_id,member_casual,started_at,ended_at,duration,weekday
0,47EC0A7F82E65D52,member,2022-03-21 13:45:01,2022-03-21 13:51:18,377.0,0
1,8494861979B0F477,member,2022-03-16 09:37:16,2022-03-16 09:43:34,378.0,2
2,EFE527AF80B66109,member,2022-03-23 19:52:02,2022-03-23 19:54:48,166.0,2
3,9F446FD9DEE3F389,member,2022-03-01 19:12:26,2022-03-01 19:22:14,588.0,1
4,431128AD9AFFEDC0,member,2022-03-21 18:37:01,2022-03-21 19:19:11,2530.0,0
...,...,...,...,...,...,...
5723378,1B88F66E86C094DB,member,2021-07-18 11:51:19,2021-07-18 12:36:24,2705.0,6
5723400,E73A038DA647AAFF,casual,2021-07-23 11:30:06,2021-07-23 12:37:25,4039.0,4
5723421,4AD181F39CCB99ED,member,2021-07-12 17:42:02,2021-07-12 17:45:44,222.0,0
5723462,D6AE7BEA1D494E4B,member,2021-07-17 17:01:24,2021-07-17 17:38:14,2210.0,5


In [23]:
df_stations = all_stations
df_stations

Unnamed: 0,station_name,lat,lng
0,Wabash Ave & Wacker Pl,41.886875,-87.626030
1,Michigan Ave & Oak St,41.900998,-87.623752
2,Broadway & Berwyn Ave,41.978353,-87.659753
4,DuSable Lake Shore Dr & North Blvd,41.911722,-87.626804
5,Bissell St & Armitage Ave,41.918018,-87.652182
...,...,...,...
2995292,Indiana Ave & 133rd St,41.650000,-87.620000
4177511,351,41.930000,-87.780000
4906316,WEST CHI-WATSON,41.894702,-87.730884
4954339,DIVVY CASSETTE REPAIR MOBILE STATION,41.880958,-87.616743


### Export to a SQL database

In [24]:
con = sqlite3.connect('divy.db')

In [25]:
# Save the data
df_rides.to_sql("Rides", index=False, con=con)
df_ride_duration.to_sql("Rides_duration", index=False, con=con)
df_stations.to_sql("Stations", index=False, con=con)