In [1]:
import pandas as pd
import os
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np

# 1. Data Preparation (cleaning and engineering)

# 1.1. Dublin Bikes

First we will loop through the files in order to concatenate the 12 complementary files into a single dataframe for the whole year of 2023.
Dataset available at: 

Dataset available at: https://data.smartdublin.ie/dataset/dublinbikes-api


Font: https://medium.com/@nawazmohtashim/method-to-merge-csv-files-in-python-8b0f16550e0b

In [5]:
# Listing the files and sorting them by name in the dublin_data folder so they do not have to be typed one by one. Adapted from: https://stevenhough.medium.com/how-to-easily-list-all-files-in-a-folder-using-python-3-ee06004c6316

folder_path = '/home/user/Documents/GitHub/dublin_data'

file_list = sorted([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

print(file_list)

['.DS_Store', '._.DS_Store', '._dublinbike-historical-data-2022-01.csv', '._dublinbike-historical-data-2022-02.csv', '._dublinbike-historical-data-2022-03.csv', '._dublinbike-historical-data-2022-04.csv', '._dublinbike-historical-data-2022-05.csv', '._dublinbike-historical-data-2022-06.csv', '._dublinbike-historical-data-2022-07.csv', '._dublinbike-historical-data-2022-08.csv', '._dublinbike-historical-data-2022-09.csv', '._dublinbike-historical-data-2022-10.csv', '._dublinbike-historical-data-2022-11.csv', '._dublinbike-historical-data-2022-12.csv', 'Dublinbike_historical_data_2022.csv', 'dublinbike-historical-data-2022-01.csv', 'dublinbike-historical-data-2022-02.csv', 'dublinbike-historical-data-2022-03.csv', 'dublinbike-historical-data-2022-04.csv', 'dublinbike-historical-data-2022-05.csv', 'dublinbike-historical-data-2022-06.csv', 'dublinbike-historical-data-2022-07.csv', 'dublinbike-historical-data-2022-08.csv', 'dublinbike-historical-data-2022-09.csv', 'dublinbike-historical-dat

In [6]:
# Creating a list with the files.

months_dfs = ['/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-01.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-02.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-03.csv', 
              '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-04.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-05.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-06.csv',
              '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-07.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-08.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-09.csv',
              '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-10.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-11.csv', '/home/user/Documents/GitHub/dublin_data/dublinbike-historical-data-2022-12.csv']

In [7]:
df_dublin = pd.DataFrame()

# For loop for merging data
for monthsdfs in months_dfs:
    df = pd.read_csv(monthsdfs)
    df_dublin = pd.concat([df_dublin, df], ignore_index=True)

# Saving a new .csv file with the merged data
df_dublin.to_csv('Dublinbike_historical_data_2022.csv', index=False)

In [8]:
df_dublin = pd.read_csv('/home/user/Documents/GitHub/dublin_data/Dublinbike_historical_data_2022.csv')

Inspect the data.

In [10]:
df_dublin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950289 entries, 0 to 1950288
Data columns (total 11 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   STATION ID             int64  
 1   TIME                   object 
 2   LAST UPDATED           object 
 3   NAME                   object 
 4   BIKE_STANDS            int64  
 5   AVAILABLE_BIKE_STANDS  int64  
 6   AVAILABLE_BIKES        int64  
 7   STATUS                 object 
 8   ADDRESS                object 
 9   LATITUDE               float64
 10  LONGITUDE              float64
dtypes: float64(2), int64(4), object(5)
memory usage: 163.7+ MB


In [11]:
df_dublin.head(3)

Unnamed: 0,STATION ID,TIME,LAST UPDATED,NAME,BIKE_STANDS,AVAILABLE_BIKE_STANDS,AVAILABLE_BIKES,STATUS,ADDRESS,LATITUDE,LONGITUDE
0,2,2022-01-01 00:00:04,2021-12-31 23:57:39,BLESSINGTON STREET,20,10,10,OPEN,Blessington Street,53.3568,-6.26814
1,3,2022-01-01 00:00:04,2021-12-31 23:49:57,BOLTON STREET,20,19,1,OPEN,Bolton Street,53.3512,-6.26986
2,4,2022-01-01 00:00:04,2021-12-31 23:58:39,GREEK STREET,20,9,11,OPEN,Greek Street,53.3469,-6.27298


Handling missing data.

In [13]:
df_dublin.isnull().sum()

STATION ID               0
TIME                     0
LAST UPDATED             0
NAME                     0
BIKE_STANDS              0
AVAILABLE_BIKE_STANDS    0
AVAILABLE_BIKES          0
STATUS                   0
ADDRESS                  0
LATITUDE                 0
LONGITUDE                0
dtype: int64

Filtering the dataset by status.

In [15]:
# Checking unique statuses in 'STATUS' since we are only using stations that are open.

statuses_status = df_dublin['STATUS'].unique()
status_numbers = df_dublin['STATUS'].value_counts()

statuses_status, status_numbers

(array(['OPEN', 'CLOSED'], dtype=object),
 STATUS
 OPEN      1948271
 CLOSED       2018
 Name: count, dtype: int64)

In [16]:
# Filtering the dataset to only 'OPEN' statuses.

df_dublin = df_dublin[df_dublin['STATUS'] == 'OPEN']

Datatype transforming.

In [18]:
# Tranforming the column 'TIME' to datetime format.

df_dublin['TIME']= pd.to_datetime(df_dublin['TIME'])

Dropping, renaming and detecting duplicates.

In [20]:
# Dropping redundant / unecessary columns.

drop_column_dublin_1 = ['LAST UPDATED','ADDRESS', 'STATUS']
df_dublin = df_dublin.drop(columns = drop_column_dublin_1)

In [21]:
# Renaming columns for better vizualisation.

df_dublin.rename(columns={'STATION ID': 'station_id','TIME': 'time', 'NAME': 'station', 'BIKE_STANDS': 'n_stands',
                            'AVAILABLE_BIKE_STANDS': 'available_stands','AVAILABLE_BIKES': 'available_bikes',
                         'LATITUDE': 'lat', 'LONGITUDE': 'long' }, inplace=True)

In [22]:
# Detecting duplicates (It is very unlikely there are duplicates due to the origin of the data, but it will be performed just in case).
# If it is 'False', there are no duplicated rows in the dataset.

dupli = df_dublin.duplicated().any()

print(f'Are there any duplicated rows in the whole dataset? {dupli}')

Are there any duplicated rows in the whole dataset? False


Trasforming Dtypes.

In [24]:
df_dublin.head(5)

Unnamed: 0,station_id,time,station,n_stands,available_stands,available_bikes,lat,long
0,2,2022-01-01 00:00:04,BLESSINGTON STREET,20,10,10,53.3568,-6.26814
1,3,2022-01-01 00:00:04,BOLTON STREET,20,19,1,53.3512,-6.26986
2,4,2022-01-01 00:00:04,GREEK STREET,20,9,11,53.3469,-6.27298
3,5,2022-01-01 00:00:04,CHARLEMONT PLACE,40,17,23,53.3307,-6.26018
4,6,2022-01-01 00:00:04,CHRISTCHURCH PLACE,20,13,7,53.3434,-6.27012


Data engineering for specific data and creation of lighter datasets for diferent analysis.

Dataset: Number of trips

In [27]:
# Sorting rows by station_id and time.

dublin_total_trips = df_dublin.sort_values(by=['station_id', 'time'])

In [28]:
# Calculating number of trips per row (ended and started).
# Obs: There is a slight modifycation in the lambda function since 'ended' and 'started'(lambda x: -x in
# comparison to lambda x: x) as the number of the current available bikes / available stands subtracted from the previous number will 
# dictate if it is a starting or ending trip. (AS + / AB - = Trips started; AS - / AB + = Trips ended)

dublin_total_trips['trips_started'] = dublin_total_trips.groupby('station_id')['available_bikes'].diff().fillna(0)
dublin_total_trips['trips_started'] = dublin_total_trips['trips_started'].apply(lambda x: -x if x < 0 else 0)

dublin_total_trips['trips_ended'] = dublin_total_trips.groupby('station_id')['available_bikes'].diff().fillna(0)
dublin_total_trips['trips_ended'] = dublin_total_trips['trips_ended'].apply(lambda x: x if x > 0 else 0)

dublin_total_trips['total_trips_per_station'] = dublin_total_trips['trips_started']+dublin_total_trips['trips_ended']

In [29]:
dublin_total_trips.head()

Unnamed: 0,station_id,time,station,n_stands,available_stands,available_bikes,lat,long,trips_started,trips_ended,total_trips_per_station
614868,1,2022-04-27 13:00:02,CLARENDON ROW,31,27,0,53.3409,-6.2625,0.0,0.0,0.0
614980,1,2022-04-27 13:30:02,CLARENDON ROW,31,27,0,53.3409,-6.2625,0.0,0.0,0.0
615092,1,2022-04-27 14:00:02,CLARENDON ROW,31,27,0,53.3409,-6.2625,0.0,0.0,0.0
615204,1,2022-04-27 14:30:02,CLARENDON ROW,31,0,0,53.3409,-6.2625,0.0,0.0,0.0
615316,1,2022-04-27 15:00:02,CLARENDON ROW,31,0,0,53.3409,-6.2625,0.0,0.0,0.0


In [30]:
# Calculating totals

dublin_total_trips = dublin_total_trips.groupby(['station_id', 'station'])[['trips_started', 'trips_ended', 'total_trips_per_station']].sum().reset_index()

In [31]:
dublin_total_trips['trips_started'] = dublin_total_trips['trips_started'].astype(int)
dublin_total_trips['trips_ended'] = dublin_total_trips['trips_ended'].astype(int)

# Transforming columns datatypes

dublin_total_trips['total_trips_per_station'] = dublin_total_trips['total_trips_per_station'].astype(int)


In [32]:
dublin_total_trips.tail()

Unnamed: 0,station_id,station,trips_started,trips_ended,total_trips_per_station
110,114,WILTON TERRACE (PARK),13717,13711,27428
111,115,KILLARNEY STREET,10800,10780,21580
112,116,BROADSTONE,5015,5008,10023
113,117,HANOVER QUAY EAST,4691,4689,9380
114,507,ORIEL STREET TEST TERMINAL,0,0,0


Dataset: Stations' locations

In [34]:
dublin_station_location = df_dublin[['station_id', 'station', 'lat', 'long']]

In [35]:
dublin_station_location.head(5)

Unnamed: 0,station_id,station,lat,long
0,2,BLESSINGTON STREET,53.3568,-6.26814
1,3,BOLTON STREET,53.3512,-6.26986
2,4,GREEK STREET,53.3469,-6.27298
3,5,CHARLEMONT PLACE,53.3307,-6.26018
4,6,CHRISTCHURCH PLACE,53.3434,-6.27012


# 1.2. Boston Blue Bikes

The same process for concatenating files will be used for the following Boston Bikes datasets.

Font: https://medium.com/@nawazmohtashim/method-to-merge-csv-files-in-python-8b0f16550e0b

In [38]:
# Listing the files and sorting them by name in the boston_data folder so they do not have to be typed one by one. Adapted from: https://stevenhough.medium.com/how-to-easily-list-all-files-in-a-folder-using-python-3-ee06004c6316

folder_path = '/home/user/Documents/GitHub/boston_data'

file_list = sorted([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

print(file_list)

['.DS_Store', '._.DS_Store', '._202201-bluebikes-tripdata.csv', '._202202-bluebikes-tripdata.csv', '._202203-bluebikes-tripdata.csv', '._202204-bluebikes-tripdata.csv', '._202205-bluebikes-tripdata.csv', '._202206-bluebikes-tripdata.csv', '._202207-bluebikes-tripdata.csv', '._202208-bluebikes-tripdata.csv', '._202209-bluebikes-tripdata.csv', '._202210-bluebikes-tripdata.csv', '._202211-bluebikes-tripdata.csv', '._202212-bluebikes-tripdata.csv', '2022-bluebikes-tripdata.csv', '202201-bluebikes-tripdata.csv', '202202-bluebikes-tripdata.csv', '202203-bluebikes-tripdata.csv', '202204-bluebikes-tripdata.csv', '202205-bluebikes-tripdata.csv', '202206-bluebikes-tripdata.csv', '202207-bluebikes-tripdata.csv', '202208-bluebikes-tripdata.csv', '202209-bluebikes-tripdata.csv', '202210-bluebikes-tripdata.csv', '202211-bluebikes-tripdata.csv', '202212-bluebikes-tripdata.csv']


Creating a list with the files.

In [40]:
boston_months_dfs = ['/home/user/Documents/GitHub/boston_data/202201-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202202-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202203-bluebikes-tripdata.csv',
                     '/home/user/Documents/GitHub/boston_data/202204-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202205-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202206-bluebikes-tripdata.csv', 
                     '/home/user/Documents/GitHub/boston_data/202207-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202208-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202209-bluebikes-tripdata.csv', 
                     '/home/user/Documents/GitHub/boston_data/202210-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202211-bluebikes-tripdata.csv', '/home/user/Documents/GitHub/boston_data/202212-bluebikes-tripdata.csv']

In [41]:
df_boston = pd.DataFrame()

# For loop for merging data

for boston_months_dfs in boston_months_dfs:
    df = pd.read_csv(boston_months_dfs)
    df_boston = pd.concat([df_boston, df], ignore_index=True)

# Saving a new .csv file with the merged data

df_boston.to_csv('/home/user/Documents/GitHub/boston_data/2022-bluebikes-tripdata.csv', index=False)

In [42]:
df_boston = pd.read_csv('/home/user/Documents/GitHub/boston_data/2022-bluebikes-tripdata.csv')

Inspect data.

In [44]:
df_boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3757281 entries, 0 to 3757280
Data columns (total 14 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tripduration             int64  
 1   starttime                object 
 2   stoptime                 object 
 3   start station id         int64  
 4   start station name       object 
 5   start station latitude   float64
 6   start station longitude  float64
 7   end station id           int64  
 8   end station name         object 
 9   end station latitude     float64
 10  end station longitude    float64
 11  bikeid                   int64  
 12  usertype                 object 
 13  postal code              object 
dtypes: float64(4), int64(4), object(6)
memory usage: 401.3+ MB


In [45]:
df_boston.tail(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
3757278,958,2022-12-31 23:53:42.9690,2023-01-01 00:09:41.0970,49,Stuart St at Charles St,42.351146,-71.066289,16,Back Bay T Stop - Dartmouth St at Stuart St,42.348074,-71.07657,7898,Customer,2141.0
3757279,995,2022-12-31 23:58:11.1980,2023-01-01 00:14:46.7870,39,Washington St at Rutland St,42.338515,-71.074041,43,Rowes Wharf at Atlantic Ave,42.357143,-71.050699,8330,Subscriber,2116.0
3757280,969,2022-12-31 23:58:30.1460,2023-01-01 00:14:39.4400,39,Washington St at Rutland St,42.338515,-71.074041,43,Rowes Wharf at Atlantic Ave,42.357143,-71.050699,3886,Subscriber,


Handling missing data.

In [47]:
# Since only postal code has null values and it is an irrelevant variable for the analysis, it will be dropped latter.

df_boston.isnull().sum()

tripduration                    0
starttime                       0
stoptime                        0
start station id                0
start station name              0
start station latitude          0
start station longitude         0
end station id                  0
end station name                0
end station latitude            0
end station longitude           0
bikeid                          0
usertype                        0
postal code                465637
dtype: int64

Transforming Dtypes

In [49]:
df_boston['starttime'] = pd.to_datetime(df_boston['starttime'])
df_boston['stoptime'] = pd.to_datetime(df_boston['stoptime'])

Dropping, renaming, and detecting duplicates.

In [51]:
# Dropping redundant / unecessary columns.

drop_column_boston_1 = ['tripduration', 'usertype', 'postal code', 'bikeid'] 
df_boston = df_boston.drop(columns = drop_column_boston_1)

In [52]:
# Renaming columns for better vizualisation.

df_boston.rename(columns={ 'start station id': 'start_station_id', 'start station name': 'start_station', 'start station id': 'start_station_id', 
                          'start station latitude': 'start_station_lat', 'start station longitude': 'start_station_long',
                          
                         'end station id': 'end_station_id', 'end station name': 'end_station', 'end station id': 'end_station_id', 
                          'end station latitude': 'end_station_lat', 'end station longitude': 'end_station_long'}, inplace=True)

In [53]:
df_boston.head(50)

Unnamed: 0,starttime,stoptime,start_station_id,start_station,start_station_lat,start_station_long,end_station_id,end_station,end_station_lat,end_station_long
0,2022-01-01 00:00:25.166,2022-01-01 00:10:22.192,178,MIT Pacific St at Purrington St,42.359573,-71.101295,74,Harvard Square at Mass Ave/ Dunster,42.373268,-71.118579
1,2022-01-01 00:00:40.430,2022-01-01 00:07:32.198,189,Kendall T,42.362428,-71.084955,178,MIT Pacific St at Purrington St,42.359573,-71.101295
2,2022-01-01 00:00:54.818,2022-01-01 00:08:51.668,94,Main St at Austin St,42.375603,-71.064608,356,Charlestown Navy Yard,42.374125,-71.054812
3,2022-01-01 00:01:01.608,2022-01-01 00:08:48.235,94,Main St at Austin St,42.375603,-71.064608,356,Charlestown Navy Yard,42.374125,-71.054812
4,2022-01-01 00:01:06.052,2022-01-01 00:13:38.230,19,Park Dr at Buswell St,42.347241,-71.105301,41,Packard's Corner - Commonwealth Ave at Brighto...,42.352261,-71.123831
5,2022-01-01 00:01:08.500,2022-01-01 00:06:47.531,107,Ames St at Main St,42.3625,-71.08822,68,Central Square at Mass Ave / Essex St,42.36507,-71.1031
6,2022-01-01 00:01:24.749,2022-01-01 00:17:48.185,36,Copley Square - Dartmouth St at Boylston St,42.349928,-71.077392,36,Copley Square - Dartmouth St at Boylston St,42.349928,-71.077392
7,2022-01-01 00:01:49.653,2022-01-01 00:30:17.421,58,Mugar Way at Beacon St,42.355536,-71.072869,58,Mugar Way at Beacon St,42.355536,-71.072869
8,2022-01-01 00:02:30.265,2022-01-01 00:16:27.627,60,Charles Circle - Charles St at Cambridge St,42.360793,-71.07119,363,Harrison Ave at Mullins Way,42.345216,-71.06384
9,2022-01-01 00:02:46.776,2022-01-01 00:11:02.349,178,MIT Pacific St at Purrington St,42.359573,-71.101295,107,Ames St at Main St,42.3625,-71.08822


Data engineering for specific data and creation of lighter datasets for diferent analysis.

Dataset: number of trips

In [56]:
# Calculating the number of trips starting and ending at each station.

trips_started = df_boston.groupby(['start_station_id', 'start_station']).size().reset_index(name='trips_started')
trips_ended = df_boston.groupby(['end_station_id', 'end_station']).size().reset_index(name='trips_ended')

In [57]:
trips_started.tail(5)

Unnamed: 0,start_station_id,start_station,trips_started
456,586,Gramsdorf Playground,146
457,587,Malden High School,108
458,589,North St at Liberty Hill Ave,11
459,590,John Ahern Field at Kennedy-Longfellow School,304
460,591,515 Somerville Ave (Temp. Winter Location),323


In [58]:
trips_ended.tail(5)

Unnamed: 0,end_station_id,end_station,trips_ended
456,586,Gramsdorf Playground,130
457,587,Malden High School,115
458,589,North St at Liberty Hill Ave,7
459,590,John Ahern Field at Kennedy-Longfellow School,282
460,591,515 Somerville Ave (Temp. Winter Location),384


In [59]:
# Merging and getting the total ammount of trips per station.

boston_total_trips = pd.merge(trips_started, trips_ended, left_on = 'start_station_id', right_on = 'end_station_id')

In [60]:
boston_total_trips.head()

Unnamed: 0,start_station_id,start_station,trips_started,end_station_id,end_station,trips_ended
0,1,18 Dorrance Warehouse,37,1,18 Dorrance Warehouse,646
1,3,Colleges of the Fenway - Fenway at Avenue Loui...,9957,3,Colleges of the Fenway - Fenway at Avenue Loui...,10291
2,4,Tremont St at E Berkeley St,14900,4,Tremont St at E Berkeley St,15318
3,5,Northeastern University - North Parking Lot,11861,5,Northeastern University - North Parking Lot,12297
4,6,Cambridge St at Joy St,20750,6,Cambridge St at Joy St,20229


In [61]:
# Cleaning and renaming.

boston_total_trips.rename(columns={'start_station_id': 'station_id', 'start_station': 'station_name'}, inplace=True)
boston_total_trips = boston_total_trips.drop(columns = ['end_station_id', 'end_station'])

In [62]:
# Adding total.

boston_total_trips['total_trips']= boston_total_trips['trips_started']+boston_total_trips['trips_ended']


In [63]:
boston_total_trips.head()

Unnamed: 0,station_id,station_name,trips_started,trips_ended,total_trips
0,1,18 Dorrance Warehouse,37,646,683
1,3,Colleges of the Fenway - Fenway at Avenue Loui...,9957,10291,20248
2,4,Tremont St at E Berkeley St,14900,15318,30218
3,5,Northeastern University - North Parking Lot,11861,12297,24158
4,6,Cambridge St at Joy St,20750,20229,40979


Dataset: Stations' locations

In [65]:
# Separating start and end stations and renaming columns.

start_stations = df_boston[['start_station_id', 'start_station', 'start_station_lat', 'start_station_long']]
start_stations.columns = ['station_id', 'station', 'lat', 'long']

end_stations = df_boston[['end_station_id', 'end_station', 'end_station_lat', 'end_station_long']]
end_stations.columns = ['station_id', 'station', 'lat', 'long']

# Concatenating datasets.

boston_station_location = pd.concat([start_stations, end_stations])

# Dropping duplicates.

boston_station_location = boston_station_location.drop_duplicates().reset_index(drop=True)

In [66]:
boston_station_location = boston_station_location.sort_values(by='station_id').reset_index(drop=True)

In [67]:
boston_station_location.head(5)

Unnamed: 0,station_id,station,lat,long
0,1,18 Dorrance Warehouse,42.387151,-71.075978
1,3,Colleges of the Fenway - Fenway at Avenue Loui...,42.340115,-71.100619
2,4,Tremont St at E Berkeley St,42.345392,-71.069616
3,5,Northeastern University - North Parking Lot,42.341814,-71.090179
4,6,Cambridge St at Joy St,42.361257,-71.065287


# 1.3. Data for sentiment analysis

# Fetching Data.

In [70]:
from dotenv import load_dotenv
load_dotenv()
import praw
import json

Setting parameters

In [72]:
# Getting values.

client_id =os.getenv('CLIENT_ID')  
client_secret =os.getenv('CLIENT_SECRET')   
user_agent =os.getenv('USER_AGENT')

# Verifying variables (this section should be commented out or delete in producting for safety reasons).

print(client_id)
print(client_secret)
print(user_agent)

cSb8r3IMxXZYe3cUf7mrdQ
CFLt2M6lt7bXXKKEiWUfQoVFsSZIeg
BikeSharingScraper by /u/Raphacae


In [73]:
# Initializing instances

reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

# Def for collecting comments

def getting_comments(subreddit_name, query, limit=1000):
    subreddit = reddit.subreddit(subreddit_name)
    comments_data = []

    for submission in subreddit.search(query, limit=limit):
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            comments_data.append({'comment_id':comment.id, 'comment_body': comment.body, 'comment_score':comment.score, 
                                  'comment_author':str(comment.author), 'submission_title': submission.title, 
                                  'submission_id': submission.id, 'submission_id':submission.id, 'submission_score':submission.score,
                                  'submission_author': str(submission.author), 'submission_url':submission.url})


    return comments_data

Fetching, storing, and reading comments.

In [None]:

dublin_comments = getting_comments('all', 'DublinBikes', limit=500)
boston_comments = getting_comments('all', 'BlueBikes Boston', limit=500)


In [None]:
# Retrieval and storage of comments. An 'else' statment was added in order to prevent writing empty data to the .json file.


if dublin_comments:
    with open('dublin_comments.json', 'w') as f:
        json.dump(dublin_comments, f, indent=4)
else:
    print("DublinBike: No comments fetched.")

if boston_comments:
    with open('boston_comments.json', 'w') as f:
        json.dump(boston_comments, f, indent=4)
else:
    print("BlueBikes Boston: No comments fetched.")

In [None]:
with open('dublin_comments.json', 'r') as f:
    dublinbikes_json = json.load(f)
    print("DublinBikes Comments JSON:", dublinbikes_json[:5])  # Print first 5 comments to verify

with open('boston_comments.json', 'r') as f:
    bluebikes_boston_json = json.load(f)
    print("BlueBikes Boston Comments JSON:", bluebikes_boston_json[:5])  # Print first 5 comments to verify

In [None]:
# Reading json files

df_dublin_comments = pd.read_json('dublin_comments.json')
df_boston_comments = pd.read_json('boston_comments.json')

1.3.1. Dublin Reddit Comments.

Inspecting the data.

In [None]:
df_dublin_comments.info()

In [None]:
df_dublin_comments.head(1)

Handling missing data.

In [None]:
df_dublin_comments.isnull().sum()

Detecting duplicates and selecting columns.

In [None]:
# Detecting duplicates (It is very unlikely there are duplicates due to the origin of the data, but it will be performed just in case).
# If it is 'False', there are no duplicated rows in the dataset.

dupli_dublin_comments = df_dublin_comments.duplicated().any()

print(f'Are there any duplicated rows in the whole dataset? {dupli_dublin_comments}')

# 2 - EDA (Exploratory Data Analysis)

# 2.1. General numbers

2.1.1 - Dublin - General numbers concerning the dataset.

In [None]:
# Counting basic numbers.

unique_dublin_stations = df_dublin['station_id'].nunique()
n_bikes_pstation_dublin = df_dublin.groupby('station_id')['n_stands'].max()
total_bikes_dublin = n_bikes_pstation_dublin.sum()

print('Total number of stations: ', unique_dublin_stations)
print('Number of available bikes per station:\n ', n_bikes_pstation_dublin)

print('Total number of bikes available in Dublin: ', total_bikes_dublin)

2.1.2. Spacial Vizualisation

In [None]:

#import plotly.express as px

#dublin_distribution = px.scatter_mapbox(df_dublin, lat='lat', lon='long', color_discrete_sequence=["green"], zoom=12, height=600)

#dublin_distribution.update_layout(mapbox_style="open-street-map")
#dublin_distribution.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

#dublin_distribution.show() 

