In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory.
# Running this cell (click run or press Shift+Enter) lists the entries directly
# inside that directory; the call is not recursive, so nested files are not shown.

list.files(path = "../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# PREPARING & CLEANING THE DATA 

In [None]:
# Load Packages
library(readr)
library(dplyr)
library(lubridate)
library(skimr)
library(janitor)
library(ggplot2)
library(lubridate)
library(geosphere)

In [None]:
# Load the twelve monthly Divvy trip files for 2022.
# All paths live in one vector so the month-to-file mapping is easy to audit;
# note that the September file is named "publictripdata" rather than "tripdata".
trip_files <- c(
  "/kaggle/input/cyclist-datasets-remote-2022/202201-divvy-tripdata/202201-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202202-divvy-tripdata/202202-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202203-divvy-tripdata/202203-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202204-divvy-tripdata/202204-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202205-divvy-tripdata/202205-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202206-divvy-tripdata/202206-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202207-divvy-tripdata/202207-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202208-divvy-tripdata/202208-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202209-divvy-tripdata/202209-divvy-publictripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202210-divvy-tripdata/202210-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202211-divvy-tripdata/202211-divvy-tripdata.csv",
  "/kaggle/input/cyclist-datasets-remote-2022/202212-divvy-tripdata/202212-divvy-tripdata.csv"
)

# One data frame per month, in calendar order.
divvy_1 <- read.csv(trip_files[[1]])
divvy_2 <- read.csv(trip_files[[2]])
divvy_3 <- read.csv(trip_files[[3]])
divvy_4 <- read.csv(trip_files[[4]])
divvy_5 <- read.csv(trip_files[[5]])
divvy_6 <- read.csv(trip_files[[6]])
divvy_7 <- read.csv(trip_files[[7]])
divvy_8 <- read.csv(trip_files[[8]])
divvy_9 <- read.csv(trip_files[[9]])
divvy_10 <- read.csv(trip_files[[10]])
divvy_11 <- read.csv(trip_files[[11]])
divvy_12 <- read.csv(trip_files[[12]])

In [None]:
# Print the structure of each monthly data frame, January through December,
# so column names and types can be compared before the frames are merged.
invisible(lapply(
  list(
    divvy_1, divvy_2, divvy_3, divvy_4, divvy_5, divvy_6,
    divvy_7, divvy_8, divvy_9, divvy_10, divvy_11, divvy_12
  ),
  str
))


> The 13 variables in each of the 12 datasets have matching formats, so we can proceed to merge the datasets.

In [None]:
# Stack the twelve monthly frames, in calendar order, into one yearly data frame.
bike_rides <- do.call(
  rbind,
  list(
    divvy_1, divvy_2, divvy_3, divvy_4, divvy_5, divvy_6,
    divvy_7, divvy_8, divvy_9, divvy_10, divvy_11, divvy_12
  )
)

In [None]:
# Drop repeated trips, keeping the first row seen for each ride_id.
# The row count is captured before filtering: comparing the filtered frame
# with itself would always report 0 removed rows.
rows_before <- nrow(bike_rides)
bike_rides <- bike_rides[!duplicated(bike_rides$ride_id), ]
print(paste("Removed", rows_before - nrow(bike_rides), "duplicated rows"))

In [None]:
# Drop every row that contains an NA in any column.
# NOTE(review): the files are loaded with read.csv() and its default
# na.strings, so blank text fields (e.g. empty station names) are presumably
# read as "" rather than NA and are not removed here. Confirm against the data.
bike_rides <- na.omit(bike_rides)


In [None]:
# Per-column summary of the merged data, as a sanity check after
# de-duplication and NA removal.
summary(bike_rides)

In [None]:
# Parse the trip timestamps and derive the date/hour helper columns in one step.
# mutate() evaluates its arguments in order: Ymd is taken from the still
# unparsed started_at column, then both timestamps are replaced by their parsed
# date-times, and finally the hour columns are read from the parsed values.
bike_rides <- bike_rides %>%
  mutate(
    Ymd = as.Date(started_at),
    started_at = lubridate::ymd_hms(started_at),
    ended_at = lubridate::ymd_hms(ended_at),
    start_hour = lubridate::hour(started_at),
    end_hour = lubridate::hour(ended_at)
  )



In [None]:
# Ride duration in minutes.
# Subtracting two date-times lets R choose the difftime unit from the data
# (secs, mins, hours or days), so dividing the bare number by 60 is only
# correct when seconds happen to be chosen. Asking for minutes explicitly
# makes the result independent of the values in the data.
bike_rides$ride_length <- as.numeric(
  difftime(bike_rides$ended_at, bike_rides$started_at, units = "mins")
)

In [None]:
# Distribution of ride_length (quartiles, mean, min, max), used to spot zero,
# negative or extreme durations before they are filtered.
summary(bike_rides$ride_length)

In [None]:
# Keep only rides with a strictly positive duration; this removes zero-length
# rides as well as negative ones.
bike_rides <- filter(bike_rides, ride_length > 0)

In [None]:
# Weekday and month names of each ride, for the weekly and monthly usage analysis.
# Ymd is already a Date, so it is formatted directly.
bike_rides$day_of_week <- format(bike_rides$Ymd, "%A")
# format() on a POSIXct honours the column's own time zone. strftime() instead
# converts to the session's local time zone first, which can move rides near a
# month boundary into a different month than their Ymd date.
bike_rides$month <- format(bike_rides$started_at, "%B")

In [None]:
# Preview the first rows to check the newly added day_of_week and month columns.
head(bike_rides)

In [None]:
# Distance between the start and end coordinates of every ride.
# distGeo() takes (longitude, latitude) pairs; its result is divided by 1000
# before being stored.
bike_rides$ride_distance <- with(
  bike_rides,
  distGeo(cbind(start_lng, start_lat), cbind(end_lng, end_lat)) / 1000
)

In [None]:
# Preview the first rows, now including the ride_distance column.
head(bike_rides)

In [None]:
# Check the column types of the cleaned data frame, including the derived columns.
str(bike_rides)

#  ANALYZING THE DATA

In [None]:
# Bar chart of the total number of rides for each rider type.
bike_rides %>%
  ggplot(aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  labs(x = "", title = "CASUAL VS MEMBER")



In [None]:
# Number of rides per rider type (one row per group).
bike_rides %>%
  group_by(member_casual) %>%
  summarise(count = n())

**Members are dominating the Cyclist App**



In [None]:
# Rides per weekday, with member and casual bars side by side.
# The title argument was misspelled as `tle`, so no plot title was shown.
ggplot(bike_rides, aes(day_of_week, fill = member_casual)) +
  geom_bar(position = position_dodge()) +
  labs(x = "", title = "WEEKLY RIDES")

**WEEKLY ANALYSIS**
> The data says that casual riders are likely to take rides on Saturday & Sunday. On the other hand, members are less likely to take rides on Sunday than other weekdays. 


In [None]:

# Rides per month, with member and casual bars side by side.
bike_rides %>%
  ggplot(aes(x = month, fill = member_casual)) +
  geom_bar(position = "dodge") +
  labs(x = "months", title = "MONTHLY RIDES")

In [None]:
# Turn weekday and month names into ordered factors so tables and plots follow
# calendar order (week starting on Sunday) instead of alphabetical order.
weekday_levels <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
bike_rides$day_of_week <- factor(
  bike_rides$day_of_week,
  levels = weekday_levels,
  ordered = TRUE
)

# month.name is base R's built-in vector "January", ..., "December".
bike_rides$month <- factor(
  bike_rides$month,
  levels = month.name,
  ordered = TRUE
)

> Order the days of the week and the months so that the plots follow calendar order.


In [None]:

# Rides per month in calendar order, with member and casual bars side by side.
bike_rides %>%
  ggplot(aes(x = month, fill = member_casual)) +
  geom_bar(position = "dodge") +
  labs(x = "Month", y = "Total Rides", title = "MONTHLY RIDES") +
  # Tilt the month labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

**MONTHLY ANALYSIS**

> Both casual and member riders take most of their rides from May to October. Riders are less active during the rest of the year.

In [None]:
# Rides by starting hour of the day, with member and casual bars side by side.
ggplot(bike_rides, aes(x = start_hour, fill = member_casual)) +
  geom_bar(position = "dodge") +
  labs(x = "Hour of the Day", title = "ACTIVE SESSION OF THE DAY")

**DAILY ANALYSIS**

> Members usually start to become active in the early morning, whereas casual riders tend to be active around the middle of the day. Both groups are most active during the evening.

# Monthly and Weekly ride distribution

In [None]:
# Subset containing only the rides taken by members.
member_rides <- bike_rides %>%
  filter(member_casual == "member")

In [None]:
# Member rides per month, with one bar per bike type.
member_rides %>%
  group_by(rideable_type, month) %>%
  ggplot(aes(x = month, fill = rideable_type)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Number of Member Rides by Month",
    x = "Month",
    y = "Number of Member Rides"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
   

In [None]:
# Member rides per weekday, with one bar per bike type.
member_rides %>%
  group_by(rideable_type, day_of_week) %>%
  ggplot(aes(x = day_of_week, fill = rideable_type)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Number of Member Rides Weekly",
    x = "",
    y = "Number of Member Rides"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
   

> Members ride both classic and electric bikes, but classic bikes are used more.

In [None]:
# Subset containing only the rides taken by casual riders.
casual_rides <- bike_rides %>%
  filter(member_casual == "casual")

In [None]:
# Casual rides per month, with one bar per bike type.
casual_rides %>%
  group_by(rideable_type, month) %>%
  ggplot(aes(x = month, fill = rideable_type)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Number of CASUAL Rides by Month",
    x = "Month",
    y = "Number of Casual Rides"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
   

In [None]:
# Casual rides per weekday, with one bar per bike type.
casual_rides %>%
  group_by(rideable_type, day_of_week) %>%
  ggplot(aes(x = day_of_week, fill = rideable_type)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Number of Casual Rides Weekly",
    x = "",
    y = "Number of Casual Rides"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
   

Casual riders use both electric and classic bikes far more than docked bikes. The usage of electric and classic bikes is about the same, but from June onward there is a sharp rise in electric bike usage.

#  Analysis on Ride Length (Minutes)

In [None]:

# Overall ride-length statistics across all riders.
ride_minutes <- bike_rides$ride_length

mean(ride_minutes)    # arithmetic mean
median(ride_minutes)  # middle value of the sorted ride lengths
max(ride_minutes)     # longest ride
min(ride_minutes)     # shortest ride

# The same figures, plus the quartiles, in a single call.
summary(ride_minutes)

In [None]:
# Compare ride-length statistics between members and casual riders.
# The mean column is named avg_length; it was previously misspelled avg_lenth
# in the displayed table.
bike_rides %>%
  group_by(member_casual) %>%
  summarise(
    avg_length = mean(ride_length),
    median_length = median(ride_length),
    max_length = max(ride_length),
    min_length = min(ride_length)
  )

In [None]:
# Mean ride length for every rider type / weekday combination.
# Passing the frame through `data =` keeps the formula short and labels the
# result columns member_casual, day_of_week and ride_length instead of
# `bike_rides$...`; the computed means are unchanged.
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = bike_rides,
  FUN = mean
)

# Popular Stations 

In [None]:
# Preview the first rows before building routes from the station name columns.
head(bike_rides)

In [None]:
# Add a route label of the form "<start station> To <end station>" to every
# casual ride; paste() joins the three parts with single spaces.
casual_rides <- casual_rides %>%
  mutate(route = paste(start_station_name, "To", end_station_name))

In [None]:
# Rank casual-rider routes by how often they are ridden.
# Rides with a blank start or end station name are excluded first. The data is
# loaded with read.csv()'s default na.strings, so a missing name would arrive
# as "" rather than NA, survive na.omit(), and be counted as a bogus route with
# an empty endpoint. If there are no blank names the filter changes nothing.
popular_routes <- casual_rides %>%
  filter(start_station_name != "", end_station_name != "") %>%
  group_by(route) %>%
  summarise(
    number_of_rides = n(),
    average_duration_minutes = mean(ride_length)
  ) %>%
  arrange(route)

# Every route sorted from most to least ridden; the ten most popular ones are
# the first ten rows of this table.
route_top10 <- arrange(popular_routes, desc(number_of_rides))



In [None]:
# Show the first 10 rows of route_top10, which is sorted by descending
# number_of_rides, i.e. the ten most-ridden casual routes.
head(route_top10, 10)

# FINAL REPORT

1. The average ride length of members is only 12 minutes, whereas casual riders ride 22 minutes on average, which is 83.33% higher than members. Therefore, there is great potential to turn casual riders into members.






2. The best period to run campaigns, promotions and advertisements is from May to October, because most rides are taken by riders during this period.

3. Top 5 places to run marketing campaigns:
* "Streeter Dr & Grand Ave" 
* "DuSable Lake Shore Dr & Monroe St"  
* "Michigan Ave & Oak St"
* "Millennium Park"
* "Montrose Harbor"

4. Casual riders like to ride on weekends (Saturday, Sunday). On the other hand, members are less likely to ride on those two days. Both casual riders and members are mostly active during the evening. Therefore, casual riders who are also active outside of weekends could buy a membership.

5. Casual riders like to ride electric bikes more than other bike types from July to December. From February to June, the usage of electric and classic bikes is almost the same.