#Proyek Analisis Data: Bike Sharing Dataset

Nama: Suryansyah Suciono

Email: sucionosuryansyah@gmail.com

Id Dicoding: suryansyahsuciono

#Menentukan Pertanyaan Bisnis

Pertanyaan 1: Bagaimana performa total transaksi berdasarkan bulan dan musim?

Pertanyaan 2: Jam berapa saja yang termasuk 5 terbaik dan 5 terburuk berdasarkan total transaksi, serta bagaimana tren perbandingan antara casual dan registered berdasarkan jam?

#Menyiapkan semua library yang dibutuhkan

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import json

#Data Wrangling

##Gathering Data

In [None]:
# Load hour.csv into hour_df; the path is kept in a variable so it is easy to spot and change.
hour_csv_path = 'E:/data analytics/python/Bike-sharing-dataset/hour.csv'
hour_df = pd.read_csv(hour_csv_path)

In [None]:
# Load day.csv into day_df; the path is kept in a variable so it is easy to spot and change.
day_csv_path = 'E:/data analytics/python/Bike-sharing-dataset/day.csv'
day_df = pd.read_csv(day_csv_path)

##Assessing Data

###Menilai tabel hour_df

In [None]:
# Preview the first five rows of hour_df.
hour_df.head(n=5)

In [None]:
# Print hour_df's structural summary (columns, non-null counts, dtypes) to spot missing values and wrong types.
hour_df.info()

In [None]:
# Count the rows of hour_df that are exact duplicates of an earlier row.
hour_duplicate_count = hour_df.duplicated().sum()
print("Jumlah duplikasi: ", hour_duplicate_count)

In [None]:
# Descriptive statistics for hour_df (describe() covers numeric columns by default).
hour_df.describe()

###menilai tabel day_df

In [None]:
# Preview the first five rows of day_df.
day_df.head(n=5)

In [None]:
# Print day_df's structural summary (columns, non-null counts, dtypes) to spot missing values and wrong types.
day_df.info()

In [None]:
# Count the rows of day_df that are exact duplicates of an earlier row.
day_duplicate_count = day_df.duplicated().sum()
print("Jumlah duplikasi: ", day_duplicate_count)

In [None]:
# Descriptive statistics for day_df (describe() covers numeric columns by default).
day_df.describe()

##Cleaning Data

In [None]:
# Convert the date column(s) of hour_df to pandas datetimes so they can be
# used for date-based operations later on.
datetime_columns = ["dteday"]

# apply() hands each listed column to pd.to_datetime in turn.
hour_df[datetime_columns] = hour_df[datetime_columns].apply(pd.to_datetime)

In [None]:
# Convert the date column(s) of day_df to pandas datetimes so they can be
# used for date-based operations later on.
datetime_columns = ["dteday"]

# apply() hands each listed column to pd.to_datetime in turn.
day_df[datetime_columns] = day_df[datetime_columns].apply(pd.to_datetime)

#Exploratory Data Analysis (EDA)

##Explore hour_df

In [None]:
# Inspect five randomly selected rows of hour_df (the output differs on every run).
hour_df.sample(5)

In [None]:
# Descriptive statistics for every column of hour_df, not only the numeric ones.
hour_df.describe(include="all")

In [None]:
# Summarise hour_df per (hr, weekday) pair: number of distinct records and the
# summed casual / registered / cnt values, busiest combination first.
hr_weekday_aggregations = {
    "instant": "nunique",
    "casual": "sum",
    "registered": "sum",
    "cnt": "sum",
}
hr_weekday_summary = hour_df.groupby(by=["hr", "weekday"]).agg(hr_weekday_aggregations)
hr_weekday_summary.sort_values(by="cnt", ascending=False)

##Explore day_df

In [None]:
# Inspect five randomly selected rows of day_df (the output differs on every run).
day_df.sample(5)

In [None]:
# Descriptive statistics for every column of day_df, not only the numeric ones.
day_df.describe(include="all")

In [None]:
# Summarise day_df per workingday value: number of distinct records and the
# summed casual / registered / cnt values, largest cnt first.
workingday_aggregations = {
    "instant": "nunique",
    "casual": "sum",
    "registered": "sum",
    "cnt": "sum",
}
workingday_summary = day_df.groupby(by="workingday").agg(workingday_aggregations)
workingday_summary.sort_values(by="cnt", ascending=False)

#Visualization & Explanatory Analysis

##Bagaimana performa total transaksi berdasarkan bulan dan musim?

In [None]:
# Total cnt per season, sorted from largest to smallest, as a two-column
# DataFrame (season, cnt) ready for plotting.
season_totals = day_df.groupby("season")["cnt"].sum()
sum_season_cnt_df = season_totals.sort_values(ascending=False).reset_index()
sum_season_cnt_df.head(5)

In [None]:
# Bar chart of the summed cnt per season, bars ordered from highest to lowest.
plt.figure(figsize=(10, 5))

# Build the palette in this cell: the original referenced `colors_`, which is
# only assigned by the month chart further down, so a fresh top-to-bottom run
# raised NameError here. The first (largest) bar is highlighted and the rest
# are greyed out; the list is sized to the number of bars actually plotted.
season_colors = ["#72BCD4"] + ["#D3D3D3"] * (len(sum_season_cnt_df) - 1)

sns.barplot(
    y="cnt",
    x="season",
    data=sum_season_cnt_df,
    # Explicit order so the bars follow the ranking instead of the season number.
    order=sum_season_cnt_df.sort_values(by="cnt", ascending=False)["season"],
    palette=season_colors
)
plt.title("Count of Total by Season", loc="center", fontsize=15)
# The title already says what is plotted, so the axis labels are dropped.
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis='x', labelsize=12)
plt.show()

In [None]:
# Total cnt per month, sorted from largest to smallest, as a two-column
# DataFrame (mnth, cnt) ready for plotting.
month_totals = day_df.groupby("mnth")["cnt"].sum()
sum_mnth_cnt_df = month_totals.sort_values(ascending=False).reset_index()
sum_mnth_cnt_df.head(12)

In [None]:
# Bar chart of the summed cnt per month, bars ordered from highest to lowest.
plt.figure(figsize=(10, 5))

# Twelve colours: the first (largest) bar is highlighted, the other eleven are grey.
colors_ = ["#72BCD4"] + ["#D3D3D3"] * 11

# Explicit bar order so the chart follows the ranking instead of the month number.
months_by_total = sum_mnth_cnt_df.sort_values(by="cnt", ascending=False)["mnth"]
sns.barplot(
    x="mnth",
    y="cnt",
    data=sum_mnth_cnt_df,
    order=months_by_total,
    palette=colors_,
)
plt.title("Count of Total by Month", loc="center", fontsize=15)
# The title already says what is plotted, so the axis labels are dropped.
plt.ylabel(None)
plt.xlabel(None)
plt.tick_params(axis='y', labelsize=12)
plt.show()

##top 5 performa terbaik dan terburuk total transaksi berdasarkan jam serta tren perbandingan antara casual dan registered berdasarkan jam?

In [None]:
# Total cnt per hour of the day, sorted from largest to smallest, as a
# two-column DataFrame (hr, cnt) ready for plotting.
hour_totals = hour_df.groupby("hr")["cnt"].sum()
sum_hr_cnt_df = hour_totals.sort_values(ascending=False).reset_index()
sum_hr_cnt_df.head(24)

In [None]:
# Two side-by-side bar charts: the five hours with the highest summed cnt
# (left) and the five hours with the lowest summed cnt (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(24, 6))
# Five colours: the first bar of each panel is highlighted, the other four are grey.
colors = ["#72BCD4"] + ["#D3D3D3"] * 4

# Each panel differs only in its sort direction and title.
panel_specs = (
    (ax[0], False, "Top Performing Hour"),
    (ax[1], True, "Worst Performing Hour"),
)
for panel, sort_ascending, panel_title in panel_specs:
    # Restrict and order the bars to the five hours at the chosen end of the ranking.
    ranked_hours = sum_hr_cnt_df.sort_values(by="cnt", ascending=sort_ascending)["hr"].head(5)
    sns.barplot(x="hr", y="cnt", data=sum_hr_cnt_df,
                order=ranked_hours, palette=colors, ax=panel)
    panel.set_ylabel(None)
    panel.set_xlabel(None)
    panel.set_title(panel_title, loc="center", fontsize=18)
    panel.tick_params(axis='y', labelsize=15)

plt.suptitle("Top and Worst Performing Hour by Count of Total", fontsize=20)
plt.show()

In [None]:
# Summed casual and registered values per hour of the day, with hr restored
# as an ordinary column for plotting.
casual_registered_sums = {"casual": "sum", "registered": "sum"}
sum_hr_casreg_df = hour_df.groupby("hr").agg(casual_registered_sums).reset_index()
sum_hr_casreg_df.head(24)

In [None]:
# Line chart comparing the hourly sums of casual and registered.
plt.figure(figsize=(10, 5))
plt.plot(
    sum_hr_casreg_df["hr"],
    sum_hr_casreg_df["casual"],
    marker='o',
    linewidth=2,
    color="#FFA500",
    label="casual"
)
plt.plot(
    sum_hr_casreg_df["hr"],
    sum_hr_casreg_df["registered"],
    marker='o',
    linewidth=2,
    color="#008000",
    label="registered"
)
plt.title("Comparison of Casual and Registered per Hour (2011-2012)", loc="center", fontsize=20)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
# The original drew both series without labels or a legend, so the reader had
# no way to tell which colour was casual and which was registered.
plt.legend(fontsize=10)
plt.show()

#Conclusion

##pertanyaan 1:

- Berdasarkan musim, total transaksi tertinggi terjadi pada musim ke-3 (autumn), sedangkan total transaksi terendah terjadi pada musim ke-1 (spring).
- Berdasarkan bulan, total transaksi tertinggi terjadi pada bulan ke-8, sedangkan total transaksi terendah terjadi pada bulan ke-1.

##pertanyaan 2:

- Lima jam dengan total transaksi tertinggi, secara berurutan, adalah jam 17, 18, 8, 16, dan 19; lima jam dengan total transaksi terendah, secara berurutan, adalah jam 4, 3, 5, 2, dan 1.
- Jumlah registered meningkat tajam pada rentang jam 6-8 dan 15-17, sedangkan jumlah casual meningkat pada rentang jam 7-13.