In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML
# Display every row of a DataFrame instead of truncating long outputs.
pd.set_option('display.max_rows', None)
# Apply seaborn's "dark" style to all subsequent plots.
sns.set_style("dark")

### This analysis explores bike-sharing data from the "Capital Bikeshare" company for 2011-2012.


### About Capital Bikeshare
Capital Bikeshare is metro DC's bikeshare system, with more than 5,000 bikes available at 600 stations across seven jurisdictions: Washington, DC; Arlington, VA; Alexandria, VA; Montgomery County, MD; Prince George's County, MD; Fairfax County, VA; and the City of Falls Church, VA. Capital Bikeshare provides residents and visitors with a convenient, fun and affordable transportation option for getting from Point A to Point B.
https://ride.capitalbikeshare.com/

The data used in this analysis comes from Kaggle.
https://www.kaggle.com/competitions/bike-sharing-demand/data

### Roadmap 
First, we explore the training data to understand its features and build some intuition; then we use that knowledge for prediction.

## Reading data

In [None]:
# Load the Kaggle train and test splits. The target column 'count' is renamed
# to 'total number' right after loading.
trdf = pd.read_csv('../input/bike-sharing-demand/train.csv').rename(columns={'count': 'total number'})
tedf = pd.read_csv('../input/bike-sharing-demand/test.csv')

In [None]:
# Parse the timestamp column of both splits, then print each frame's schema.
for split in (trdf, tedf):
    split['datetime'] = pd.to_datetime(split['datetime'])
trdf.info()
print('-----------------------------------------------------------')
tedf.info()

In [None]:
trdf.describe()

### Number of rentals in each season

In [None]:
# Total casual / registered / overall rentals per season, summed over all rows.
# Season codes are translated by value (1 -> spring ... 4 -> winter) rather than
# by row position, so a missing or reordered season cannot be mislabelled.
season_names = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
seasondf = trdf[['season','casual','registered','total number']].groupby(['season'],as_index=False).sum()
seasondf['season'] = seasondf['season'].map(season_names)
seasondf

In [None]:
# Bar chart helper used for the per-season and per-month summaries.
def BarChart(data, y: str, x: str, title: str, colorCond: float = 600000):
    """Draw a value-labelled bar chart and highlight the bars above a threshold.

    Parameters
    ----------
    data : DataFrame passed straight to ``sns.barplot``.
    y : name of the column plotted as bar height.
    x : name of the column plotted on the x axis.
    title : title of the axes.
    colorCond : threshold on bar height; bars strictly higher are drawn in
        red, all other bars in grey. Callers pass both integer and fractional
        thresholds, hence the ``float`` annotation.

    The chart is drawn on the current matplotlib axes and nothing is returned.
    """
    ax = sns.barplot(data=data, x=x, y=y)
    # Print the value on top of every bar.
    for container in ax.containers:
        ax.bar_label(container)
    # The values are printed on the bars, so the y axis spine, ticks and
    # tick labels are removed.
    sns.despine(top=True, left=True)
    ax.set(yticklabels=[])
    ax.tick_params(left=False)
    # Grey out every bar except those above the threshold.
    for bar in ax.patches:
        bar.set_color('red' if bar.get_height() > colorCond else 'grey')

    ax.set(title=title)
    

In [None]:
title='Number of total rentals across every seasons in 2011 and 2012 '
BarChart(seasondf,y='total number',x='season',title=title,colorCond=600000)

In [None]:
title='Number of total "non registered" rentals across every seasons in 2011 and 2012 '
BarChart(seasondf,y='casual',x='season',title=title,colorCond=130000)

In [None]:
title='Number of total "registered" rentals across every seasons in 2011 and 2012 '
BarChart(seasondf,y='registered',x='season',title=title,colorCond=470000)

### Number of rentals in each season for casual and registered users.

We show that in winter season rental number is maximum and spring season we had minimum number.

In [None]:
# Grouped bar chart helper: one cluster per x value, one bar per hue level.
def barplotGroups(data,x,y,hue,title,legendtitle=''):
    """Draw a hue-grouped, value-labelled bar chart and return its Axes.

    ``data``, ``x``, ``y`` and ``hue`` are forwarded to ``sns.barplot``.
    ``title`` becomes the axes title and ``legendtitle`` the legend title;
    the legend is placed in the upper-left corner.
    """
    axis = sns.barplot(data=data, x=x, y=y, hue=hue)

    # Write each bar's value above it.
    for bar_group in axis.containers:
        axis.bar_label(bar_group)

    # With the values printed on the bars, hide the y axis spine, tick labels
    # and tick marks.
    sns.despine(top=True, left=True)
    axis.set_yticklabels([])
    axis.tick_params(left=False)

    axis.legend(title=legendtitle, loc='upper left')
    axis.set_title(title)
    return axis

In [None]:
# Long format: one row per (season, rental type). seasondf is built with
# as_index=False and therefore has a default RangeIndex; it is melted directly,
# because calling reset_index() first adds an 'index' column that melt() would
# treat as a fourth rental type (bars with the values 0..3).
df = seasondf.melt(id_vars='season', var_name='count', value_name='Number')
title = 'Number of bike rental for every seaon for 2 years 2011-2012'
plt.figure(figsize=(15,6))
barplotGroups(data=df,x='season',y='Number',hue='count',title=title);

### What about the weather in each season? Which season was mostly rainy or snowy?

In [None]:
# Per (season, weather) summary: mean of the weather measurements and sum of
# the rental counts. Aggregations are given by name ('mean' / 'sum') because
# passing the numpy callables to agg() is deprecated in recent pandas.
# The integer codes are renamed by value on each index level, so the labels do
# not depend on how many levels happen to be present.
seasonWeather = (
    trdf.drop(['datetime','holiday','workingday'], axis=1)
    .groupby(['season','weather'])
    .agg({'temp':'mean','atemp':'mean','humidity':'mean',
          'windspeed':'mean','casual':'sum','registered':'sum',
          'total number':'sum'})
    .rename(index={1:'spring',2:'summer',3:'fall',4:'winter'}, level='season')
    .rename(index={1:'Clear',2:'Little',3:'Light_S_R',4:'Heavy_S_R'}, level='weather')
    .reset_index()
)
print('Calculate mean and sum of features for every season and weathers.')
seasonWeather = seasonWeather.round(1)
seasonWeather

### An interesting point in the bar graph below: spring has the lowest temperature of all seasons in every weather state.

In [None]:
title = 'Average temprature for each season in every weather state for 2011-2012.'
plt.figure(figsize=(15,6))
barplotGroups(data=seasonWeather,x='season',y='temp',hue='weather',title=title,legendtitle='Weather');

Average difference between the measured temperature and the "feels like" temperature.

In [None]:
diferentTmp = trdf['atemp'] - trdf['temp']
print(f'Average different of temp and atemp for all rows: {diferentTmp.mean().round(1)} C.')
print('"feels like" temperature larger than the measured temperature!')


### About humidity and season.

In [None]:
title = 'Average humidity for each season in every weather state for 2011-2012.'
plt.figure(figsize=(15,6))
ax=barplotGroups(data=seasonWeather,x='season',y='humidity',hue='weather',title=title,legendtitle='Weather')
ax.legend(title='Weather',loc='lower right');

### About windspeed and season.

In [None]:
title = 'Average windspeed for each season in every weather state for 2011-2012.'
plt.figure(figsize=(12,6))
ax=barplotGroups(data=seasonWeather,x='season',y='windspeed',hue='weather',title=title,legendtitle='Weather')
ax.legend(title='Weather',loc='lower right');

# -----------------------------------------------------------------------------------------------------------------------------------

### Explore every variable and its characteristics.

#### A brief exploration of the timestamps of the samples.

The training set has hourly rental data spanning two years. It comprises the first 19 days of each month, while the test set covers the 20th to the end of the month.

In [None]:
# Split every timestamp into its calendar parts.
datedf = pd.DataFrame(trdf['datetime']).assign(
    year=trdf['datetime'].dt.year,
    month=trdf['datetime'].dt.month,
    day=trdf['datetime'].dt.day,
    hour=trdf['datetime'].dt.hour,
)

# One row per calendar day; after count() the 'hour' column holds the number
# of hourly records available for that day.
df = datedf.drop(columns='datetime').groupby(['year','month','day'],as_index=False).count()
print('Our train dataset have two year and 12 months with 19 days every month.')
print('I count days that have more than 20 hours record data for every month.')

# Flag each day: 1 when it has at least 20 hourly records, 0 otherwise.
df2 = df.assign(daysMore20H=(df['hour'] >= 20).astype(int))

# Per year and month: how many days exist and how many of them are flagged.
df2.groupby(['year','month']).agg(CountDays=('day','count'),DaysMore20H=('daysMore20H','sum'))

#### Only one day has fewer than 20 hourly records.

In [None]:
df[df['hour']<20]

#### season

In [None]:
season = trdf['season'].value_counts(dropna=False)
print('Season variable as categorical data is a nominal variable.')
print('1 -> spring. 2 -> summer. 3 -> fall. 4 -> winter.')
print(f'Count of all rows is {season.sum()}.')
print('----------------------------------------------------')
print('Seasons:')
season

#### holiday

In [None]:
holiday = trdf['holiday'].value_counts(dropna=False)
print('Holiday variable as categorical data is a nominal variable.')
print('1 -> holiday and 0 -> non holiday')
print(f'Count of all rows is {holiday.sum()}.')
print('----------------------------------------------------')
print('Holiday:')
holiday

#### workingday

Workingday - whether the day is neither a weekend nor holiday

In [None]:
workingday = trdf['workingday'].value_counts(dropna=False)
print('Workingday variable as categorical data is a nominal variable.')
print('1 -> workingday and 0 -> non workingday')
print(f'Count of all rows is {workingday.sum()}.')
print('----------------------------------------------------')
print('Workingday:')
workingday

### weather

In [None]:
# Frequency of each weather code in the training data.
weather = trdf['weather'].value_counts(dropna=False)
print('Weather variable as categorical data is a nominal variable.')
weatherExplain = '''
Weather:
1: Clear, Few clouds, Partly cloudy, Partly cloudy.
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist.
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds.
4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog.
'''
print(weatherExplain)
# The row count is taken from this cell's own counts; the previous version
# read it from the 'workingday' counts of an earlier cell (copy-paste slip).
print(f'Count of all rows is {weather.sum()}.')
print('----------------------------------------------------')
print('Weather:')
print(weather)
print('-----------------------------------------------------------')
print('Weather in every season for every year.')
df = trdf[['datetime','season','weather']].assign(year=trdf['datetime'].dt.year)
# Group by year, season and weather and count the rows of each weather code.
df.groupby(['year','season','weather']).agg(CountData=('datetime','count'))

In [None]:
print('4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog.')
trdf[trdf['weather']==4]

### temp
Temperature in Celsius

In [None]:
trdf['temp'].describe()

In [None]:
# Per-season histogram helper for a continuous variable.
def seasonHist(data,name,someExplain,var):
    """Print a short caption and draw one histogram of ``var`` per season.

    ``data`` must contain a 'season' column with the codes 1-4 and the column
    named by ``var``. ``name`` and ``someExplain`` are only used in the printed
    caption. The four panels share both axes, use 12 bins, show proportions
    and overlay a KDE curve.
    """
    print(f'Histogram of {name} for each season.')
    print(someExplain)
    _, panels = plt.subplots(1, 4, figsize=(20,4), sharex=True, sharey=True)
    season_titles = ((1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter'))
    for panel, (code, label) in zip(panels, season_titles):
        values = data.loc[data['season'] == code, var]
        sns.histplot(values, kde=True, ax=panel, bins=12, stat='proportion')
        panel.set_title(label)

# Plot the average of a variable for every hour of the day, one colour per season.
def seasonScatterHour(data,explain,var,legend=False,textAnnot = True):
    """Scatter the hourly mean of ``var`` for each season and return the Axes.

    Parameters
    ----------
    data : DataFrame with 'season' (codes 1-4), 'hour' and ``var`` columns.
    explain : title of the plot.
    var : name of the column that is averaged per (season, hour).
    legend : forwarded to ``sns.scatterplot``.
    textAnnot : when truthy, write each season's name next to its first
        point, put a tick on every hour 0-23 and annotate each season's
        peak value with an arrow.

    The peak of a season is looked up with a per-season ``idxmax``, so exactly
    one point per season is annotated (the first one on ties) and a value that
    merely equals another season's maximum is never annotated. Seasons that
    are absent from ``data`` are skipped instead of raising.
    """
    season_names = {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'}
    df = data.groupby(['season','hour'],as_index=False).agg(avg =(var,'mean'))
    plt.figure(figsize=(12,6))
    palette=sns.color_palette("tab10",4)
    ax=sns.scatterplot(data=df,x='hour',y='avg',hue='season',palette=palette,s=70,legend=legend )
    ax.set_title(explain)
    if textAnnot:
        # Season name just left of the first point, slightly above it.
        for code, label in season_names.items():
            season_rows = df[df['season'] == code]
            if season_rows.empty:
                continue
            ax.text(-0.5, season_rows['avg'].iloc[0]+1, label, fontsize=12)
        ax.set_xticks(range(0,24))

        # Row holding the maximum average of each season (NaN averages ignored).
        valid = df.dropna(subset=['avg'])
        peak_rows = valid.loc[valid.groupby('season')['avg'].idxmax()].round(1)

        for _, peak in peak_rows.iterrows():
            ax.annotate(peak['avg'], xy=(peak['hour'], peak['avg']), xytext=(peak['hour']-1, peak['avg']-2),
                        arrowprops=dict(arrowstyle="->",color='black',connectionstyle="angle3,angleA=0,angleB=-90"))
    return ax

In [None]:
temp = trdf[['datetime','season','temp']]
print('temp variable as numeric data is a interval variable.')
print(f"Count of all rows is {temp['temp'].count()}.")
print('--------------------------------------------------------------')
#extract year and month.
temp = temp.assign(hour = temp.datetime.dt.hour, month = temp.datetime.dt.month ).drop(columns=['datetime'])
someExplain = 'It shows that Spring on average is colder than other seasons.'
seasonHist(temp,'temprature',someExplain,'temp')

In [None]:
avgTempSeason = temp.groupby('season',as_index=False).agg(avg_temp=('temp','mean')).round(1)
avgTempSeason['season']=['Spring','Summer','Fall','Winter']
avgTempMonth = temp.groupby('month',as_index=False).agg(avg_temp=('temp','mean')).round(1)
plt.figure()
BarChart(data=avgTempSeason,x='season',y='avg_temp',title='Average of Temprature for every season.',colorCond=25)

In [None]:
plt.figure()
BarChart(data=avgTempMonth,x='month',y='avg_temp',title='Average of Temprature for every month.',colorCond=30)

In [None]:
seasonScatterHour(temp,'Average temprature for every hour in each season.','temp')

### atemp
#### "feels like" temperature in Celsius.
I think this variable is similar to the "temp" variable, with a small difference.

In [None]:
trdf['atemp'].describe()

In [None]:
atemp = trdf[['datetime','season','atemp']]
print('atemp variable as numeric data is a interval variable.')
print(f"Count of all rows is {atemp['atemp'].count()}.")
print('--------------------------------------------------------------')
#extract year and month.
atemp = atemp.assign(hour = atemp.datetime.dt.hour, month = atemp.datetime.dt.month ).drop(columns=['datetime'])
someExplain = 'It shows that Spring on average is colder than other seasons.'
seasonHist(atemp,'feel-like temprature',someExplain,'atemp')

In [None]:
avgatempSeason = atemp.groupby('season',as_index=False).agg(avg_temp=('atemp','mean')).round(1)
avgatempSeason['season']=['Spring','Summer','Fall','Winter']
avgatempMonth = atemp.groupby('month',as_index=False).agg(avg_temp=('atemp','mean')).round(1)
plt.figure()
BarChart(data=avgatempSeason,x='season',y='avg_temp',title='Average of "feels like" temprature for every season.',colorCond=25)

In [None]:
plt.figure()
BarChart(data=avgatempMonth,x='month',y='avg_temp',title='Average of "feels like" temprature for every month.',colorCond=30)

In [None]:
seasonScatterHour(atemp,'Average feel-like temprature for every hour in each season.','atemp')

### humidity - relative humidity

The term relative humidity (RH) expresses the relationship between the moisture content of air at a certain temperature and the moisture content of moisture-saturated air at the same temperature.\
It is given as a percentage from 0 to 100.\
0% RH means absolutely dry air, zero moisture content.\
<br>
[link1](https://www.sciencedirect.com/topics/agricultural-and-biological-sciences/relative-humidity#:~:text=The%20term%20relative%20humidity%20(RH,percentage%20from%200%20to%20100.)
<br>
[link2](https://www.lenntech.com/calculators/humidity/relative-humidity.htm)
<br>
[link3](https://en.wikipedia.org/wiki/Humidity#Relative_humidity)


In [None]:
trdf['humidity'].describe()

In [None]:
humidity = trdf[['datetime','season','humidity']]
print('humidity variable as numeric data is a ratio variable.')
print(f"Count of all rows is {humidity['humidity'].count()}.")
print('--------------------------------------------------------------')
#extract year and month.
humidity = humidity.assign(hour = humidity.datetime.dt.hour, month = humidity.datetime.dt.month ).drop(columns=['datetime'])
someExplain = 'It shows that humidity increase relatively from spring to winter.'
seasonHist(humidity,'relative humidity',someExplain,'humidity')

In [None]:
avghumiditySeason = humidity.groupby('season',as_index=False).agg(avg_humidity=('humidity','mean')).round(1)
avghumiditySeason['season']=['Spring','Summer','Fall','Winter']
avghumidityMonth = humidity.groupby('month',as_index=False).agg(avg_humidity=('humidity','mean')).round(1)
plt.figure()
BarChart(data=avghumiditySeason,x='season',y='avg_humidity',title='Average of humidity for every season.',colorCond=65)

In [None]:
plt.figure()
BarChart(data=avghumidityMonth,x='month',y='avg_humidity',title='Average of humidity for every month.',colorCond=68)

In [None]:
# Hourly humidity per season; the numeric legend entries are replaced by season names.
ax = seasonScatterHour(humidity, 'Average humidity - relative humidity for every hour in each season.',
                       'humidity', legend=True, textAnnot=False)
season_handles = ax.get_legend_handles_labels()[0]
ax.legend(season_handles, ['Spring','Summer','Fall','Winter'], title='season');

### windspeed - wind speed

In [None]:
trdf['windspeed'].describe()

In [None]:
windspeed = trdf[['datetime','season','windspeed']]
print('windspeed variable as numeric data is a ratio variable.')
print(f"Count of all rows is {windspeed['windspeed'].count()}.")
print('--------------------------------------------------------------')
#extract year and month.
windspeed = windspeed.assign(hour = windspeed.datetime.dt.hour, month = windspeed.datetime.dt.month ).drop(columns=['datetime'])
someExplain = ''
seasonHist(windspeed,'windspeed',someExplain,'windspeed')

In [None]:
avgWindspeedSeason = windspeed.groupby('season',as_index=False).agg(avg_windspeed=('windspeed','mean')).round(1)
avgWindspeedSeason['season']=['Spring','Summer','Fall','Winter']
avgWindspeedMonth = windspeed.groupby('month',as_index=False).agg(avg_windspeed=('windspeed','mean')).round(1)
plt.figure()
BarChart(data=avgWindspeedSeason,x='season',y='avg_windspeed',title='Average of windspeed for every season.',colorCond=14)

In [None]:
plt.figure()
BarChart(data=avgWindspeedMonth,x='month',y='avg_windspeed',title='Average of windspeed for every month.',colorCond=13.99)

In [None]:
# Hourly windspeed per season; the numeric legend entries are replaced by season names.
ax = seasonScatterHour(windspeed, 'Average windspeed for every hour in each season.',
                       'windspeed', legend=True, textAnnot=False)
season_handles = ax.get_legend_handles_labels()[0]
ax.legend(season_handles, ['Spring','Summer','Fall','Winter'], title='season');

### Exploring the number of rentals in relation to holiday and workingday.

In [None]:
dayModeRents = trdf[['season','holiday','workingday','casual','registered','total number']]


In [None]:
# there isn't any record that both of holiday and workingday become true.
dayModeRents[(dayModeRents['holiday']==1)&(dayModeRents['workingday']==1)]

#### Number of rentals in each season and holiday mode.

In [None]:
# Mean and standard deviation of the rental counts per (season, holiday flag).
# The integer codes are renamed by value on each index level instead of by
# position, so the labels stay correct even if a level is missing.
holidayModes = (
    dayModeRents.groupby(['season','holiday'])
    .agg(CasualMean=('casual','mean'),CasualStd=('casual','std'),
         RegisteredMean=('registered','mean'),RegisteredStd=('registered','std'),
         TotalMean=('total number','mean'),TotalStd=('total number','std'))
    .rename(index={1:'Spring',2:'Summer',3:'Fall',4:'Winter'}, level='season')
    .rename(index={0:'Not holiday',1:'Holiday'}, level='holiday')
    .reset_index()
    .round(0)
)
holidayModes

In [None]:
title = 'The average number of "casual" bicycle rentals in different season and "holiday" modes.'
plt.figure(figsize=(10,5))
barplotGroups(data=holidayModes,x='season',y='CasualMean',hue='holiday',title=title,legendtitle='Mode');

In [None]:
title = 'The average number of "registered" bicycle rentals in different season and "holiday" modes.'
plt.figure(figsize=(10,5))
barplotGroups(data=holidayModes,x='season',y='RegisteredMean',hue='holiday',title=title,legendtitle='Mode');

In [None]:
title = 'The average number of "Total" bicycle rentals in different season and "holiday" modes.'
plt.figure(figsize=(10,5))
barplotGroups(data=holidayModes,x='season',y='TotalMean',hue='holiday',title=title,legendtitle='Mode');

In [None]:
# Customized boxplot helper.
def boxplot(data,x,y,hue=None,title='',legendList=None,legendLoc=None,xticklabelsList=None):
    """Draw a seaborn boxplot with optional custom legend and tick labels.

    Parameters
    ----------
    data, x, y, hue : forwarded to ``sns.boxplot``.
    title : title of the axes.
    legendList : replacement legend labels, applied in the order of the
        existing legend handles. Only used together with ``legendLoc``.
    legendLoc : legend location passed to ``ax.legend``.
    xticklabelsList : replacement x tick labels, applied by position.

    Returns the Axes the plot was drawn on.

    The optional arguments are tested with ``is not None``: comparing with
    ``!= None`` is evaluated element-wise for array-like values (numpy
    arrays, pandas Index) and then fails when used as a condition.
    """
    ax = sns.boxplot(data=data,x=x,y=y,hue=hue)
    # Remove the top and left spines.
    sns.despine(top = True, left = True)
    ax.set(title=title)

    if legendList is not None and legendLoc is not None:
        handles, _ = ax.get_legend_handles_labels()
        ax.legend(handles, legendList, loc=legendLoc )
    if xticklabelsList is not None:
        ax.set_xticklabels(xticklabelsList)

    return ax

In [None]:
plt.figure(figsize=(10,5))
title='Total number of rental distribution  in every season and every holiday mode'
ax = boxplot(data=dayModeRents,x='season',y='total number',hue='holiday',
             title=title,legendList=["Not holiday", "Holiday"],legendLoc='best',
             xticklabelsList=['Spring','Summer','Fall','Winter'])

#### Number of rentals in each season and workingday mode.

In [None]:
# Mean and standard deviation of the rental counts per (season, workingday flag).
# The integer codes are renamed by value on each index level instead of by
# position, so the labels stay correct even if a level is missing.
workingdayModes = (
    dayModeRents.groupby(['season','workingday'])
    .agg(CasualMean=('casual','mean'),CasualStd=('casual','std'),
         RegisteredMean=('registered','mean'),RegisteredStd=('registered','std'),
         TotalMean=('total number','mean'),TotalStd=('total number','std'))
    .rename(index={1:'spring',2:'summer',3:'fall',4:'winter'}, level='season')
    .rename(index={0:'not workingday',1:'workingday'}, level='workingday')
    .reset_index()
    .round(0)
)
workingdayModes

In [None]:
title = 'The average number of "casual" bicycle rentals in different season and "workingday" modes.'
plt.figure(figsize=(10,5))
barplotGroups(data=workingdayModes,x='season',y='CasualMean',hue='workingday',title=title,legendtitle='Mode');

In [None]:
title = 'The average number of "registered" bicycle rentals in different season and "workingday" modes.'
plt.figure(figsize=(10,5))
barplotGroups(data=workingdayModes,x='season',y='RegisteredMean',hue='workingday',title=title,legendtitle='Mode');

In [None]:
title = 'The average number of "total" bicycle rentals in different season and "workingday" modes.'
plt.figure(figsize=(10,5))
barplotGroups(data=workingdayModes,x='season',y='TotalMean',hue='workingday',title=title,legendtitle='Mode');

In [None]:
plt.figure(figsize=(10,5))
title='Total number of rental distribution  in every season and every workingday mode'
ax = boxplot(data=dayModeRents,x='season',y='total number',hue='workingday',
             title=title,legendList=["Not workingday", "working"],legendLoc='best',
             xticklabelsList=['Spring','Summer','Fall','Winter'])

#### Number of rentals in different weather conditions.

In [None]:
print('We have just one row record for Heavy_S_R weather then we don\'t consider it.')
trdf[trdf['weather']==4]

In [None]:
weatherRental = trdf[['weather','casual','registered','total number']]
plt.figure(figsize=(10,5))
title='casual number of rental distribution  in difference weather situation.'
ax = boxplot(data=weatherRental,x='weather',y='casual',
             title=title,xticklabelsList=['Clear','Little','Light_S_R','Heavy_S_R'])

In [None]:
weatherRental = trdf[['weather','casual','registered','total number']]
plt.figure(figsize=(10,5))
title='Registered number of rental distribution in difference weather situation.'
ax = boxplot(data=weatherRental,x='weather',y='registered',
             title=title,xticklabelsList=['Clear','Little','Light_S_R','Heavy_S_R'])

In [None]:
weatherRental = trdf[['weather','casual','registered','total number']]
plt.figure(figsize=(10,5))
title='Total number of rental distribution  in difference weather situation.'
ax = boxplot(data=weatherRental,x='weather',y='total number',
             title=title,xticklabelsList=['Clear','Little','Light_S_R','Heavy_S_R'])

In [None]:
# Mean, standard deviation and sum of the rental counts for every weather code,
# with the codes replaced by short names and all numbers rounded to integers.
weather_labels = {1:'Clear',2:'Little',3:'Light_S_R',4:'Heavy_S_R'}
weatherModes = (
    weatherRental.groupby('weather')
    .agg(
        CasualMean=('casual','mean'), CasualStd=('casual','std'), CasualSum=('casual','sum'),
        RegisteredMean=('registered','mean'), RegisteredStd=('registered','std'), RegisteredSum=('registered','sum'),
        TotalMean=('total number','mean'), TotalStd=('total number','std'), TotalSum=('total number','sum'),
    )
    .rename(index=weather_labels)
    .reset_index()
    .round(0)
)
weatherModes


In [None]:
# Mean rentals per weather state; only the first three rows of weatherModes are plotted.
ax = weatherModes.head(3).plot.bar(
    x='weather', y=['CasualMean','RegisteredMean','TotalMean'],
    figsize=(12,6), rot=0, title='Average number of rental.')
# Write the value on top of every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group)
# The values are on the bars, so the y axis spine, labels and ticks are hidden.
sns.despine(top=True, left=True)
ax.set_yticklabels([])
ax.tick_params(left=False)

In [None]:
# Summed rentals per weather state; only the first three rows of weatherModes are plotted.
ax = weatherModes.head(3).plot.bar(
    x='weather', y=['CasualSum','RegisteredSum','TotalSum'],
    figsize=(12,6), rot=0, title='Sum of number of rental for two years 2011-2012.')
# Write the value on top of every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group)
# The values are on the bars, so the y axis spine, labels and ticks are hidden.
sns.despine(top=True, left=True)
ax.set_yticklabels([])
ax.tick_params(left=False)

### Exploring the number of rentals with respect to temp

In [None]:
tempRental = trdf[['datetime','season','temp','atemp','casual','registered','total number']]

In [None]:
# Week number of every record.
# NOTE(review): isocalendar() yields ISO-8601 week numbers, so the first days of
# January may fall into week 52/53 of the previous ISO year - confirm this is intended.
tempRental = tempRental.assign(week=tempRental['datetime'].dt.isocalendar().week)

# Named aggregations: for every measured column an 'avg_W_*' (mean) and a
# 'std_W_*' (standard deviation) output column, in that order.
weekly_aggs = {
    f'{prefix}_W_{short}': (column, stat)
    for column, short in [('temp','temp'), ('atemp','atemp'), ('casual','casual'),
                          ('registered','registered'), ('total number','total')]
    for prefix, stat in [('avg','mean'), ('std','std')]
}
averagePerWeek = (
    tempRental.groupby(['season','week'], as_index=False)
    .agg(**weekly_aggs)
    .round(0)
)
averagePerWeek.head()

I will use the Spearman rank correlation to measure the relationship between pairs of variables, because the variables\
avg_W_temp, avg_W_casual, avg_W_registered and avg_W_total do not look normally distributed.

In [None]:

# One histogram (10 bins, with KDE) per weekly average, side by side.
# Each plot is drawn on the Axes created by plt.subplots(); the extra
# plt.subplot(1, 4, k) calls of the previous version were redundant, since
# every histogram already targets its panel through ax=.
fig, axes = plt.subplots(1, 4, figsize=(15,4))
weekly_columns = ['avg_W_temp', 'avg_W_casual', 'avg_W_registered', 'avg_W_total']
for hist_ax, column in zip(axes, weekly_columns):
    sns.histplot(averagePerWeek[column], ax=hist_ax, bins=10, kde=True)

In [None]:
plt.figure(figsize=(8,5))
print('correlation:',averagePerWeek['avg_W_temp'].corr(averagePerWeek['avg_W_atemp'],method='spearman').round(3))
ax=sns.scatterplot(data=averagePerWeek,x='avg_W_temp',y='avg_W_atemp')
ax.set(title='Relation between average week "temprature" and avereage week "feel-like temprarue".');

In [None]:
plt.figure(figsize=(8,5))
print('correlation:',averagePerWeek['avg_W_temp'].corr(averagePerWeek['avg_W_casual'],method='spearman').round(2))
ax=sns.scatterplot(data=averagePerWeek,x='avg_W_temp',y='avg_W_casual')
ax.set(title='Relation between average week "temprature" and avereage week "casual" rental.');

In [None]:
plt.figure(figsize=(8,5))
print('correlation:',averagePerWeek['avg_W_temp'].corr(averagePerWeek['avg_W_registered'],method='spearman').round(2))
ax=sns.scatterplot(data=averagePerWeek,x='avg_W_temp',y='avg_W_registered')
ax.set(title='Relation between average week temprature and avereage week "registered" rental.');

In [None]:
plt.figure(figsize=(8,5))
print('correlation:',averagePerWeek['avg_W_temp'].corr(averagePerWeek['avg_W_total'],method='spearman').round(2))
ax=sns.scatterplot(data=averagePerWeek,x='avg_W_temp',y='avg_W_total')
ax.set(title='Relation between average week temprature and avereage week total rental.');

According to the correlations between temperature and rentals in the different modes,\
temperature appears to have more influence on the tendency to rent a bicycle\
in casual mode.

### We can run a correlation test between the two variables "temp" and "total number".
The sample is sufficiently large, so I run the test with a significance level of alpha = 0.05.

In [None]:
from scipy.stats import spearmanr
# Spearman rank correlation between temperature and the total rental count.
rho, p_value = spearmanr(trdf['temp'], trdf['total number'])
print(f'Correlation test for temp and total number variables:\ncorrelation:{round(rho,3)} and p_value:{p_value}')

In [None]:
# Spearman rank correlation between temperature and the casual rental count.
rho, p_value = spearmanr(trdf['temp'], trdf['casual'])
print(f'Correlation test for temp and casual variables:\ncorrelation:{round(rho,3)} and p_value:{p_value}')

In [None]:
# Spearman rank correlation between temperature and the registered rental count.
rho, p_value = spearmanr(trdf['temp'], trdf['registered'])
print(f'Correlation test for temp and registered variables:\ncorrelation:{round(rho,3)} and p_value:{p_value}')

The tests above give statistically significant evidence that the rental counts are related to temp.

Because this analysis is already long and is partly educational, I skip the analysis of the other two variables.

### Heatmap of All continuous variables

In [None]:
# Spearman correlation matrix of the numeric columns, drawn as a heatmap.
numeric_columns = ['temp','atemp','humidity','windspeed','casual','registered','total number']
corr = trdf[numeric_columns].corr(method='spearman')

plt.figure(figsize=(16, 6))
# Hide the diagonal and the upper triangle, which only mirror the lower one.
mask = np.triu(np.ones(corr.shape, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap of numeric variables.', fontdict={'fontsize':12}, pad=12);

### Let's move on to building the prediction model.