## Bike Sharing Trends

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stat
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [None]:
# Keep the raw frame untouched in `original_dataset`; all later edits go to the copy `df`.
original_dataset = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
df = original_dataset.copy()

# Exploratory Data Analysis

In [None]:
# Display the first five rows of the freshly loaded frame.
df.head()

#### There are no null values which is a good start

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#### Segregating datetime for better insights and EDA

In [None]:
# Parse the timestamps from the untouched raw copy rather than from `df`:
# this cell removes 'datetime' from `df`, so reading it back from `df` would
# raise KeyError the second time the cell is run. `df` was created as a copy
# of `original_dataset`, so both share the same index and the assignments
# below align row by row.
timestamps = pd.to_datetime(original_dataset['datetime'])

# Vectorised .dt accessors replace the per-row lambda calls.
df['day'] = timestamps.dt.day
df['month'] = timestamps.dt.month
df['year'] = timestamps.dt.year
df['hour'] = timestamps.dt.hour

# errors='ignore' turns the drop into a no-op when the column is already gone.
df = df.drop(columns=['datetime'], errors='ignore')

In [None]:
# Display the first five rows again, now with day/month/year/hour in place of 'datetime'.
df.head()

In [None]:
# One count plot per categorical column, filled row by row on a 3x2 grid.
fig, axes = plt.subplots(nrows=3, ncols=2)
fig.set_size_inches(15, 13)

categorical_columns = ['season', 'holiday', 'workingday', 'year', 'weather', 'month']
for column_name, grid_ax in zip(categorical_columns, axes.flat):
    sns.countplot(x=df[column_name], ax=grid_ax)

We can clearly see that the categories are not equally represented for several attributes, such as:
1. Holiday
2. Working Day
3. Weather

In [None]:
# Mean rental count broken down by year, day of month, hour and weather,
# stacked vertically in that order.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4)
fig.set_size_inches(12, 20)

sns.barplot(data=df, x='year', y='count', ax=ax1)
sns.barplot(data=df, x='day', y='count', ax=ax2)
sns.pointplot(data=df, x='hour', y='count', ax=ax3)
sns.barplot(data=df, x='weather', y='count', ax=ax4)


- Twice the number of people opted for cycles in 2012 compared to the previous year.


- Around 8:00 AM cycles are used as a means of commuting to work, and in the evening (5-6 PM) people use cycles to have fun.

In [None]:
# Top: mean rentals per month. Bottom: hourly profile, split by working day.
fig, (ax1, ax2) = plt.subplots(nrows=2)
fig.set_size_inches(20, 12)

sns.barplot(data=df, x='month', y='count', ax=ax1)
sns.pointplot(data=df, x='hour', y='count', hue='workingday', ax=ax2)

- From January to May it is quite cold in Washington DC, which is why very few people opt for cycles; from June to October the weather is quite pleasant, and that is when people opt for cycles the most.

- On weekends, cycles are mostly used from 11:00 AM to 4:00 PM, which makes sense: very few people would like to get up early in the morning to cycle around on a weekend.

In [None]:

def Barplot(df, feature):
    """Draw a bar plot of the 'count' column grouped by ``feature`` and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both ``feature`` and a 'count' column.
    feature : str
        Name of the column placed on the x axis.
    """
    sns.barplot(data=df, x=feature, y='count')
    plt.show()


# Same plot for each of the three calendar-type features, in this order.
for calendar_feature in ('workingday', 'holiday', 'season'):
    Barplot(df, calendar_feature)


- People use more cycles on weekends than on weekdays. Even though very little data is available for non-working days and holidays, the average counts are still almost equivalent; hence, more people use cycles on weekends and holidays.

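- Very few people use cycles in spring, which is strange, I wonder why :( (Possible explanation to verify: season 1, labelled "spring" in this dataset, appears to cover January–March — check with `df.groupby('season')['month'].unique()`.)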

In [None]:
# Side-by-side distributions of registered (left) and casual (right) rentals.
fig, hist_axes = plt.subplots(1, 2, figsize=(20, 8))
for user_column, hist_ax in zip(('registered', 'casual'), hist_axes):
    sns.histplot(df[user_column], ax=hist_ax)
plt.show()


- We can clearly see there were more registered users, and the data is right-skewed

# Feature Engineering

#### Using Q-Q plots to check which features follow a Gaussian distribution


In [None]:
def plot_data(df, feature):
    """Print ``feature`` and show its histogram next to a normal Q-Q plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    feature : str
        Name of the numeric column in ``df``.
    """
    print(feature)
    values = df[feature]

    _, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(10, 6))
    # Left panel: raw distribution.
    values.hist(ax=hist_ax)
    # Right panel: quantiles against a normal distribution; probplot draws
    # directly onto the Axes it is given.
    stat.probplot(values, dist='norm', plot=qq_ax)
    plt.show()


# Inspect the continuous weather features one after another.
for weather_feature in ('windspeed', 'humidity', 'temp', 'atemp'):
    plot_data(df, weather_feature)

#### Fixing the right-skewed data of registered, windspeed and casual with <br/>**Box-Cox Transformation** <br/>as most ML algorithms perform better with Gaussian-distributed data

In [None]:
# Box-Cox requires strictly positive input, hence the +1 shift (these columns
# contain zeros).
#
# The transform reads from `original_dataset`, not from `df`, so re-running the
# cell does not apply Box-Cox on top of already transformed values. The fitted
# lambda of every column is kept in `boxcox_lambdas`; previously each call
# overwrote `parameters`, so only the last lambda survived and the first two
# transforms could not be inverted.
boxcox_lambdas = {}
for skewed_column in ['casual', 'registered', 'windspeed']:
    df[skewed_column], parameters = stat.boxcox(original_dataset[skewed_column] + 1)
    boxcox_lambdas[skewed_column] = parameters
    plot_data(df, skewed_column)

#### Scaling the data with <br/> **Min-Max Scaler**

In [None]:
# Rescale every column of df to [0, 1]; the scaler returns a bare array, so the
# column names are re-attached afterwards.
min_max = MinMaxScaler()
scaled_values = min_max.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)

df_scaled.head()

# Feature Selection

In [None]:
# Features are every column except the target 'count' (taken from the unscaled df).
y = df['count']
X = df.drop(columns=['count'])

y.head()

def kBest(number, score_func=f_regression):
    """Rank the columns of the global ``X`` by a univariate score against ``y``.

    The target ``count`` is a continuous quantity, so the default score is the
    regression F-statistic (``f_regression``). ``chi2`` measures dependence
    between non-negative features and a *class* label and would treat every
    distinct count value as its own class; it can still be passed explicitly
    via ``score_func`` when that is really wanted.

    Parameters
    ----------
    number : int
        How many of the best-scoring features to return.
    score_func : callable, optional
        Any scikit-learn univariate scoring function accepted by SelectKBest.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' and 'Score', sorted by descending score and
        truncated to ``number`` rows.
    """
    # k='all': only the per-feature scores are read here, nothing is filtered,
    # so the selector should not be limited to its default of 10 features.
    selector = SelectKBest(score_func=score_func, k='all').fit(X, y)

    features_rank = pd.DataFrame({'Features': X.columns, 'Score': selector.scores_})
    return features_rank.sort_values(by='Score', ascending=False).head(number)


kBest(20)


- Among the important features when people rent cycles are how humid it is, what the temperature is, and the hour of the day

In [None]:
def correlationMap():
    """Draw an annotated heatmap of the pairwise feature correlations.

    Uses the global ``df`` with the target column 'count' removed.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes returned by ``sns.heatmap``.
    """
    # Compute the correlation matrix once and plot it directly; selecting the
    # matrix's own index from df and correlating again gives the same matrix.
    corr = df.drop(columns=['count']).corr()
    plt.figure(figsize=(17, 17))
    return sns.heatmap(corr, annot=True)


correlationMap()


In [None]:
def correlation_2(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    For every pair of columns, the one appearing later in ``dataset`` is
    reported when the absolute correlation of the pair is strictly greater
    than ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise correlations are examined.
    threshold : float
        Absolute correlation above which a column is reported.

    Returns
    -------
    set
        Names of the reported columns.
    """
    # Only the magnitude matters, so take absolute values up front.
    strength = dataset.corr().abs()
    names = strength.columns

    # Walk the strict lower triangle: row = later column, col = earlier column.
    return {
        names[row]
        for row in range(len(names))
        for col in range(row)
        if strength.iloc[row, col] > threshold
    }

# Columns whose absolute correlation with an earlier column exceeds 0.8.
correlation_2(df, threshold=0.8)

- As we can see, 'atemp' and 'month' are highly correlated with other features, so we drop those columns

In [None]:
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise KeyError.
df_scaled = df_scaled.drop(columns=['atemp', 'month'], errors='ignore')

# Model Selection

In [None]:
# from sklearn import preprocessing
# from sklearn import utils

# lab = preprocessing.LabelEncoder()
# y_transformed = lab.fit_transform(y)

# y_transformed


In [None]:
# Per the Kaggle data description, 'count' is the total of 'casual' and
# 'registered' rentals. Keeping those two columns as features hands the model
# the answer (target leakage), and they are not available at prediction time,
# so they are removed together with the target itself.
LEAKY_COLUMNS = ['casual', 'registered']

X = df_scaled.drop(columns=['count'] + LEAKY_COLUMNS)
y = df_scaled['count']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=69)

X_train.shape, X_test.shape

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed (same value as the train/test split) so that the forest, and
# therefore every score derived from it, is reproducible across runs.
rfr = RandomForestRegressor(random_state=69)
rfr.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# With no `scoring` argument cross_val_score uses the estimator's own score
# method, which for regressors is R^2 -- so the summary is labelled as R^2
# rather than "accuracy" (a classification metric).
cv_scores = cross_val_score(rfr, X_train, y_train, cv=10)
print(cv_scores)
print()
print('Mean CV R^2 =', cv_scores.mean())

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed (same value as the train/test split) so the fitted tree is
# reproducible across runs.
dtr = DecisionTreeRegressor(random_state=69)
dtr.fit(X_train, y_train)

In [None]:
# The default score of a regressor is R^2, so label it as such rather than
# as "accuracy" (a classification metric).
cv_scores = cross_val_score(dtr, X_train, y_train, cv=10)
print(cv_scores)
print()
print('Mean CV R^2 =', cv_scores.mean())

In [None]:
from sklearn.linear_model import Ridge

# Linear baseline with L2 regularisation, using scikit-learn's default settings.
ridge = Ridge()
ridge.fit(X_train, y_train)

In [None]:
# The default score of a regressor is R^2, so label it as such rather than
# as "accuracy" (a classification metric).
cv_scores = cross_val_score(ridge, X_train, y_train, cv=10)
print(cv_scores)
print()
print('Mean CV R^2 =', cv_scores.mean())

### Decision Tree Regressor is giving the best cross-validation score (R², not accuracy, since these are regressors)
- Evaluating it on the held-out test dataset now

In [None]:
from sklearn.tree import DecisionTreeRegressor

# The final model must be trained on the training split only. Fitting it on
# (X_test, y_test) lets it memorise the very rows it is evaluated on, so the
# test split no longer measures generalisation.
dtr = DecisionTreeRegressor(random_state=69)
dtr.fit(X_train, y_train)


In [None]:
# cross_val_score on (X_test, y_test) re-trains fresh copies of the model on
# folds of the test split, so it never evaluates the model that was selected.
# Instead, make sure the tree is fitted on the training split and score it
# once on the untouched test split. The score of a regressor is R^2.
dtr.fit(X_train, y_train)
test_r2 = dtr.score(X_test, y_test)
print('Test R^2 =', test_r2)

## Please let me know in the comments section, the mistakes I made and how can I improve more. 
## If you learned something from my work, please give it an upvote.
## It would give me the dopamine rush :D