# Machine Learning with Feature Scaling: Comparison between a Logistic Regression and a Random Forest Classification model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

## The data set we are going to work on regards bike rental data in Toronto:

In [None]:
# Read the bike-sharing CSV into a DataFrame; the later cells add columns to it.
data = pd.read_csv(
    filepath_or_buffer='../input/bike-rental-toronto/bikes_sharing.csv',
)

In [None]:
# Show the freshly loaded DataFrame as the cell output for a first look at its
# columns and values.
data

## Let's label any record whose bike rental count is over 200 as "high count", and also convert the datetime column from string to an actual datetime (extracting the hour as a new feature):

In [None]:
# Target label: True where the rental count is strictly greater than 200.
data["high_count"] = data["count"].gt(200)

# Parse the timestamp strings once, store them back, and derive the hour of day
# from the parsed values.
parsed_datetime = pd.to_datetime(data["datetime"])
data["datetime"] = parsed_datetime
data["hour"] = parsed_datetime.dt.hour

data

## Choosing the independent (x) and dependent (y) variables:

In [None]:
# Feature matrix: the nine predictor columns, in this order.
X = data.loc[
    :,
    [
        'season',
        'holiday',
        'workingday',
        'weather',
        'temp',
        'atemp',
        'humidity',
        'windspeed',
        'hour',
    ],
]
# Target vector: the boolean "high count" label.
y = data.loc[:, 'high_count']

## Performing a train/test split so that the models are evaluated on data they were not trained on (this is how overfitting is detected):

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each Restart & Run All; without it the reported accuracy
# changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

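## Now, since the variables differ widely in their minimum and maximum values, we rescale the features to a common range, which helps the Logistic Regression (the scaler is fitted on the training set only, so no test-set information leaks into it):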

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn each feature's min/max from the training rows only and rescale them,
# then apply that same training-derived scaling to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## The first model we are going to fit on the training data is a Logistic Regression:

In [None]:
# Fit a default-configured logistic regression on the scaled training features.
# fit() returns the estimator itself, so the chained form binds the fitted model.
logmodel = LogisticRegression().fit(X_train_scaled, y_train)
logmodel

In [None]:
# Predicted labels for the held-out rows (scaled the same way as the training data).
predictions = logmodel.predict(X=X_test_scaled)

In [None]:
# Share of test rows the logistic regression labels correctly; kept for the
# final comparison table.
log_acc = accuracy_score(y_true=y_test, y_pred=predictions)
log_acc

In [None]:
# Per-class precision / recall / F1 for the logistic regression.
# classification_report returns a plain-text table, hence print().
print(classification_report(y_true=y_test, y_pred=predictions))

## Now, the second model we are going to fit is a Random Forest Classifier, trained on the unscaled features, whose accuracy we will compare with that of the Logistic Regression:

In [None]:
# Rebuild the feature matrix and target for the random forest: the same nine
# predictor columns and the same "high_count" label as for the first model.
X = data.loc[
    :,
    [
        'season',
        'holiday',
        'workingday',
        'weather',
        'temp',
        'atemp',
        'humidity',
        'windspeed',
        'hour',
    ],
]
y = data.loc[:, 'high_count']

In [None]:
# Deliberately no second train_test_split here. Drawing a new random split
# would score the random forest on different test rows than the logistic
# regression, so the two accuracies would not be comparable. The unscaled
# X_train / X_test / y_train / y_test from the first split are reused instead
# (the scaled copies live in separate *_scaled variables).
# Sanity checks that the reused split still matches the current X and y:
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

In [None]:
# Random forest of 200 trees, built in parallel on all available cores.
# random_state pins the bootstrap samples and feature sub-sampling so the fitted
# forest, its accuracy and its feature importances are reproducible across runs.
rf_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf_clf.fit(X_train, y_train)

In [None]:
# Predicted labels from the random forest for the held-out rows (unscaled features).
pred = rf_clf.predict(X=X_test)

In [None]:
# Share of test rows the random forest labels correctly; kept for the final
# comparison table.
rf_acc = accuracy_score(y_true=y_test, y_pred=pred)
rf_acc

In [None]:
# Per-class precision / recall / F1 for the random forest.
# classification_report returns a plain-text table, hence print().
print(classification_report(y_true=y_test, y_pred=pred))

## Printing a confusion matrix in order to display the correct and incorrect predictions of the Random Forest Classifier model:

In [None]:
# plot_confusion_matrix was removed from scikit-learn in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

with plt.style.context("default"):
    # Create the Axes explicitly and pass it in, so the requested figure size
    # applies to the plot itself instead of leaving an unused empty figure.
    fig, ax = plt.subplots(figsize=(8, 6))
    ConfusionMatrixDisplay.from_estimator(
        rf_clf, X_test, y_test, values_format="d", cmap="Blues", ax=ax
    )
    ax.set_title("Random Forest confusion matrix (test set)")

## A fitted Random Forest Classifier also exposes an attribute called `feature_importances_`, which shows the relative importance of each predictor for the resulting y variable:

In [None]:
# One row per predictor, labelled by the training column names, ordered from
# the most to the least important feature according to the fitted forest.
df_imp = (
    pd.DataFrame(
        data=rf_clf.feature_importances_,
        index=X_train.columns,
        columns=["Feature Importance"],
    )
    .sort_values(by="Feature Importance", ascending=False)
)
df_imp

## We can clearly see that the "hour" feature shows the greatest importance in the classification model

## Displaying the accuracy results of both the Logistic Regression and the Random Forest models:

In [None]:
# Side-by-side table of the two test-set accuracies computed above.
result = pd.DataFrame(
    {'Value': [log_acc, rf_acc]},
    index=['Log Model Accuracy', 'Random Forest Accuracy'],
)
result

## It can be concluded that the Random Forest model showed better accuracy.