In [21]:
import joblib
import pandas as pd

In [22]:
# Location of the persisted model that is used for scoring further down.
file_name = 'Random_forest_model.sav'

# joblib.load unpickles the file, which can execute arbitrary code:
# only load model files that come from a trusted source.
sav_model = joblib.load(filename=file_name)

In [23]:
# Read the test data set into a DataFrame and preview its first three rows.
data2 = pd.read_csv(filepath_or_buffer='test.csv')
data2.head(n=3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered
0,30-06-2012 01:00,3,0,0,3,26.24,28.79,89,15.0013,3,55
1,30-06-2012 02:00,3,0,0,2,26.24,28.79,89,0.0,7,54
2,30-06-2012 03:00,3,0,0,2,26.24,28.79,89,0.0,3,20


## Preparation of the test data

In [24]:
# Count the rows that are exact duplicates of an earlier row.
duplicate_mask = data2.duplicated()
duplicate_mask.sum()

0

In [25]:
# Number of missing values per column (isna is the same check as isnull).
data2.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
dtype: int64

In [26]:
# Print the column names, non-null counts and dtypes of the test frame.
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4399 entries, 0 to 4398
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    4399 non-null   object 
 1   season      4399 non-null   int64  
 2   holiday     4399 non-null   int64  
 3   workingday  4399 non-null   int64  
 4   weather     4399 non-null   int64  
 5   temp        4399 non-null   float64
 6   atemp       4399 non-null   float64
 7   humidity    4399 non-null   int64  
 8   windspeed   4399 non-null   float64
 9   casual      4399 non-null   int64  
 10  registered  4399 non-null   int64  
dtypes: float64(3), int64(7), object(1)
memory usage: 378.2+ KB


In [27]:
# Parse the raw timestamp strings into datetime64 values.
# The file stores them day-first (e.g. '30-06-2012 01:00'). Without an explicit
# format, pandas either warns while guessing it or -- in older versions that
# parse every value on its own -- reads ambiguous dates such as '01-07-2012'
# month-first, which silently corrupts the month/day features below.
# NOTE(review): the training notebook must parse its timestamps the same way,
# otherwise the calendar features seen by the model will not match -- confirm.
data2['datetime'] = pd.to_datetime(data2['datetime'], format='%d-%m-%Y %H:%M')

# Derive the calendar features from the parsed timestamps.
timestamps = data2['datetime'].dt

# Calendar year as an integer.
data2['year'] = timestamps.year

# Month as its English name (e.g. 'June').
data2['month'] = timestamps.month_name()

# Day of the week as its English name (e.g. 'Saturday').
data2['day'] = timestamps.day_name()

# Hour of the day (0-23).
data2['hour'] = timestamps.hour

In [28]:
# Human-readable labels for the integer-coded categorical columns.
category_labels = {
    'season': {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'},
    'holiday': {1: 'Holiday', 0: 'Not Holiday'},
    'workingday': {1: 'Workingday', 0: 'Not Workingday'},
    'weather': {1: 'Clear', 2: 'Mist', 3: 'Rain', 4: 'Snow'},
}

# Assign the relabelled column back to the frame instead of calling
# replace(..., inplace=True) on data2[col]: that form mutates an intermediate
# Series, is deprecated, and leaves data2 unchanged under pandas Copy-on-Write.
for column_name, labels in category_labels.items():
    data2[column_name] = data2[column_name].replace(labels)

In [29]:
# Mapping used to fold the 'Snow' weather label into 'Rain'.
cluster_mapping = dict(Snow="Rain")

# Apply the merged labels to the 'weather' column.
data2['weather'] = data2['weather'].replace(to_replace=cluster_mapping)

In [30]:
# Continuous columns of the test frame.
numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed']
numerical_features = data2[numeric_columns]

# Summary statistics (count, mean, std, min, quartiles, max) for those columns.
numerical_features.describe()

Unnamed: 0,temp,atemp,humidity,windspeed
count,4399.0,4399.0,4399.0,4399.0
mean,22.297513,25.784351,63.797909,11.540867
std,7.775455,8.400126,17.118806,7.68041
min,5.74,7.575,16.0,0.0
25%,14.76,18.18,50.0,7.0015
50%,22.96,26.515,65.0,11.0014
75%,28.7,32.575,78.0,16.9979
max,41.0,45.455,100.0,43.9989


In [31]:
# Replace zero 'windspeed' readings with the nearest non-zero neighbour:
# the previous value first (forward fill), then the next value for any zeros
# at the very start of the column (backward fill).
# Series.replace(..., method=...) is deprecated in recent pandas, so the zeros
# are masked to NaN and filled explicitly instead. This gives the same result
# as long as the column has no missing values, which the null check above shows.
# NOTE(review): 'windspeed' is dropped a few cells later, so this step does not
# influence the prediction -- confirm whether it is still needed.
windspeed_nonzero = data2['windspeed'].mask(data2['windspeed'] == 0)
data2['windspeed'] = windspeed_nonzero.ffill().bfill()

In [32]:
# Keep the timestamp column aside; it is removed from data2 before prediction
# and is needed again when the result table is assembled.
# NOTE(review): this variable shares its name with the standard-library
# `datetime` module -- consider a more specific name if that module is ever imported.
datetime = data2.loc[:, 'datetime']

In [33]:
# Remove the columns that are not passed on to the model (modifies data2 in place).
data2.drop(columns=['datetime', 'atemp', 'windspeed'], inplace=True)

In [34]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [35]:
# One-hot encoding of the multi-valued categorical columns.
one_hot_columns = ['season', 'weather', 'month', 'day']
data2 = pd.get_dummies(data2, columns=one_hot_columns)

# Label encoding of the remaining categorical columns.
# NOTE(review): the encoder is re-fitted on the test data for every column, so
# the integer codes depend on which values occur in this file -- confirm they
# match the codes used when the model was trained.
label_encoder = LabelEncoder()

for column_name in ('holiday', 'workingday', 'year'):
    data2[column_name] = label_encoder.fit_transform(data2[column_name])

In [36]:
# Columns that are standardised before prediction.
columns_to_scale = ['temp', 'humidity', 'hour']

# Standardise the selected columns: learn their mean and standard deviation
# and apply the transformation in a single step.
# NOTE(review): the scaler is fitted on the test data itself; presumably the
# model was trained on values scaled with training-set statistics -- confirm
# whether the training scaler should be persisted and reused here.
scaler = StandardScaler()
data2[columns_to_scale] = scaler.fit_transform(data2[columns_to_scale])

In [37]:
# Predict with the loaded model.
# The one-hot columns above are derived from the test file alone, so their set
# and order depend on which categories occur in it and can differ from what the
# model saw during fitting. When the model exposes the fitted column names
# (`feature_names_in_`, set by scikit-learn when fitted on a DataFrame), align
# the frame to them: absent one-hot columns become 0, extra columns are
# dropped, and the column order follows the model.
expected_columns = getattr(sav_model, 'feature_names_in_', None)

if expected_columns is None:
    # No fitted column names available: pass the frame through unchanged.
    model_input = data2
else:
    one_hot_prefixes = ('season_', 'weather_', 'month_', 'day_')
    # Only one-hot indicator columns may be zero-filled; any other missing
    # feature means the preprocessing does not match the model.
    unexpected_missing = [
        name for name in expected_columns
        if name not in data2.columns and not str(name).startswith(one_hot_prefixes)
    ]
    if unexpected_missing:
        raise ValueError(
            f"Features expected by the model are missing: {unexpected_missing}"
        )
    model_input = data2.reindex(columns=list(expected_columns), fill_value=0)

test_pred = sav_model.predict(model_input)




In [38]:
# Pair every stored timestamp with the model's prediction for that row.
prediction_columns = {'datetime': datetime, 'count': test_pred}
final_data = pd.DataFrame(prediction_columns)




In [39]:
# Write the predictions to disk as CSV text, without the index column.
# NOTE(review): the target name has no '.csv' extension -- confirm whether
# downstream consumers expect exactly 'final_data'.
final_data.to_csv(path_or_buf='final_data', index=False)


