In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

# 1. Loading and Cleaning Data

In [2]:
df = pd.read_csv("dwell_weather_edited.csv", index_col = "Unnamed: 0")

In [3]:
df.head()

Unnamed: 0,Day Type,Day Part,Avg Trip Time (min),Month,Year,Month.1,Day,Time,Check_Flag,Average of temp,Average of feels_like,Average of temp_min,Average of temp_max,Average of pressure,Average of humidity,Sum of rain_1h,Average of wind_speed,Average of wind_gust
0,0: All Days (M-Su),05: 4am (4am-5am),20.9,May,19.0,5.0,0.0,4.0,19504,44.160541,41.684054,41.785135,47.123514,1013.972973,80.459459,8.33,4.962162,8.444483
1,0: All Days (M-Su),10: 9am (9am-10am),27.066667,May,19.0,5.0,0.0,9.0,19509,42.815676,39.944324,40.171351,45.736216,1013.216216,83.162162,6.7,5.323243,8.185625
2,0: All Days (M-Su),11: 10am (10am-11am),14.066667,May,19.0,5.0,0.0,10.0,195010,42.677429,39.514571,39.947143,45.499429,1014.457143,83.885714,6.07,5.592286,10.117692
3,0: All Days (M-Su),12: 11am (11am-12noon),20.9,May,19.0,5.0,0.0,11.0,195011,44.34697,41.048182,42.161515,47.028182,1014.69697,79.393939,6.22,6.637273,10.064839
4,0: All Days (M-Su),13: 12pm (12noon-1pm),9.966667,May,19.0,5.0,0.0,12.0,195012,46.34875,43.6625,44.13,49.299063,1014.9375,76.59375,4.51,6.826875,10.668


### 1.1 Data Cleaning

In [4]:
df = df.drop(["Month", "Check_Flag"], axis=1)

In [5]:
df = df.drop(np.where(df["Day Type"] == "0: All Days (M-Su)")[0])

In [6]:
df = df.reset_index(drop=True)

In [None]:
# sorted(df["Day Part"].unique())

In [None]:
# morning = ['05: 4am (4am-5am)','06: 5am (5am-6am)','07: 6am (6am-7am)',
#               '08: 7am (7am-8am)', '09: 8am (8am-9am)', '11: 10am (10am-11am)',]
# noon = ['12: 11am (11am-12noon)', '12: 11am (11am-12noon)','13: 12pm (12noon-1pm)',
#         '14: 1pm (1pm-2pm)','15: 2pm (2pm-3pm)','16: 3pm (3pm-4pm)']
# evening = ['17: 4pm (4pm-5pm)','18: 5pm (5pm-6pm)','19: 6pm (6pm-7pm)','20: 7pm (7pm-8pm)','21: 8pm (8pm-9pm)']

In [7]:
df.columns

Index(['Day Type', 'Day Part', 'Avg Trip Time (min)', 'Year', 'Month.1', 'Day',
       'Time', 'Average of temp', 'Average of feels_like',
       'Average of temp_min', 'Average of temp_max', 'Average of pressure',
       'Average of humidity', 'Sum of rain_1h', 'Average of wind_speed',
       'Average of wind_gust'],
      dtype='object')

In [8]:
new_df = pd.DataFrame()

In [9]:
for year in df["Year"].unique():
    for month in df["Month.1"].unique():
        for day in df["Day"].unique():
            temp_df = df.loc[np.where((df["Year"] == year) & (df["Month.1"] == month) & (df["Day"] == day))[0], :]
            new_row = [day, month, year, temp_df["Avg Trip Time (min)"].mean()] +list(temp_df.loc[:, "Average of temp":].mean(skipna=True))
            new_df = new_df.append(pd.Series(new_row), ignore_index=True)
            
#             morning_index = [i for i in temp_df.index if temp_df.loc[i, "Day Part"] in morning]
#             new_row = [1,0, day, month, year, temp_df.loc[morning_index, "Avg Trip Time (min)"].mean(skipna=True)] +list(temp_df.loc[morning_index, "Average of temp":].mean(skipna=True))
#             new_df = new_df.append(pd.Series(new_row), ignore_index=True)
            
            
#             noon_index = [i for i in temp_df.index if temp_df.loc[i, "Day Part"] in noon]
#             new_row = [0,1, day, month, year, temp_df.loc[noon_index, "Avg Trip Time (min)"].mean(skipna=True)] +list(temp_df.loc[noon_index, "Average of temp":].mean(skipna=True))
#             new_df = new_df.append(pd.Series(new_row), ignore_index=True)

            
            
#             evening_index = [i for i in temp_df.index if temp_df.loc[i, "Day Part"] in evening]
#             new_row = [0,0, day, month, year, temp_df.loc[evening_index, "Avg Trip Time (min)"].mean(skipna=True)] +list(temp_df.loc[evening_index, "Average of temp":].mean(skipna=True))
#             new_df = new_df.append(pd.Series(new_row), ignore_index=True)

            

In [10]:
new_df.columns = ["Day", "Month", "Year", 'Avg Trip Time (min)','Average of temp',
       'Average of feels_like', 'Average of temp_min', 'Average of temp_max',
       'Average of pressure', 'Average of humidity', 'Sum of rain_1h',
       'Average of wind_speed', 'Average of wind_gust']

In [None]:
# new_df.columns = ["Morning", "Afternoon", "Day", "Month", "Year", 'Avg Trip Time (min)','Average of temp',
#        'Average of feels_like', 'Average of temp_min', 'Average of temp_max',
#        'Average of pressure', 'Average of humidity', 'Sum of rain_1h',
#        'Average of wind_speed', 'Average of wind_gust']

In [11]:
df = new_df

In [12]:
df["is_summer"] = 0

In [13]:
for i in range(0, len(df)):
    if df.loc[i, "Month"] in [6,7,8]:
        df.loc[i, "is_summer"] =1

In [14]:
df.head()

Unnamed: 0,Day,Month,Year,Avg Trip Time (min),Average of temp,Average of feels_like,Average of temp_min,Average of temp_max,Average of pressure,Average of humidity,Sum of rain_1h,Average of wind_speed,Average of wind_gust,is_summer
0,1.0,5.0,19.0,17.221429,48.632929,46.856786,45.639643,52.718,1013.257143,78.792857,1.22,4.711571,7.659071,0
1,2.0,5.0,19.0,14.622222,47.245833,43.716167,43.66225,51.279583,1011.608333,68.875,0.86,9.016167,14.021528,0
2,3.0,5.0,19.0,19.761905,44.762667,40.465714,42.134429,47.884905,1016.457143,66.195238,0.364286,8.097714,13.471071,0
3,4.0,5.0,19.0,23.363636,46.319,43.705333,42.993364,50.035,1018.5,69.436364,0.207273,5.188273,8.200545,0
4,6.0,5.0,19.0,18.003333,49.428533,47.492883,46.456483,52.7014,1013.138333,79.206667,0.417,7.170933,11.451233,0


In [15]:
df["Tuesday"] = 0
df["Wednesday"] = 0
df["Thursday"] = 0 
df["Friday"] = 0 
df["Saturday"] = 0 
df["Sunday"] = 0

In [16]:
for i in range(0, len(df)):
    if df.loc[i, "Day"] == 1:
        df.loc[i, "Sunday"] = 1
    elif df.loc[i, "Day"] == 3:
        df.loc[i, "Tuesday"] = 1
    elif df.loc[i, "Day"] == 4:
        df.loc[i, "Wednesday"] = 1
    elif df.loc[i, "Day"] == 5:
        df.loc[i, "Thursday"] = 1
    elif df.loc[i, "Day"] == 6:
        df.loc[i, "Friday"] = 1
    elif df.loc[i, "Day"] == 7:
        df.loc[i, "Saturday"] = 1

In [17]:
df.columns

Index(['Day', 'Month', 'Year', 'Avg Trip Time (min)', 'Average of temp',
       'Average of feels_like', 'Average of temp_min', 'Average of temp_max',
       'Average of pressure', 'Average of humidity', 'Sum of rain_1h',
       'Average of wind_speed', 'Average of wind_gust', 'is_summer', 'Tuesday',
       'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object')

In [18]:
df = df.drop(["Day"], axis=1)

In [19]:
#Find and replace NaN values:
is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

In [20]:
rows_with_NaN #fuk

Unnamed: 0,Month,Year,Avg Trip Time (min),Average of temp,Average of feels_like,Average of temp_min,Average of temp_max,Average of pressure,Average of humidity,Sum of rain_1h,Average of wind_speed,Average of wind_gust,is_summer,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
42,5.0,20.0,,,,,,,,,,,0,0,0,0,0,0,1
43,5.0,20.0,,,,,,,,,,,0,0,0,0,0,0,0
45,5.0,20.0,,,,,,,,,,,0,0,1,0,0,0,0
46,5.0,20.0,,,,,,,,,,,0,0,0,0,1,0,0
47,5.0,20.0,,,,,,,,,,,0,0,0,0,0,1,0


In [21]:
df = df.drop(df[row_has_NaN].index).reset_index(drop=True)

In [22]:
to_drop = [i for i in range(0, len(df)) if (df.loc[i, "Avg Trip Time (min)"] > 60 or df.loc[i, "Avg Trip Time (min)" ] < 5)]

In [23]:
df = df.drop(to_drop, axis=0).reset_index(drop=True)

### 1.3 Data Splitting

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
columns_for_modeling = [i for i in df.columns if i != "Avg Trip Time (min)"]

In [26]:
#Main split for training and splitting:
x_train, x_test, y_train, y_test = train_test_split(df[columns_for_modeling], df["Avg Trip Time (min)"], test_size=0.33)

In [27]:
#Second split for parameter tuning:
x_train_1, x_test_val, y_train_1, y_test_val = train_test_split(x_train, y_train, test_size=0.33)

In [28]:
x_train_1.columns

Index(['Month', 'Year', 'Average of temp', 'Average of feels_like',
       'Average of temp_min', 'Average of temp_max', 'Average of pressure',
       'Average of humidity', 'Sum of rain_1h', 'Average of wind_speed',
       'Average of wind_gust', 'is_summer', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday'],
      dtype='object')

# 2.  ML with Feature Selection

In [29]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

### 2.1 Linear

In [30]:
reg = LinearRegression()

In [31]:
selector = RFE(reg, n_features_to_select=5, step=1)

In [32]:
selector = selector.fit(x_train, y_train)

In [33]:
lin_reg_choices = selector.support_

In [34]:
selector.ranking_

array([ 8,  1,  1, 10,  3,  2, 14, 12, 11, 13,  9,  7,  5,  1,  4,  1,  1,
        6])

In [35]:
pred =selector.predict(x_test)

In [36]:
mean_squared_error(y_test, pred, squared=False)

4.976360352966269

In [37]:
x_train.columns[np.where(lin_reg_choices == True)[0]]

Index(['Year', 'Average of temp', 'Wednesday', 'Friday', 'Saturday'], dtype='object')

### 2.2 Random Forest

In [38]:
regr_1 = RandomForestRegressor(random_state=0, criterion="mae")

In [39]:
selector = RFE(regr_1, n_features_to_select=5, step=1)

In [40]:
selector = selector.fit(x_train, y_train)

In [41]:
rf_choices  = selector.support_

In [42]:
pred =selector.predict(x_test)

In [43]:
mean_squared_error(y_test, pred, squared=False)

4.790278556103554

In [44]:
x_train.columns[np.where(rf_choices == True)[0]]

Index(['Year', 'Average of temp_min', 'Average of pressure',
       'Average of humidity', 'Average of wind_speed'],
      dtype='object')

### 2.3 Ridge

In [45]:
ridge = Ridge(alpha=1.0)

In [46]:
selector = RFE(ridge, n_features_to_select=5, step=1)

In [47]:
selector = selector.fit(x_train, y_train)

In [48]:
ridge_choices  = selector.support_

In [49]:
pred =selector.predict(x_test)

In [50]:
mean_squared_error(y_test, pred, squared=False)

5.041629724240931

In [51]:
x_train.columns[np.where(ridge_choices == True)[0]]

Index(['Year', 'Wednesday', 'Thursday', 'Friday', 'Saturday'], dtype='object')

### 2.4 Lasso

In [52]:
las = Lasso()

In [53]:
selector = RFE(las, n_features_to_select=5, step=1)

In [54]:
selector = selector.fit(x_train, y_train)

In [55]:
las_choices  = selector.support_

In [56]:
pred =selector.predict(x_test)

In [57]:
mean_squared_error(y_test, pred, squared=False)

5.299581270051477

In [58]:
x_train.columns[np.where(las_choices == True)[0]]

Index(['Month', 'Year', 'Average of feels_like', 'Average of humidity',
       'Average of wind_gust'],
      dtype='object')