In [11]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [12]:
df = pd.read_csv("Train.csv")
dft = pd.read_csv("Test.csv")

In [13]:
#Step 1
df.drop(['rain_p_h','snow_p_h'],axis = 1)
print(df.shape)

(33750, 15)


In [14]:
dft.drop(['rain_p_h','snow_p_h'],axis = 1)
print(dft.shape)

(14454, 14)


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33750 entries, 0 to 33749
Data columns (total 15 columns):
date_time              33750 non-null object
is_holiday             33750 non-null object
air_pollution_index    33750 non-null int64
humidity               33750 non-null int64
wind_speed             33750 non-null int64
wind_direction         33750 non-null int64
visibility_in_miles    33750 non-null int64
dew_point              33750 non-null int64
temperature            33750 non-null float64
rain_p_h               33750 non-null float64
snow_p_h               33750 non-null float64
clouds_all             33750 non-null int64
weather_type           33750 non-null object
weather_description    33750 non-null object
traffic_volume         33750 non-null int64
dtypes: float64(3), int64(8), object(4)
memory usage: 3.9+ MB


In [16]:
df['clouds_all'].unique()
df['weather_type'].unique()

array(['Clouds', 'Clear', 'Rain', 'Drizzle', 'Mist', 'Haze', 'Fog',
       'Thunderstorm', 'Snow', 'Squall', 'Smoke'], dtype=object)

In [17]:
#Step 3 Outlier detection and Merging in Beta
outlier_labels=['Rain','Mist','Thunderstorm','Squall']
for i in outlier_labels:
    a = df[df['weather_type'] == i]
    median = a.median()
    #print(median)
    q1 = np.percentile(a.clouds_all,25)
    q3 = np.percentile(a.clouds_all,75)
    iqr = q3-q1
    minimum = q1 - (1.5 * iqr)
    maximum = q3 + (1.5 * iqr)
    a=a[(a['clouds_all']<minimum) | (a['clouds_all']>maximum)]
    merged = df.merge(a, how='left', indicator=True)
    merged=merged[merged['_merge']=='left_only']
    df=merged.drop(['_merge'],axis=1)

In [18]:
plt.figure(figsize=(16, 6))
sns.set(style="ticks", color_codes=True)
g=sns.catplot(x="weather_type", y="clouds_all", kind="box", data=df);
g.set_xticklabels(rotation=90)

AttributeError: module 'seaborn' has no attribute 'catplot'

<Figure size 1152x432 with 0 Axes>

In [19]:
#Permanenent deletion of unnecessary cols
df.drop(['visibility_in_miles','rain_p_h','snow_p_h'],inplace = True,axis = 1)
df.head(10)

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,dew_point,temperature,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,,121,89,2,329,1,288.28,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,,178,67,3,330,1,289.36,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,,113,66,3,329,2,289.58,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,,20,66,3,329,5,290.13,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,,281,65,3,329,7,291.14,75,Clouds,broken clouds,4918
5,2012-10-02 14:00:00,,23,65,3,328,6,291.72,1,Clear,sky is clear,5181
6,2012-10-02 15:00:00,,184,64,3,328,7,293.17,1,Clear,sky is clear,5584
7,2012-10-02 16:00:00,,167,64,3,327,7,293.86,1,Clear,sky is clear,6015
8,2012-10-02 17:00:00,,119,63,3,327,6,294.14,20,Clouds,few clouds,5791
9,2012-10-02 18:00:00,,161,63,3,326,3,293.1,20,Clouds,few clouds,4770


In [20]:
df.shape

(32204, 12)

In [21]:
# Encoding the is_holiday feature column
df['is_holiday'] = np.where(df['is_holiday']=='None', 0, 1)
#Converting the date_time object to represent hour of the day
df['date_time']= pd.to_datetime(df['date_time'])
# Encoding the weather_type feature column
df['weather_type'] =df['weather_type'].astype('category').cat.codes

In [22]:
X = df.iloc[:,1:-2]
y = df.iloc[:,-1]

In [23]:
X.head()

Unnamed: 0,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,dew_point,temperature,clouds_all,weather_type
0,0,121,89,2,329,1,288.28,40,1
1,0,178,67,3,330,1,289.36,75,1
2,0,113,66,3,329,2,289.58,90,1
3,0,20,66,3,329,5,290.13,90,1
4,0,281,65,3,329,7,291.14,75,1


In [24]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# define base model
def baseline_model():
    model = Sequential()
    model.add(Dense(9, input_dim=9, kernel_initializer='normal',activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(3, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=30, batch_size=5, verbose=1)

#kfold = KFold(n_splits=10, random_state=seed)
#results = cross_val_score(estimator, X, y, cv=kfold)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 9,train_size = 0.7)
estimator.fit(X_train, y_train)
y_predicted = estimator.predict(X_test)
print("Results: %.2f) MSE" % (mean_squared_error(y_test, y_predicted)))

ModuleNotFoundError: No module named 'keras'

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_squared_error

lasso_model = Lasso(alpha = 140,max_iter = 100000, random_state=9)
lasso_model.fit(X_train,y_train) #Lasso model
y_pred = lasso_model.predict(X_test)
mean_squared_error(y_test,y_pred)
