In [142]:
import numpy as np
import pandas as pd

In [143]:
train_set = pd.read_csv('../../Datasets/weatherHistory/weatherHistory_Train.csv')
test_set = pd.read_csv('../../Datasets/weatherHistory/weatherHistory_Test.csv')

In [144]:
test_set

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2013-05-20 06:00:00.000 +0200,Partly Cloudy,rain,13.800000,0.77,12.3809,211.0,11.2056,0.0,1013.92,Partly cloudy throughout the day.
1,2013-05-20 07:00:00.000 +0200,Partly Cloudy,rain,14.933333,0.72,11.2056,230.0,11.2056,0.0,1014.46,Partly cloudy throughout the day.
2,2013-05-20 08:00:00.000 +0200,Partly Cloudy,rain,16.872222,0.64,14.0714,233.0,10.3523,0.0,1014.91,Partly cloudy throughout the day.
3,2013-05-20 09:00:00.000 +0200,Mostly Cloudy,rain,17.800000,0.64,12.8800,233.0,10.0464,0.0,1015.25,Partly cloudy throughout the day.
4,2013-05-20 10:00:00.000 +0200,Mostly Cloudy,rain,18.933333,0.56,20.3987,250.0,11.2700,0.0,1015.26,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...
28931,2016-09-09 19:00:00.000 +0200,Partly Cloudy,rain,26.016667,0.43,10.9963,31.0,16.1000,0.0,1014.36,Partly cloudy starting in the morning.
28932,2016-09-09 20:00:00.000 +0200,Partly Cloudy,rain,24.583333,0.48,10.0947,20.0,15.5526,0.0,1015.16,Partly cloudy starting in the morning.
28933,2016-09-09 21:00:00.000 +0200,Partly Cloudy,rain,22.038889,0.56,8.9838,30.0,16.1000,0.0,1015.66,Partly cloudy starting in the morning.
28934,2016-09-09 22:00:00.000 +0200,Partly Cloudy,rain,21.522222,0.60,10.5294,20.0,16.1000,0.0,1015.95,Partly cloudy starting in the morning.


In [145]:
train_set.isnull().sum()

Formatted Date                0
Summary                       0
Precip Type                 143
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [146]:
test_set.isnull().sum()

Formatted Date              0
Summary                     0
Precip Type               374
Temperature (C)             0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
Daily Summary               0
dtype: int64

In [147]:
# filing NAN values

def replace_categorical_null(cols: list, dataset: pd.DataFrame):
    for col in cols:
        temp = dataset[col].dropna()
        mode = temp.mode()[0]
        dataset[col].fillna(mode, inplace=True)

na_cols = list(train_set.columns[train_set.isnull().any()])
replace_categorical_null(na_cols, train_set)

na_cols_test = list(test_set.columns[test_set.isnull().any()])
replace_categorical_null(na_cols_test, test_set)

In [148]:
# Encoding Categorical Columns
from sklearn.preprocessing import LabelEncoder

def encode_labels(cols: list, dataset: pd.DataFrame, source: pd.DataFrame):
    encoder = LabelEncoder()
    for col in cols:
        encoder.fit(source[col])
        dataset[col] = pd.DataFrame(encoder.fit_transform(dataset[col]))


# train_categorical_cols = list(train_set.select_dtypes(include=['object']).dtypes.index)
train_categorical_cols = ['Summary', 'Precip Type', 'Daily Summary']
test_categorical_cols = ['Summary', 'Precip Type', 'Daily Summary']

all_data = pd.concat([train_set[train_categorical_cols] , test_set[test_categorical_cols]])

encode_labels(train_categorical_cols, train_set, all_data)
encode_labels(test_categorical_cols, test_set, all_data)

In [149]:
# set the date format for 'Formatted Data' column
from datetime import datetime as dt

formate = '%Y-%m-%d %H:%M:%S.%f %z'
train_set['Formatted Date'] = train_set['Formatted Date'].map(lambda x: dt.strptime(x, formate))
test_set['Formatted Date'] = test_set['Formatted Date'].map(lambda x: dt.strptime(x, formate))

In [150]:
# Extraction Of Data & time Features

train_set['day'] = train_set['Formatted Date'].map(lambda data: data.day)
test_set['day'] = test_set['Formatted Date'].map(lambda data: data.day)

train_set['month'] = train_set['Formatted Date'].map(lambda data: data.month)
test_set['month'] = test_set['Formatted Date'].map(lambda data: data.month)

train_set['year'] = train_set['Formatted Date'].map(lambda data: data.year)
test_set['year'] = test_set['Formatted Date'].map(lambda data: data.year)

train_set['hour'] = train_set['Formatted Date'].map(lambda data: data.hour)
test_set['hour'] = test_set['Formatted Date'].map(lambda data: data.hour)

train_set = train_set.drop(columns='Formatted Date')
test_set = test_set.drop(columns='Formatted Date')

In [151]:
# splitting Data
y_train = train_set['Apparent Temperature (C)']


x_train = train_set.drop(columns='Apparent Temperature (C)')
x_test = test_set

In [152]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# scaler = MinMaxScaler()
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns=x_test.columns)

In [153]:
#training model
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train, y_train)

In [154]:
#Evaluation
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred = reg.predict(x_train)

mse_error = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse_error)
r2_error = r2_score(y_train, y_train_pred)

errors = pd.DataFrame({'Value': [mse_error, rmse, r2_error]}, index=['MSE', 'RMSE', 'R2'])
errors

Unnamed: 0,Value
MSE,1.213905
RMSE,1.101774
R2,0.990107


In [155]:
y_test_pred = reg.predict(x_test)
y_test_pred

array([12.47381849, 14.01262113, 16.10691872, ..., 22.90361945,
       22.1026739 , 21.25345934])

In [156]:
#Save Predictions
prediction = pd.DataFrame(y_test_pred, columns=['Apparent Temperature (C)'])
prediction.to_csv('../../weather_pred.csv', index=False)