In [52]:
import numpy as np
import pandas as pd

In [53]:
train_set = pd.read_csv('../../Datasets/weatherHistory/weatherHistory_Train.csv')
test_set = pd.read_csv('../../Datasets/weatherHistory/weatherHistory_Test.csv')

In [54]:
test_set

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2013-05-20 06:00:00.000 +0200,Partly Cloudy,rain,13.800000,0.77,12.3809,211.0,11.2056,0.0,1013.92,Partly cloudy throughout the day.
1,2013-05-20 07:00:00.000 +0200,Partly Cloudy,rain,14.933333,0.72,11.2056,230.0,11.2056,0.0,1014.46,Partly cloudy throughout the day.
2,2013-05-20 08:00:00.000 +0200,Partly Cloudy,rain,16.872222,0.64,14.0714,233.0,10.3523,0.0,1014.91,Partly cloudy throughout the day.
3,2013-05-20 09:00:00.000 +0200,Mostly Cloudy,rain,17.800000,0.64,12.8800,233.0,10.0464,0.0,1015.25,Partly cloudy throughout the day.
4,2013-05-20 10:00:00.000 +0200,Mostly Cloudy,rain,18.933333,0.56,20.3987,250.0,11.2700,0.0,1015.26,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...
28931,2016-09-09 19:00:00.000 +0200,Partly Cloudy,rain,26.016667,0.43,10.9963,31.0,16.1000,0.0,1014.36,Partly cloudy starting in the morning.
28932,2016-09-09 20:00:00.000 +0200,Partly Cloudy,rain,24.583333,0.48,10.0947,20.0,15.5526,0.0,1015.16,Partly cloudy starting in the morning.
28933,2016-09-09 21:00:00.000 +0200,Partly Cloudy,rain,22.038889,0.56,8.9838,30.0,16.1000,0.0,1015.66,Partly cloudy starting in the morning.
28934,2016-09-09 22:00:00.000 +0200,Partly Cloudy,rain,21.522222,0.60,10.5294,20.0,16.1000,0.0,1015.95,Partly cloudy starting in the morning.


In [55]:
train_set.isnull().sum()

Formatted Date                0
Summary                       0
Precip Type                 143
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [56]:
test_set.isnull().sum()

Formatted Date              0
Summary                     0
Precip Type               374
Temperature (C)             0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
Daily Summary               0
dtype: int64

In [57]:
# filing NAN values

def replace_categorical_null(cols: list, dataset: pd.DataFrame):
    for col in cols:
        temp = dataset[col].dropna()
        mode = temp.mode()[0]
        dataset[col].fillna(mode, inplace=True)

na_cols = list(train_set.columns[train_set.isnull().any()])
replace_categorical_null(na_cols, train_set)

na_cols_test = list(test_set.columns[test_set.isnull().any()])
replace_categorical_null(na_cols_test, test_set)

In [58]:
# Encoding Categorical Columns
from sklearn.preprocessing import LabelEncoder

def encode_labels(cols: list, dataset: pd.DataFrame, source: pd.DataFrame):
    encoder = LabelEncoder()
    for col in cols:
        encoder.fit(source[col])
        dataset[col] = pd.DataFrame(encoder.fit_transform(dataset[col]))


# train_categorical_cols = list(train_set.select_dtypes(include=['object']).dtypes.index)
train_categorical_cols = ['Summary', 'Precip Type', 'Daily Summary']
test_categorical_cols = ['Summary', 'Precip Type', 'Daily Summary']

all_data = pd.concat([train_set[train_categorical_cols] , test_set[test_categorical_cols]])

encode_labels(train_categorical_cols, train_set, all_data)
encode_labels(test_categorical_cols, test_set, all_data)

In [59]:
# set the date format for 'Formatted Data' column
from datetime import datetime as dt

formate = '%Y-%m-%d %H:%M:%S.%f %z'
train_set['Formatted Date'] = train_set['Formatted Date'].map(lambda x: dt.strptime(x, formate))
test_set['Formatted Date'] = test_set['Formatted Date'].map(lambda x: dt.strptime(x, formate))

In [60]:
# Extraction Of Data & time Features

train_set['day'] = train_set['Formatted Date'].map(lambda data: data.day)
test_set['day'] = test_set['Formatted Date'].map(lambda data: data.day)

train_set['month'] = train_set['Formatted Date'].map(lambda data: data.month)
test_set['month'] = test_set['Formatted Date'].map(lambda data: data.month)

train_set['hour'] = train_set['Formatted Date'].map(lambda data: data.hour)
test_set['hour'] = test_set['Formatted Date'].map(lambda data: data.hour)

train_set = train_set.drop(columns='Formatted Date')
test_set = test_set.drop(columns='Formatted Date')

In [61]:
# splitting Data
y_train = train_set['Apparent Temperature (C)']


x_train = train_set.drop(columns='Apparent Temperature (C)')
x_test = test_set

In [63]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()
# scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns=x_test.columns)

In [64]:
#adding poly features

def apply_poly(degree: int, dataset: pd.DataFrame, columns: list):
    for col in columns:
        dataset[col+"poly"] = dataset[col].apply(lambda x: np.power(x, degree))

apply_poly(4, x_train, x_train.columns)
apply_poly(4, x_test, x_test.columns)

x_train

Unnamed: 0,Summary,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,...,Humiditypoly,Wind Speed (km/h)poly,Wind Bearing (degrees)poly,Visibility (km)poly,Loud Coverpoly,Pressure (millibars)poly,Daily Summarypoly,daypoly,monthpoly,hourpoly
0,0.76,0.0,0.506975,0.89,0.221130,0.699164,0.983,0.0,0.970135,0.938889,...,0.627422,0.002391,0.238956,0.933714,0.0,0.885786,0.777064,0.00000,0.005532,0.000000
1,0.76,0.0,0.505085,0.86,0.223399,0.721448,0.983,0.0,0.970613,0.938889,...,0.547008,0.002491,0.270908,0.933714,0.0,0.887533,0.777064,0.00000,0.005532,0.000004
2,0.68,0.0,0.505445,0.89,0.061523,0.568245,0.929,0.0,0.970909,0.938889,...,0.627422,0.000014,0.104266,0.744840,0.0,0.888617,0.777064,0.00000,0.005532,0.000057
3,0.76,0.0,0.487805,0.83,0.220877,0.749304,0.983,0.0,0.971358,0.938889,...,0.474583,0.002380,0.315233,0.933714,0.0,0.890262,0.777064,0.00000,0.005532,0.000289
4,0.68,0.0,0.495365,0.83,0.172970,0.721448,0.983,0.0,0.971454,0.938889,...,0.474583,0.000895,0.270908,0.933714,0.0,0.890613,0.777064,0.00000,0.005532,0.000915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67512,0.68,0.0,0.638556,0.84,0.166667,0.777159,0.620,0.0,0.965471,0.938889,...,0.497871,0.000772,0.364787,0.147763,0.0,0.868876,0.777064,0.16089,0.017485,0.000004
67513,0.76,0.0,0.638106,0.80,0.212557,0.749304,0.685,0.0,0.966475,0.938889,...,0.409600,0.002041,0.315233,0.220172,0.0,0.872494,0.777064,0.16089,0.017485,0.000057
67514,0.76,0.0,0.628926,0.81,0.219869,0.724234,0.983,0.0,0.966972,0.938889,...,0.430467,0.002337,0.275116,0.933714,0.0,0.874290,0.777064,0.16089,0.017485,0.000289
67515,0.76,0.0,0.602286,0.78,0.219617,0.696379,0.983,0.0,0.966981,0.938889,...,0.370151,0.002326,0.235170,0.933714,0.0,0.874324,0.777064,0.16089,0.017485,0.000915


In [65]:
#training model
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train, y_train)

In [66]:
#Evaluation
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred = reg.predict(x_train)

mse_error = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse_error)
r2_error = r2_score(y_train, y_train_pred)

errors = pd.DataFrame({'Value': [mse_error, rmse, r2_error]}, index=['MSE', 'RMSE', 'R2'])
errors

Unnamed: 0,Value
MSE,0.971908
RMSE,0.985854
R2,0.992079


In [67]:
y_test_pred = reg.predict(x_test)
y_test_pred

array([10.12609022, 11.89858596, 14.25138094, ..., 21.45151096,
       20.56922696, 19.79466131])

In [68]:
#Save Predictions
prediction = pd.DataFrame(y_test_pred, columns=['Apparent Temperature (C)'])
prediction.to_csv('../../weather_pred.csv', index=False)