In [49]:
import numpy as np
import pandas as pd

In [50]:
# Locations of the CSV splits, kept together so they are easy to audit.
TRAIN_CSV = '../../Datasets/weatherHistory/weatherHistory_Train.csv'
# NOTE(review): this is the same file as TRAIN_CSV, so test_set is loaded from
# the training data. Confirm whether a separate test split was intended.
TEST_CSV = '../../Datasets/weatherHistory/weatherHistory_Train.csv'

train_set = pd.read_csv(TRAIN_CSV)
test_set = pd.read_csv(TEST_CSV)

In [61]:
# Per-column count of missing values in the training frame.
train_set.isna().sum()

Summary                     0
Precip Type                 0
Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
Daily Summary               0
day                         0
month                       0
year                        0
hour                        0
dtype: int64

In [62]:
# Per-column count of missing values in the test frame.
test_set.isna().sum()

Summary                     0
Precip Type                 0
Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
Daily Summary               0
day                         0
month                       0
year                        0
hour                        0
dtype: int64

In [52]:
# Filling NaN values

def replace_categorical_null(cols: list, dataset: pd.DataFrame):
    """Fill missing values in the given columns with each column's mode.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    cols : list
        Labels of the columns to fill.
    dataset : pd.DataFrame
        Frame whose columns are updated.

    Notes
    -----
    A column with no non-null values has no mode and is left unchanged.
    """
    for col in cols:
        # Series.mode() ignores NaN by default, so no explicit dropna() is needed.
        modes = dataset[col].mode()
        if modes.empty:
            continue
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `dataset[col]`: that form acts on an intermediate object and is
        # not guaranteed to update `dataset` itself.
        dataset[col] = dataset[col].fillna(modes.iloc[0])

# Collect every column that contains at least one null (whatever its dtype)
# and fill it with the column's most frequent value.
na_cols = train_set.columns[train_set.isna().any()].tolist()
na_cols_test = test_set.columns[test_set.isna().any()].tolist()

replace_categorical_null(na_cols, train_set)
replace_categorical_null(na_cols_test, test_set)

In [53]:
# Encoding Categorical Columns
from sklearn.preprocessing import LabelEncoder

def encode_labels(cols: list, dataset: pd.DataFrame, source: pd.DataFrame):
    """Replace categorical columns of `dataset` with integer label codes.

    For each column the encoder is fitted on `source[col]` and that fitted
    mapping is applied to `dataset[col]`, so every frame encoded against the
    same `source` gets the same code for the same label.

    Parameters
    ----------
    cols : list
        Labels of the columns to encode.
    dataset : pd.DataFrame
        Frame whose columns are overwritten in place with the codes.
    source : pd.DataFrame
        Frame supplying the full set of labels for each column.

    Raises
    ------
    ValueError
        Raised by `LabelEncoder.transform` when `dataset[col]` contains a
        label that does not occur in `source[col]`.
    """
    for col in cols:
        # A fresh encoder per column keeps each mapping independent.
        encoder = LabelEncoder()
        encoder.fit(source[col])
        # transform() reuses the mapping learned from `source`; re-fitting on
        # `dataset` here would discard it. Assigning the bare array keeps the
        # assignment positional, so a non-default index cannot misalign rows.
        dataset[col] = encoder.transform(dataset[col])


# The categorical columns are listed explicitly. They could also be derived
# with train_set.select_dtypes(include=['object']).
train_categorical_cols = ['Summary', 'Precip Type', 'Daily Summary']
test_categorical_cols = ['Summary', 'Precip Type', 'Daily Summary']

# Stack the categorical columns of both frames so the encoders can see every
# label that occurs in either of them.
all_data = pd.concat([
    train_set[train_categorical_cols],
    test_set[test_categorical_cols],
])

for cols, frame in (
    (train_categorical_cols, train_set),
    (test_categorical_cols, test_set),
):
    encode_labels(cols, frame, all_data)

In [54]:
# Parse the 'Formatted Date' strings into datetime objects
from datetime import datetime as dt

# Date, time with fractional seconds, then a UTC offset.
formate = '%Y-%m-%d %H:%M:%S.%f %z'

for frame in (train_set, test_set):
    # Overwrite the text column with the parsed values, row by row.
    frame['Formatted Date'] = frame['Formatted Date'].map(
        lambda raw: dt.strptime(raw, formate)
    )

In [55]:
# Extraction of date & time features

# Derive one column per calendar component of the parsed timestamp.
for frame in (train_set, test_set):
    for part in ('day', 'month', 'year', 'hour'):
        # `part=part` binds the current component name inside the lambda.
        frame[part] = frame['Formatted Date'].map(
            lambda stamp, part=part: getattr(stamp, part)
        )

# The raw timestamp is dropped once its components have been extracted.
train_set = train_set.drop(columns='Formatted Date')
test_set = test_set.drop(columns='Formatted Date')

In [56]:
# Split each frame into features (x) and the regression target (y)
target_col = 'Apparent Temperature (C)'

x_train, y_train = train_set.drop(columns=target_col), train_set[target_col]
x_test, y_test = test_set.drop(columns=target_col), test_set[target_col]

In [57]:
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training features only, then apply
# that same scaling to the test features. Re-fitting on the test set would
# scale it by its own range and make the two feature spaces inconsistent.
scaler = MinMaxScaler()
x_train = pd.DataFrame(
    scaler.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,  # keep row labels aligned with y_train
)
x_test = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,  # keep row labels aligned with y_test
)

In [58]:
# Linear Regression
class LinearRegression:
    """Linear regression fitted with batch gradient descent on the MSE loss.

    The intercept is modelled by prepending a constant column named 'bias' to
    the features. The caller's DataFrames are never modified.
    """

    # Label of the constant column prepended to the features.
    _BIAS_COLUMN = 'bias'

    def __init__(self):
        # Coefficient vector with the intercept first; None until fit() runs.
        self.weight = None

    @classmethod
    def _design_matrix(cls, df):
        """Return the features as a float array with a leading column of ones.

        A frame whose first column is already labelled 'bias' is used as is.
        Otherwise a copy is augmented, so the caller's frame is left untouched.
        """
        if len(df.columns) > 0 and df.columns[0] == cls._BIAS_COLUMN:
            augmented = df
        else:
            augmented = df.copy()
            augmented.insert(0, cls._BIAS_COLUMN, 1.0)
        return augmented.to_numpy(dtype=float)

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, epoch: int, learning_rate: float):
        """Estimate the weights by gradient descent.

        Parameters
        ----------
        x : pd.DataFrame
            Feature matrix, one row per sample.
        y : pd.DataFrame
            Target values, one per row of `x` (a Series or 1-D array also works).
        epoch : int
            Number of full-batch gradient steps.
        learning_rate : float
            Step size applied to the gradient.

        Raises
        ------
        ValueError
            If `x` has no rows or `y` has a different number of values.
        """
        features = self._design_matrix(x)
        # Plain arrays make the arithmetic positional, so differing pandas
        # indexes on x and y cannot silently misalign the residuals.
        targets = np.asarray(y, dtype=float).reshape(-1)

        m = features.shape[0]
        if m == 0:
            raise ValueError("cannot fit on an empty dataset")
        if targets.shape[0] != m:
            raise ValueError(
                f"x has {m} rows but y has {targets.shape[0]} values"
            )

        self.weight = np.ones(features.shape[1])

        for _ in range(epoch):
            residual = features @ self.weight - targets
            # Gradient of the mean squared error with respect to the weights.
            gradient = (2 / m) * (features.T @ residual)
            self.weight = self.weight - learning_rate * gradient

    def predict(self, df: pd.DataFrame):
        """Return the predictions for `df` as a 1-D numpy array.

        Raises
        ------
        RuntimeError
            If called before fit().
        """
        if self.weight is None:
            raise RuntimeError("predict() called before fit()")
        return np.dot(self._design_matrix(df), np.asarray(self.weight))

In [59]:
# Train the model
EPOCHS = 2500
LEARNING_RATE = 0.1

reg = LinearRegression()
reg.fit(x_train, y_train, epoch=EPOCHS, learning_rate=LEARNING_RATE)

In [60]:
#Evaluation
from sklearn.metrics import mean_squared_error, r2_score
# Score the predictions against y_test, the targets of the rows they were
# made for. Scoring against y_train only lines up when both sets happen to
# contain identical rows.
y_test_pred = reg.predict(x_test)

mse_error = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse_error)
r2_error = r2_score(y_test, y_test_pred)

errors = pd.DataFrame({'Value': [mse_error, rmse, r2_error]}, index=['MSE', 'RMSE', 'R2'])
errors

Unnamed: 0,Value
MSE,1.564351
RMSE,1.25074
R2,0.987251


In [67]:
# Write the test-set predictions to a single-column CSV without the index
prediction = pd.DataFrame({'Apparent Temperature (C)': y_test_pred})
prediction.to_csv('prediction.csv', index=False)