In [3]:
"""Build a predictive linear regression model for given dataset, given temperature, humidity, wind speed , 
wind bearing, visibility, pressure  predict apparent temperature"""
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [4]:
data=pd.read_csv('weatherHistory.csv')
data.drop(['Formatted Date','Summary','Precip Type','Loud Cover','Daily Summary'],axis=1,inplace=True)
data['x0']=np.ones((len(data),1))
data=data.reindex(columns=sorted(data.columns))
data = data[:10000]

In [5]:
data.skew() #Check the skewness

Apparent Temperature (C)   -0.169044
Humidity                   -0.705021
Pressure (millibars)       -6.309898
Temperature (C)            -0.021943
Visibility (km)            -0.571543
Wind Bearing (degrees)     -0.200441
Wind Speed (km/h)           1.184477
x0                          0.000000
dtype: float64

In [6]:
#sb.distplot(np.square(data['Humidity']))
#print(np.square(data['Humidity']).skew())
data['Humidity'] = np.square(data['Humidity'])
data['Wind Speed (km/h)'] = np.cbrt(data['Wind Speed (km/h)'])

In [7]:
#Standardization
data['Apparent Temperature (C)'] = (data['Apparent Temperature (C)'] - data['Apparent Temperature (C)'].mean()) / (data['Apparent Temperature (C)'].std())
data['Humidity'] = (data['Humidity'] - data['Humidity'].mean()) / (data['Humidity'].std())
data['Pressure (millibars)'] = (data['Pressure (millibars)'] - data['Pressure (millibars)'].mean()) / (data['Pressure (millibars)'].std())
data['Temperature (C)'] = (data['Temperature (C)'] - data['Temperature (C)'].mean()) / (data['Temperature (C)'].std())
data['Visibility (km)'] = (data['Visibility (km)'] - data['Visibility (km)'].mean()) / (data['Visibility (km)'].std())
data['Wind Bearing (degrees)'] = (data['Wind Bearing (degrees)'] - data['Wind Bearing (degrees)'].mean()) / (data['Wind Bearing (degrees)'].std())
data['Wind Speed (km/h)'] = (data['Wind Speed (km/h)'] - data['Wind Speed (km/h)'].mean()) / (data['Wind Speed (km/h)'].std())

In [8]:
Training_data=int(0.70*len(data))
Train_data=data.iloc[0:Training_data,:]
Test_data=data.iloc[Training_data:,:]

x_train,y_train,x_test,y_test=np.array(Train_data.iloc[:,1:]),np.array(Train_data.iloc[:,0:1]),np.array(Test_data.iloc[:,1:]),np.array(Test_data.iloc[:,0:1])

In [9]:
np.random.seed(0)
theta=np.random.rand(7)
theta

array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ,
       0.64589411, 0.43758721])

In [10]:
theta = np.reshape(theta,(7,1))
theta

array([[0.5488135 ],
       [0.71518937],
       [0.60276338],
       [0.54488318],
       [0.4236548 ],
       [0.64589411],
       [0.43758721]])

In [11]:
x_train.shape, y_train.shape, theta.shape

((7000, 7), (7000, 1), (7, 1))

In [12]:
def model(x_train,y_train,theta):
    learning_rate = 0.00000001
    epochs = 3000
    n = len(x_train)


    for epoch in range(epochs):
        y_pred = np.dot(x_train ,theta)
        temp = y_pred - y_train
        product = (learning_rate / n) * np.dot(x_train.T,temp)
        theta = theta - product
        cost = (1 / (2 * n)) * np.dot(temp.T, temp)
        #print(cost)
    return theta

In [13]:
theta=model(x_train,y_train,theta)

In [14]:
print(theta)
Y=np.dot(x_test,theta)
Y.shape, y_test.shape

[[0.5487995 ]
 [0.71517133]
 [0.60278174]
 [0.5448769 ]
 [0.42364254]
 [0.64587613]
 [0.43756956]]


((3000, 1), (3000, 1))

In [15]:
y_test.shape

(3000, 1)

In [16]:
mse=np.mean((Y-y_test)**2)
mse

1.9533440124572103

In [17]:
rmse=np.sqrt(mse)
rmse

1.3976208400196422