# Importing Dependencies and Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection  import train_test_split
from sklearn.metrics import mean_squared_error
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    paths = [os.path.join(root, name) for name in files]
    if paths:
        print('\n'.join(paths))

In [None]:
# Load the Szeged weather history CSV into the working DataFrame.
weather_csv_path = '/kaggle/input/szeged-weather/weatherHistory.csv'
df = pd.read_csv(weather_csv_path)

# Check Statistics

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Exploratory Data Analysis

## Getting the lists of numeric and categorical columns

In [None]:
# Split the column names by dtype: float64 columns are treated as numeric,
# object (string) columns as categorical.
numeric_col = df.select_dtypes(include=['float64']).columns
categorical_col = df.select_dtypes(include=['object']).columns

## Checking Histograms and Skewness of Dependent Variable

In [None]:
# Histogram with a density estimate of the target variable.
# NOTE(review): seaborn has deprecated `distplot` (since 0.11) in favour of
# `histplot` / `displot`; migrate when the seaborn version is upgraded.
temperature = df['Temperature (C)']
sns.distplot(temperature)

In [None]:
# Skewness and kurtosis of the target variable.
temperature = df['Temperature (C)']
print("Skewness: %f" % temperature.skew())
print("Kurtosis: %f" % temperature.kurt())

## Relationship with numerical variables

In [None]:
# Scatter every numeric feature against the target, one figure per feature.
target_col = 'Temperature (C)'
for feature in numeric_col:
    if feature == target_col:
        # Skip the target by name rather than by position so the loop does not
        # depend on the column order of the CSV.
        continue
    data = df[[target_col, feature]]
    # No fixed y-limits: the temperature is in degrees Celsius, so a hard-coded
    # lower bound of 0 would hide every sub-zero observation.
    data.plot.scatter(x=feature, y=target_col)

In [None]:
# Remove the 'Loud Cover' column from the working frame.
df = df.drop(columns='Loud Cover')

## Relationship with categorical features



In [None]:
# One box plot of temperature per categorical feature, each on its own figure.
# `var`, `data`, `f`, `ax` and `fig` are left holding the values of the last
# iteration ('Daily Summary'), exactly as the unrolled version did.
for var in ('Precip Type', 'Summary', 'Daily Summary'):
    data = pd.concat([df['Temperature (C)'], df[var]], axis=1)
    f, ax = plt.subplots(figsize=(8, 6))
    fig = sns.boxplot(x=var, y="Temperature (C)", data=data)

## Correlation matrix (heatmap style)

In [None]:
# Correlation heatmap of the numeric features.
# Select the numeric columns explicitly: the frame still contains string
# columns here, and recent pandas versions raise when `corr()` is called on
# non-numeric data instead of silently dropping those columns.
numeric_df = df.select_dtypes(include='number')
f, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(numeric_df.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

## Visualising Count Plots and Unique Values of categorical Features

In [None]:
# Horizontal bar chart with the number of rows per 'Summary' category.
rcParams['figure.figsize'] = (8, 5)
summary_values = df['Summary']
sns.countplot(y=summary_values)

In [None]:
# Horizontal bar chart with the number of rows per 'Precip Type' category.
rcParams['figure.figsize'] = (8, 5)
precip_values = df['Precip Type']
sns.countplot(y=precip_values)

In [None]:
# Frequency of each 'Daily Summary' value, including missing ones.
# Read the column from `df` itself rather than from a temporary frame left
# behind by an earlier plotting cell, so this cell works on a fresh kernel
# regardless of which plotting cells were run.
df['Daily Summary'].value_counts(dropna=False)

From the observations above, we noticed that only Humidity and Visibility (km) would be useful for prediction.

In [None]:
# Keep only the two chosen predictors and the target.
selected_columns = ['Humidity', 'Visibility (km)', 'Temperature (C)']
df = df[selected_columns]

## Missing Values

In [None]:
# Fill missing values by linear interpolation between neighbouring rows.
# DataFrame.interpolate works column by column and returns a new frame, which
# avoids assigning into a frame that is itself a selection of another frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = df.interpolate(method='linear')

We can use linear interpolation to handle the missing values in the numerical variables. Linear interpolation is an imputation technique that assumes a linear relationship between neighbouring data points and uses the non-missing values adjacent to a gap to compute the value of each missing data point.

## Outliers

### Finding Outliers and Removing Outliers

In [None]:
# Box plot of Humidity to eyeball potential outliers.
boxplot = df.boxplot(column='Humidity')

In [None]:
# Box plot of Visibility (km) to eyeball potential outliers.
boxplot = df.boxplot(column='Visibility (km)')

In [None]:
# Box plot of Temperature (C) to eyeball potential outliers.
boxplot = df.boxplot(column='Temperature (C)')

In [None]:
# Outlier fences for Humidity: first/third quartile minus/plus 3 * IQR.
q1 = df.Humidity.quantile(0.25)
q3 = df.Humidity.quantile(0.75)

IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print('Humidity outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

In [None]:
# Outlier fences for Visibility (km): first/third quartile minus/plus 3 * IQR.
visibility = df['Visibility (km)']
q1 = visibility.quantile(0.25)
q3 = visibility.quantile(0.75)

IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print('Visibility (km) outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

In [None]:
# Outlier fences for Temperature (C): first/third quartile minus/plus 3 * IQR.
temperature = df['Temperature (C)']
q1 = temperature.quantile(0.25)
q3 = temperature.quantile(0.75)

IQR = q3 - q1
Lower_fence = q1 - (IQR * 3)
Upper_fence = q3 + (IQR * 3)
print('Temperature (C) outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

In [None]:
# Remove rows in which any of the three columns lies 3 or more standard
# deviations from its mean (|z-score| >= 3).
# Boolean indexing returns a new frame, so the result has to be assigned back
# to `df`; otherwise nothing is removed.
within_three_sigma = np.ones(len(df), dtype=bool)
for column in ['Humidity', 'Visibility (km)', 'Temperature (C)']:
    column_z = np.abs(stats.zscore(df[column]))
    within_three_sigma &= np.asarray(column_z < 3)

df = df[within_three_sigma]
df.shape

# Model Building

In [None]:
# Separate the target column (y) from the predictor columns (X).
target_name = 'Temperature (C)'
y = df[target_name]
X = df.drop(columns=target_name)

### Splitting whole data into train and test

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and therefore every downstream number, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Using StandardScaler to standardise the dataset

In [None]:
# Learn the per-feature mean and standard deviation on the training split only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Creating variable df_train for SGD

In [None]:
# Combine the scaled training features with their targets in one frame.
df_train = pd.DataFrame(X_train)
# X_train is a plain array, so df_train has a fresh 0..n-1 index, whereas
# y_train still carries the row labels of the original frame. Assigning the
# Series directly would align on those labels and produce NaNs / mismatched
# targets, so the values are attached positionally instead.
df_train['Temperature (C)'] = np.asarray(y_train)
df_train.head()

### SGD implementation for linear regression

In [None]:
# Mini-batch SGD for linear regression, minimising mean((y - (w.x + b))^2).
#
# The model is trained on the scaled training split (X_train / y_train) only,
# so that (a) the learned weights live in the same feature space as the scaled
# X_test they are later applied to, and (b) no held-out row is seen in training.
train_features = np.asarray(X_train, dtype=float)
train_target = np.asarray(y_train, dtype=float)

# W and B start at zero; `iteration` is the number of update steps,
# `lr_rate` the step size and `k` the batch size.
W, B, iteration, lr_rate, k = np.zeros(shape=(1, train_features.shape[1])), 0, 750, 0.01, 25

# Seeded generator so that the sampled batches (and the final W, B) are reproducible.
rng = np.random.RandomState(42)

for _ in range(iteration):
    # Draw k distinct training rows for this step.
    batch_idx = rng.choice(len(train_features), size=k, replace=False)
    x_batch = train_features[batch_idx]
    y_batch = train_target[batch_idx]

    # Residuals y - (w.x + b) for the whole batch, shape (k,).
    residual = y_batch - (np.dot(x_batch, W.ravel()) + B)

    # Batch-averaged gradients:
    #   dL/dw = (1/k) * sum(-2 * x * residual)
    #   dL/db = (1/k) * sum(-2 * residual)
    grad_w = (-2.0 / k) * np.dot(residual, x_batch)
    grad_b = (-2.0 / k) * residual.sum()

    # Gradient-descent step; W keeps its (1, n_features) shape via broadcasting.
    W = W - lr_rate * grad_w
    B = B - lr_rate * grad_b

print(W)
print(B)

### Prediction on x_test

In [None]:
# Predict the target for every test row: y_hat = w.x + b.
# `np.asscalar` has been removed from NumPy, so the prediction is computed in
# one vectorised dot product and converted to a plain list of Python floats.
weights = np.asarray(W, dtype=float).reshape(-1)
# B may be a scalar or a one-element array depending on how it was produced.
intercept = np.asarray(B, dtype=float).reshape(-1)[0]

y_pred = (np.dot(np.asarray(X_test, dtype=float), weights) + intercept).tolist()

### Plotting

In [None]:
# Residual plot: prediction error (predicted - actual) against the predicted value.
residuals = np.asarray(y_pred, dtype=float) - np.asarray(y_test, dtype=float)

plt.scatter(y_pred, residuals,
            c='c', marker='o', s=35, alpha=0.7,
            label='Test data')

plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
# Zero-error reference line. axhline spans the current axes, so it does not
# stretch the x-axis beyond the range of the predictions the way a fixed
# xmin/xmax pair would.
plt.axhline(y=0, lw=2, color='red')
plt.show()

### Checking accuracy

In [None]:
# Mean squared error of the predictions on the held-out test split.
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('mean squared error =', MSE)

# Reference

https://github.com/krpiyush5/SGD-on-Boston-Dataset/blob/master/06%20Implement%20SGD.ipynb