In [None]:
import os
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

In [None]:
# Load the weather summary CSV into a DataFrame (path is relative to the working directory).
data = pd.read_csv('../input/weatherww2/Summary of Weather.csv')

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Preview the first rows (head() defaults to 5) to see the available columns.
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

The dataset contains information on weather conditions recorded on each day at various weather stations around the world. Information includes precipitation, snowfall, temperatures, wind speed and whether the day included thunderstorms or other poor weather conditions.
Our task is to predict the maximum temperature, using the minimum temperature as the single input feature.

Plot a 2D graph to visually inspect the relationship between the two variables.

In [None]:
# Scatter MaxTemp against MinTemp; title and axis labels are set directly on
# the Axes object that pandas returns.
data.plot(x='MinTemp', y='MaxTemp', style='o').set(
    title='MinTemp vs MaxTemp',
    xlabel='MinTemp',
    ylabel='MaxTemp',
)
plt.show()

Let's check the distribution of the maximum temperature (and where its average lies) by plotting it.

In [None]:
# Distribution (histogram + density estimate) of the daily maximum temperature.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot(..., kde=True)` once the environment's seaborn version is confirmed.
plt.figure(figsize=(15,10))
seabornInstance.distplot(data['MaxTemp'])
# tight_layout() only adjusts axes that already exist, so it has to run after
# the plot is drawn (calling it on the empty figure had no effect).
plt.tight_layout()
plt.show()

In [None]:
# Keep only the predictor (MinTemp) and the target (MaxTemp) columns.
df = data[['MinTemp', 'MaxTemp']]
df.shape

In [None]:
# scikit-learn expects 2-D inputs: selecting a one-column frame yields
# arrays of shape (n_samples, 1) for both the feature and the target.
x = df[['MinTemp']].values
y = df[['MaxTemp']].values

Split the data as test and train

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Import LinearRegression class, instantiate it, and call the fit() method along with our training data

In [None]:
# Ordinary least-squares fit of MaxTemp on MinTemp using the training split.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

The linear regression model finds the values of the intercept (b) and the slope (m) that give the best-fitting line. Check b and m:

In [None]:
# Fitted parameters, one per line: first the intercept (b), then the slope (m).
print(regressor.intercept_, regressor.coef_, sep='\n')

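This means that for every one-unit change in the minimum temperature, the maximum temperature changes by about 0.55 units (the slope printed above). Note that the slope is expressed in temperature units, not as a percentage.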

The algorithm is trained; continue with prediction.

In [None]:
# Predict MaxTemp for the held-out MinTemp values; these are compared with y_test below.
y_pred = regressor.predict(x_test)

Now compare the actual output values for x_test with the predicted values.

In [None]:
# Side-by-side table of the true and the predicted MaxTemp on the test split.
# NOTE: this rebinds `df`, which earlier held the MinTemp/MaxTemp feature frame,
# so the feature-extraction cell must not be re-run after this one without
# re-creating that frame first.
df = pd.DataFrame({'Actual':y_test.flatten(), 'Predicted':y_pred.flatten()})
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly (a bare `df.shape` in the middle of the cell shows nothing).
print(df.shape)
df

We can also visualize the comparison as a bar graph. Only the first 25 records are shown.

In [None]:
# Bar chart of actual vs. predicted values for the first 25 test rows.
df1 = df.iloc[:25]
df1.plot.bar(figsize=(16, 10))
# Solid green lines for the major grid, dotted black lines for the minor grid.
plt.grid(
    which='major',
    color='green',
    linestyle='-',
    linewidth='0.5',
)
plt.grid(
    which='minor',
    color='black',
    linestyle=':',
    linewidth='0.5',
)
plt.show()

Though this model is not very precise, the predicted temperatures are close to the actual ones.

In [None]:
#Plot straight line with the test data
plt.scatter(x_test,y_test,color='grey')
plt.plot(x_test,y_pred,color='red',linewidth=2)

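The straight line in the above graph is the fitted regression line; it follows the overall trend of the test points, which indicates that the model has captured the linear relationship.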

Algorithm evaluation. Metrics used: MAE, MSE and RMSE.

In [None]:
#Find the value of these metrices
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

The root mean squared error is 4.19, which is more than 10% of the mean of all the maximum temperatures, i.e. 22.41. This means that our algorithm was not very accurate but can still make reasonably good predictions.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test observations (red points) with the regression line drawn over them.
plt.scatter(x_test, y_test, color='red')
plt.plot(x_test, regressor.predict(x_test))
plt.title('Minimum Vs Maximum Temperature')
plt.ylabel('Maximum Temperature')
plt.xlabel('Minimum Temperature')
plt.show()

Save the copy of predicted values in csv format using below code 

In [None]:
# Persist the Actual/Predicted comparison table as CSV. DataFrame.to_csv writes
# the column names as a header row and plain decimal values; writing the bare
# values array instead would lose the column names. index=False leaves out the
# row-index column.
df.to_csv('Univariate_predicted.csv', index=False)