LinearRegression.py


import warnings
warnings.filterwarnings('ignore')
import os
import boto3
import io
import sagemaker
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression

import pickle,gzip,urllib,json,csv

from sklearn import preprocessing

from sagemaker import get_execution_role
role = get_execution_role()

s3 = boto3.resource('s3')
bucket_name = 'aws-machinelearning-chez'
object_key = 'weatherhistory2.csv'

s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket=bucket_name, Key=object_key)
response_body = response["Body"].read()
weather = pd.read_csv(io.BytesIO(response_body), header=0, delimiter=",", low_memory=False)

weather.head()

weather.columns

weather.shape

weather.describe()

weather.info()

weather.isnull().any()

weather.isnull().all()

round(100*(weather.isnull().sum()/len(weather.index)),2)

weather['Precip Type'].value_counts()

weather.loc[weather['Precip Type'].isnull(),'Precip Type']='rain'

round(100*(weather.isnull().sum()/len(weather.index)),2)

#Input binary values in type column
weather.loc[weather['Precip Type']=='rain','Precip Type']=1
weather.loc[weather['Precip Type']=='snow','Precip Type']=0

weather_num=weather[list(weather.dtypes[weather.dtypes!='odject'].index)]

weather_y = weather_num.pop('Temperature (C)')
weather_x = weather_num

train_x,test_x,train_y,test_y = train_test_split(weather_x,weather_y,test_size = 0.2,random_state=4)

train_x.head()

test_x.head()

to_drop = ['Formatted Date','Summary','Daily Summary']
weather.drop(to_drop, inplace=True, axis=1)
train_x.drop(to_drop, inplace=True, axis=1)
test_x.drop(to_drop, inplace=True, axis=1)

train_x.head()

test_x.head()

train_x.head()

model = LinearRegression()
model.fit(train_x,train_y)

prediction = model.predict(test_x)

## Calculating the error
np.mean((prediction-test_y)**2)

pd.DataFrame({'actual':test_y,
             'prediction':prediction,
             'diff':(test_y-prediction)})


#**************************************************************************************************

##Using Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(train_x,train_y)

prediction3 = regressor.predict(test_x)
np.mean((prediction3-test_y)**2)

pd.DataFrame({'actual':test_y,
             'prediction':prediction3,
             'diff':(test_y-prediction3)})

#**************************************************************************************************

## Using Random Forest 
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=10,random_state=0,n_estimators=100)
regr.fit(train_x,train_y)

prediction4 = regr.predict(test_x)
np.mean((prediction4-test_y)**2)

pd.DataFrame({'actual':test_y,
             'prediction':prediction4,
             'diff':(test_y-prediction4)})

#**************************************************************************************************