We will use the dataset available at https://www.kaggle.com/jsphyg/weather-dataset-rattle-package. Since we are at the beginning of the machine learning topics, we will not be concerned with NaN values. The variables we will use are: 'Location', 'MinTemp', 'MaxTemp', 'WindGustDir', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Pressure9am', 'Pressure3pm'.

You are expected to develop a model that predicts the maximum temperature from these variables, using the Multiple Linear, Lasso, Ridge, ElasticNet and Polynomial regressions we have covered so far. Cross-validation and dummy variables will also be used while performing these operations. The performance measurements of the models will be listed in an Excel table, and the results will be reported by making a prediction for 1 observation. You can reach the Excel table from this link: "https://github.com/pycoders-nl/Class4-Machine_Learning-Week22/blob/main/performance_table.xls". Note: in the Excel table, sheet 1 contains the observation and sheet 2 contains the fields in which you will fill in the performance measurements.

Success!

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures


In [None]:

# Load the raw weather observations from the Kaggle input directory.
df = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")
# head must be *called*: the bare attribute `df.head` only displays the bound
# method object instead of returning the first rows of the frame.
df.head()

In [None]:
# Display the (rows, columns) dimensions of the raw DataFrame.
df.shape

In [None]:
# Keep only the target (MaxTemp) and the candidate predictors, discard every
# row that has a missing value in any of them, and renumber the rows from 0.
data = (
    df[[
        'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustDir',
        'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
        'Pressure9am', 'Pressure3pm', 'Humidity9am', 'Humidity3pm',
    ]]
    .dropna()
    .reset_index(drop=True)
)
data.head()

In [None]:
# Pairwise correlations of the numeric columns only. The frame also holds
# text columns (they are one-hot encoded later); pandas >= 2.0 raises on them
# in corr() instead of silently dropping them, so they are excluded explicitly.
data.select_dtypes(include='number').corr()

# Pressure9am and Pressure3pm are highly correlated, as expected. MaxTemp and MinTemp are also fairly strongly correlated.

In [None]:
# Heatmap of the correlation matrix of the numeric columns. Text columns are
# excluded explicitly because pandas >= 2.0 no longer drops them in corr().
sns.heatmap(data.select_dtypes(include='number').corr(), cmap='mako')

In [None]:
# Summary statistics of the selected columns, transposed so that each
# variable is a row and each statistic is a column.
data.describe().T

# Pressure9am and Pressure3pm have a small std.

In [None]:
# Draw one box plot per column reported by describe(), i.e. the columns for
# which summary statistics were computed.
col = data.describe().columns.tolist()
for column_name in col:
    # Open a fresh figure so that each box plot gets its own canvas.
    plt.figure()
    sns.boxplot(x=column_name, data=data)

In [None]:
# One-hot encode the categorical (text) columns.
# drop_first=True drops one level per categorical variable: with every level
# kept, the dummies of a variable always sum to 1 and are perfectly collinear
# with the model intercept (the "dummy variable trap").
# dtype=float keeps the encoded frame purely numeric; pandas >= 2.0 would
# otherwise create bool columns.
data_dummy = pd.get_dummies(data, drop_first=True, dtype=float)
data_dummy.head()

In [None]:
# Target vector: the MaxTemp column of the encoded frame.
y = data_dummy['MaxTemp']
# Feature matrix: every other column of the encoded frame.
X = data_dummy.drop(columns=['MaxTemp'])
X

In [None]:
# Display the target series (the MaxTemp column).
y

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=46,
)

In [None]:
# Report the dimensions of each part of the train/test split.
print (' Test ve Train veri setlerindeki gözlem sayıları...')
for label, part in (
    ('X_train: ', X_train),
    ('y_train: ', y_train),
    ('X_test: ', X_test),
    ('y_test: ', y_test),
):
    print(label, np.shape(part))

In [None]:
# Fit an ordinary least squares model with statsmodels on the training split.
# sm.OLS does not add an intercept on its own, so a constant column is
# appended first. The augmented matrix gets its own name so that X_train is
# not overwritten and keeps exactly the same columns as X_test.
X_train_const = sm.add_constant(X_train)
lm = sm.OLS(y_train, X_train_const)
model = lm.fit()
model.summary()

According to the results, the R-squared is above 0.7, i.e. the model explains more than 70% of the variance of MaxTemp on the training data. This indicates a moderately good fit.

# Sklearn model

In [None]:
# Rebuild the target and feature matrix, redo the 80/20 split (seed 42) and
# fit scikit-learn's linear regression on the training part.
y = data_dummy['MaxTemp']
X = data_dummy.drop(columns=['MaxTemp'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Fit quality of the linear model on the training split.
print('train r2',model.score(X_train, y_train))
print('train rmse',np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
print('-'*40)

# Generalisation to the held-out test split.
print('test r2',model.score(X_test, y_test))
print('test rmse',np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
print('-'*40)

# 10-fold cross-validation, both metrics on the training split. The RMSE was
# previously cross-validated on X_test/y_test, which refits the model on the
# hold-out data and is not comparable with the r2 figure.
print('r2 crossvalid',cross_val_score(model, X_train, y_train, cv = 10, scoring = 'r2').mean())
# cross_val_score returns negated MSE per fold; negate, take the root of each
# fold and average the per-fold RMSE values.
print('mean squaroot error-crossvalid',np.sqrt(-cross_val_score(estimator=model, X=X_train, y=y_train, scoring = "neg_mean_squared_error", cv=10)).mean())

# Ridge Regression

In [None]:
# Baseline ridge model with the default penalty strength.
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

# Candidate penalty strengths for the cross-validated search: 50 evenly
# spaced values from 5 down to 0.01.
alpha_space = np.linspace(5, 0.01, 50)

# Cross-validated ridge over the candidate alphas, scored by (negated) MSE.
# The stray `Ridge(alpha_space)` statement was removed: it built an estimator
# with an array as alpha and immediately discarded it.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer versions it has to be replaced by an explicit scaling step --
# confirm the scikit-learn version this notebook targets.
alphaCV = RidgeCV(alphas=alpha_space,
                  scoring='neg_mean_squared_error',
                  normalize=True)

In [None]:
# Run the cross-validated alpha search on the training split.
alphaCV.fit(X_train, y_train)

In [None]:
# Display the penalty strength selected by the fitted RidgeCV search.
alphaCV.alpha_

In [None]:
# Refit ridge with the penalty selected by the fitted RidgeCV search instead
# of a hand-typed constant: the previous value 0.001 ignored the search result
# and is not even inside the searched range (0.01 .. 5).
ridge_model = Ridge(alpha=alphaCV.alpha_, normalize=True)
ridge_model.fit(X_train, y_train)

In [None]:
# Fit quality of the ridge model on the training split.
print('train r2',ridge_model.score(X_train, y_train))
print('train rmse',np.sqrt(mean_squared_error(y_train, ridge_model.predict(X_train))))
print('-'*40)

# Generalisation to the held-out test split.
print('test r2',ridge_model.score(X_test, y_test))
print('test rmse',np.sqrt(mean_squared_error(y_test, ridge_model.predict(X_test))))
print('-'*40)

# 10-fold cross-validation, both metrics on the training split. The RMSE was
# previously cross-validated on X_test/y_test, which refits the model on the
# hold-out data and is not comparable with the r2 figure.
print('r2 crossvalid',cross_val_score(ridge_model, X_train, y_train, cv = 10, scoring = 'r2').mean())
# Scores are negated MSE per fold: negate, take the root, average over folds.
print('mean squaroot error-crossvalid',np.sqrt(-cross_val_score(estimator=ridge_model, X=X_train, y=y_train, scoring = "neg_mean_squared_error", cv=10)).mean())

# Lasso

In [None]:
# Baseline lasso with the default penalty strength.
lasso_model = Lasso().fit(X_train, y_train)

# 5-fold cross-validated search over 100 evenly spaced alphas in [0.01, 20].
alpha_space = np.linspace(0.01, 20, 100)
lasso_cv_model = LassoCV(alphas=alpha_space, cv=5)
lasso_cv_model.fit(X_train, y_train)
# Display the penalty strength the search selected.
lasso_cv_model.alpha_

In [None]:
# Refit lasso with the penalty selected by the fitted LassoCV search rather
# than a hard-coded 0.01, so the model follows the search result if the grid
# or the data changes.
lasso_model = Lasso(alpha=lasso_cv_model.alpha_).fit(X_train, y_train)

In [None]:
# Fit quality of the lasso model on the training split.
print('train r2',lasso_model.score(X_train, y_train))
print('train rmse',np.sqrt(mean_squared_error(y_train, lasso_model.predict(X_train))))
print('-'*40)

# Generalisation to the held-out test split.
print('test r2',lasso_model.score(X_test, y_test))
print('test rmse',np.sqrt(mean_squared_error(y_test, lasso_model.predict(X_test))))
print('-'*40)

# 10-fold cross-validation, both metrics on the training split. The RMSE was
# previously cross-validated on X_test/y_test, which refits the model on the
# hold-out data and is not comparable with the r2 figure.
print('r2 crossvalid',cross_val_score(lasso_model, X_train, y_train, cv = 10, scoring = 'r2').mean())
# Scores are negated MSE per fold: negate, take the root, average over folds.
print('mean squaroot error-crossvalid',np.sqrt(-cross_val_score(estimator=lasso_model, X=X_train, y=y_train, scoring = "neg_mean_squared_error", cv=10)).mean())

# Elastic

In [None]:
# Cross-validated elastic net over the alpha grid currently bound to
# `alpha_space`, with an equal L1/L2 mix (l1_ratio fixed at 0.5).
elastic_model = ElasticNetCV(alphas=alpha_space, l1_ratio=0.5).fit(X_train, y_train)
# Display the mixing ratio stored on the fitted estimator.
elastic_model.l1_ratio_

In [None]:
# Display the penalty strength selected by the fitted ElasticNetCV search.
elastic_model.alpha_

In [None]:
# Fit quality of the elastic net model on the training split.
print('train r2',elastic_model.score(X_train, y_train))
print('train rmse',np.sqrt(mean_squared_error(y_train, elastic_model.predict(X_train))))
print('-'*40)

# Generalisation to the held-out test split.
print('test r2',elastic_model.score(X_test, y_test))
print('test rmse',np.sqrt(mean_squared_error(y_test, elastic_model.predict(X_test))))
print('-'*40)

# 10-fold cross-validation, both metrics on the training split. The RMSE was
# previously cross-validated on X_test/y_test, which refits the model on the
# hold-out data and is not comparable with the r2 figure.
print('r2 crossvalid',cross_val_score(elastic_model, X_train, y_train, cv = 10, scoring = 'r2').mean())
# Scores are negated MSE per fold: negate, take the root, average over folds.
print('mean squaroot error-crossvalid',np.sqrt(-cross_val_score(estimator=elastic_model, X=X_train, y=y_train, scoring = "neg_mean_squared_error", cv=10)).mean())

# Polynomial Regression

In [None]:
# Degree-2 polynomial regression: expand the features, then fit a plain
# linear model on the expanded training matrix.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_poly_train = poly.transform(X_train)
# The test split is expanded with the transformer fitted on the training data.
X_poly_test = poly.transform(X_test)

pol_model = LinearRegression().fit(X_poly_train, y_train)

# Predictions on both splits.
y_pred_train = pol_model.predict(X_poly_train)
y_pred = pol_model.predict(X_poly_test)

In [None]:
# Second degree-2 polynomial model, fitted on the expanded training matrix.
# NOTE: this cell rebinds X_train and X_test to their polynomial expansions,
# so every later use of those names sees the expanded matrices.
poly = PolynomialFeatures(2)
X_train = poly.fit_transform(X_train)
# Only transform the test split: the transformer must be fitted on the
# training data alone, not refitted on the hold-out data.
X_test = poly.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_train, y_train)

# Print the scores explicitly; as bare expressions in the middle of a cell
# they were computed and silently discarded.
print('train r2', poly_model.score(X_train, y_train))
print('test r2', poly_model.score(X_test, y_test))
print('train rmse', np.sqrt(mean_squared_error(y_train, poly_model.predict(X_train))))
print('test rmse', np.sqrt(mean_squared_error(y_test, poly_model.predict(X_test))))

# Mean 10-fold cross-validated r2 on the training split (cell output).
cross_val_score(poly_model, X_train, y_train, cv = 10, scoring = 'r2').mean()

In [None]:
# pol_model was fitted on the polynomial expansion X_poly_train, so it is
# evaluated on X_poly_train / X_poly_test. Scoring it on X_train / X_test only
# works if those names happen to have been rebound to expanded matrices.
print('train r2',pol_model.score(X_poly_train, y_train))
print('train rmse',np.sqrt(mean_squared_error(y_train, pol_model.predict(X_poly_train))))
print('-'*40)

# Generalisation to the held-out test split.
print('test r2',pol_model.score(X_poly_test, y_test))
print('test rmse',np.sqrt(mean_squared_error(y_test, pol_model.predict(X_poly_test))))
print('-'*40)

# 10-fold cross-validation, both metrics on the (expanded) training split. The
# RMSE was previously cross-validated on the test split, which refits the
# model on the hold-out data and is not comparable with the r2 figure.
print('r2 crossvalid',cross_val_score(pol_model, X_poly_train, y_train, cv = 10, scoring = 'r2').mean())
# Scores are negated MSE per fold: negate, take the root, average over folds.
print('mean squaroot error-crossvalid',np.sqrt(-cross_val_score(estimator=pol_model, X=X_poly_train, y=y_train, scoring = "neg_mean_squared_error", cv=10)).mean())