In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the weather history CSV from the notebook's input directory into a DataFrame.
df = pd.read_csv("../input/szeged-weather/weatherHistory.csv")
# Preview the first three rows.
df.head(3)

# Missing Value Analysis

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Back-fill missing values: each gap takes the next valid observation below it.
# DataFrame.bfill() replaces fillna(method="bfill"), which is deprecated in
# recent pandas releases.
df = df.bfill()
# Re-count the missing values to confirm the fill worked. Gaps at the very end
# of the frame have no later value to copy from and would remain missing.
df.isnull().sum()

In [None]:
# Show the (rows, columns) dimensions of the frame.
df.shape

In [None]:
# Summary statistics, transposed so that each row describes one column of df.
df.describe().T

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True is required because the frame still contains text columns
# at this point (they are only encoded or dropped in later cells); pandas 2.x
# no longer skips such columns silently and raises an error instead.
df.corr(numeric_only=True)

In this dataset we will predict the "Temperature (C)" variable. The correlation table shows that "Apparent Temperature (C)" is almost perfectly correlated with this dependent variable, so keeping it as a predictor would make the task trivial. We will therefore drop it from the dataset later.

# Categorical Variables

In [None]:
from sklearn.preprocessing import LabelEncoder
# Convert the "Summary" column to integer codes, one code per distinct value.
le = LabelEncoder()
df["Summary"] = le.fit_transform(df["Summary"])
# NOTE(review): "Summary" is dropped again before modelling, so this encoding
# never reaches the models — confirm whether it is still needed.

In [None]:
# Preview the first three rows after encoding "Summary".
df.head(3)

In [None]:
# Parse the "Formatted Date" strings into timestamps so that year, month and
# day can be extracted from them.
# The format includes a UTC offset (%z). When the offsets are not all identical
# (e.g. across daylight-saving changes), pandas cannot build a single datetime
# column without utc=True: older versions fall back to an object column, newer
# ones warn or raise. Parsing as UTC and converting back to the local zone
# yields a proper timezone-aware datetime column with the local wall-clock
# values preserved.
# NOTE(review): "Europe/Budapest" assumes the readings are in Szeged (Hungary)
# local time, as the dataset name suggests — confirm.
df["Formatted Date"] = pd.to_datetime(
    df["Formatted Date"], format="%Y-%m-%d %H:%M:%S.%f %z", utc=True
).dt.tz_convert("Europe/Budapest")

In [None]:
# Split the parsed date into separate "year", "month" and "day" columns by
# reading the matching attribute from every timestamp.
for part in ("year", "month", "day"):
    df[part] = df["Formatted Date"].apply(lambda stamp, field=part: getattr(stamp, field))

In [None]:
# One-hot encode "Precip Type" and keep its "rain" and "snow" indicator columns.
dms = pd.get_dummies(df["Precip Type"])
precip_flags = dms[["rain", "snow"]]

# Columns removed before modelling, including the column that was just encoded
# and "Apparent Temperature (C)".
unused_columns = [
    "Formatted Date", "Summary", "Daily Summary",
    "Precip Type", "Loud Cover", "Apparent Temperature (C)",
]
df = pd.concat([df, precip_flags], axis=1).drop(columns=unused_columns)

The "Precip Type" variable is categorical. To use it in our model it must be converted to numerical values, which we do with the "get_dummies" method. This method creates a new indicator column for each category of the variable. We add these new columns to the dataset and drop the original categorical column. In addition, we remove the variables that will not be useful for modeling.

In [None]:
# Replace the original column headers with short snake_case names.
short_names = {
    "Temperature (C)": "temperature",
    "Humidity": "humidity",
    "Wind Speed (km/h)": "wind_speed",
    "Wind Bearing (degrees)": "wind_bearing",
    "Visibility (km)": "visibility",
    "Pressure (millibars)": "pressure",
}
df = df.rename(columns=short_names)
df.head()

I renamed the columns to make them shorter and easier to work with.

# Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target ("temperature") from the independent variables.
features = df.drop("temperature", axis=1)
y = df["temperature"]

# Split BEFORE scaling: hold out 25% of the rows as the test set. The fixed
# random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features, y, test_size=0.25, random_state=42
)

# Fit the scaler on the training rows only, so the test rows do not influence
# the mean and standard deviation used for standardization (no data leakage).
sc = StandardScaler().fit(x_train)


def _scale(frame):
    """Return `frame` standardized with `sc`, keeping its column names and index."""
    return pd.DataFrame(sc.transform(frame), columns=frame.columns, index=frame.index)


x_train = _scale(x_train)
x_test = _scale(x_test)
# `x` remains available as the scaled version of the full feature set.
x = _scale(features)
x.head()

After selecting the dependent and independent variables, we standardize the independent variables and split the data into two sets: a "train" set to fit the models and a "test" set to measure their performance. The "test" set receives 25% of the data.

## Test Accuracy

In [None]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet,RidgeCV,LassoCV,ElasticNetCV
from sklearn.metrics import mean_squared_error,r2_score

# Fit each regularized regressor with its default settings on the training set.
ridge_model = Ridge().fit(x_train, y_train)
lasso_model = Lasso().fit(x_train, y_train)
elastic_model = ElasticNet().fit(x_train, y_train)

# Report each model's R2 on the test set, multiplied by 100.
test_reports = (
    ("Ridge Regression Test R2 Score : %", ridge_model),
    ("Lasso Regression Test R2 Score : %", lasso_model),
    ("ElasticNet Regression Test R2 Score : %", elastic_model),
)
for label, fitted in test_reports:
    y_pred = fitted.predict(x_test)
    print(label, r2_score(y_test, y_pred) * 100)

## Train Accuracy

In [None]:
# Refit the default models on the training set.
ridge_model = Ridge().fit(x_train, y_train)
lasso_model = Lasso().fit(x_train, y_train)
elastic_model = ElasticNet().fit(x_train, y_train)

# Report each model's R2 on the same training data it was fitted on,
# multiplied by 100, for comparison with the test scores.
train_reports = (
    ("Ridge Regression Train R2 Score : %", ridge_model),
    ("Lasso Regression Train R2 Score : %", lasso_model),
    ("ElasticNet Regression Train R2 Score : %", elastic_model),
)
for label, fitted in train_reports:
    y_pred = fitted.predict(x_train)
    print(label, r2_score(y_train, y_pred) * 100)

# Model Tuning

In Ridge, Lasso and ElasticNet, "alpha" (the regularization strength) is the key hyperparameter. So we try a range of candidate values with cross-validation and use whichever performs best.

In [None]:
# Candidate regularization strengths: 100 values evenly spaced on a log scale,
# running from 0.5 * 10**10 down to 0.5 * 10**-2.
lamdbalar = 0.5 * np.logspace(10, -2, 100)

In [None]:
# Build one cross-validated estimator per model family, all searching the same
# grid of candidate alphas.
ridge_cv = RidgeCV(alphas=lamdbalar)
lasso_cv = LassoCV(alphas=lamdbalar)
elastic_cv = ElasticNetCV(alphas=lamdbalar)

# Fit each estimator on the training set; the selected alpha is then available
# as its `alpha_` attribute.
for searcher in (ridge_cv, lasso_cv, elastic_cv):
    searcher.fit(x_train, y_train)

In [None]:
# Print the alpha chosen by cross-validation, in the order Ridge, Lasso, ElasticNet.
for searcher in (ridge_cv, lasso_cv, elastic_cv):
    print(searcher.alpha_)

In [None]:
# Refit each model on the training set with the alpha selected by cross-validation.
ridge_tuned = Ridge(alpha=ridge_cv.alpha_).fit(x_train, y_train)
lasso_tuned = Lasso(alpha=lasso_cv.alpha_).fit(x_train, y_train)
elastic_tuned = ElasticNet(alpha=elastic_cv.alpha_).fit(x_train, y_train)

# Report each tuned model's R2 on the test set, multiplied by 100.
tuned_reports = (
    ("Tuned Ridge Regression R2 Score : %", ridge_tuned),
    ("Tuned Lasso Regression R2 Score : %", lasso_tuned),
    ("Tuned ElasticNet Regression R2 Score : %", elastic_tuned),
)
for label, fitted in tuned_reports:
    y_pred = fitted.predict(x_test)
    print(label, r2_score(y_test, y_pred) * 100)