For this project I picked weather data from Szeged, and I'll try various regression models to see which one performs best.

My task is to predict the temperature, build an interactive PowerBI dashboard to understand the data, and show a dataframe with the results at the end.

Here is the link to the public PowerBI dashboard:

https://app.powerbi.com/view?r=eyJrIjoiOTJiMWVjMWUtOTlhMS00YzljLTg5MjItZTM4NmUxNzBjZDNhIiwidCI6ImQyZjljZjBlLTc0ZDEtNGNiMi1hZTk5LWRmZTYyMjkxOGQ1MCIsImMiOjl9



In [None]:

import numpy as np 
import pandas as pd 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, entry) for entry in entries]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. **Importing data**

In [None]:
dataset = pd.read_csv("../input/szeged-weather/weatherHistory.csv")

# 2. **Checking data if there are some nulls and NaNs**

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
dataset.isnull().sum() #there are 517 null values which should be replaced

# 3. **Preprocessing**

In [None]:
#checking which value is the most frequent
dataset["Precip Type"].mode()[0] 

In [None]:
#filling the missing precipitation types with the most frequent value (noted as "rain" for this data)
most_frequent_precip = dataset["Precip Type"].mode()[0]
dataset["Precip Type"] = dataset["Precip Type"].fillna(most_frequent_precip)


In [None]:
dataset.isna().sum() 

In [None]:
#parsing the timestamp column into timezone-aware datetimes
#NOTE(review): the "%z" offsets presumably alternate between winter and summer time - confirm against the CSV.
#Recent pandas versions warn or raise on mixed offsets unless utc=True, so the values are parsed
#as UTC first and then converted back to local time (Szeged lies in the Europe/Budapest zone),
#so that the year/month/day extracted later still refer to the local calendar date.
dataset["Formatted Date"] = pd.to_datetime(
    dataset["Formatted Date"], format="%Y-%m-%d %H:%M:%S.%f %z", utc=True
).dt.tz_convert("Europe/Budapest")

In [None]:
#checking dataset
dataset

In [None]:
#checking the number of unique values in each column
{column: len(dataset[column].unique()) for column in dataset.columns} 

In [None]:
#"Loud Cover" holds a single unique value, so it is dropped together with the "Daily Summary" column
uninformative_columns = ["Loud Cover", "Daily Summary"]
dataset = dataset.drop(columns=uninformative_columns)

In [None]:
#checking correlations between the numeric columns only
#(the text and date columns cannot be correlated; pandas >= 2.0 raises on them instead of
#silently skipping them, so they are filtered out explicitly)
dataset.select_dtypes(include="number").corr()

In [None]:
#apparent temperature is highly correlated with the target temperature, so it is removed
dataset = dataset.drop(columns=["Apparent Temperature (C)"])

In [None]:
dataset

In [None]:
{column: len(dataset) for column in dataset.columns}

In [None]:
len((dataset.columns))

In [None]:
X = dataset

In [None]:
{column: len(X[column].unique()) for column in X.columns}

In [None]:
#splitting the timestamp into separate year / month / day columns
for part in ("year", "month", "day"):
    # part=part binds the current name so each lambda reads its own attribute
    dataset[part] = dataset["Formatted Date"].apply(lambda ts, part=part: getattr(ts, part))

In [None]:
dataset

In [None]:
dataset.info()

In [None]:
dataset = dataset.drop(["Formatted Date"], axis=1)

In [None]:
dataset

# 4. **Label encoding text values**

In [None]:
le = preprocessing.LabelEncoder()

In [None]:
le.fit(dataset["Summary"])


In [None]:
list(le.classes_)

In [None]:
dataset["Summary"] = le.transform(dataset["Summary"])

In [None]:
dataset

In [None]:
le.fit(dataset["Precip Type"])

In [None]:
dataset["Precip Type"] = le.transform(dataset["Precip Type"])

In [None]:
#all columns are numeric now (scaling is done later, before model fitting)
dataset.info()

# 5. **Test, train split and fitting to models**

In [None]:
y = dataset["Temperature (C)"]

In [None]:
X = dataset.drop(["Temperature (C)"], axis = 1)

In [None]:
X

In [None]:
sc = StandardScaler()
sc.fit(X)
X = pd.DataFrame(sc.transform(X), columns=X.columns)

In [None]:
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

In [None]:
#dictionary mapping a display name to an (untrained) regressor
#random_state makes the randomised tree models repeatable, matching the seeded train/test split;
#n_jobs=-1 lets the parallelisable estimators use all CPU cores (speed only, same predictions)
models = {
    "Linear regression": LinearRegression(),
    "Ridge regression": Ridge(),
    "Lasso regression": Lasso(),
    "Elastic Net regression": ElasticNet(),
    "K-nearest Neighbors regression": KNeighborsRegressor(n_jobs=-1),
    "Decision Tree regression": DecisionTreeRegressor(random_state=40),
    'Support Vector Machine regression': SVR(),
    "Random Forest Regression": RandomForestRegressor(n_jobs=-1, random_state=40),
}

In [None]:
models

In [None]:
#training models
#fit() returns the fitted estimator itself (not predictions), so its return value is not stored
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " Trained")
    

In [None]:
#printing the R^2 score of every trained model on the test set
for name, model in models.items():
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    print(f"{name} R^2: {score:.8f}")

In [None]:
#printing RMSE results
#the models are already fitted by the training loop; refitting here would repeat the slow
#training and, for randomised models, score a different fit than the one used for R^2
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(name + " RMSE: {:.8f}".format(np.sqrt(mean_squared_error(y_test, y_pred))))

# 6. Preparing predicted results r2 score and RMSE for PowerBI dataframe 

In [None]:
lr = LinearRegression()
lr.fit(X_train,y_train)

y_pred_lr = lr.predict(X_test)

In [None]:
rr = Ridge()
rr.fit(X_train,y_train)

y_pred_rr = rr.predict(X_test)

In [None]:
lsr = Lasso()
lsr.fit(X_train,y_train)
y_pred_lsr = lsr.predict(X_test)

In [None]:
enr = ElasticNet()
enr.fit(X_train,y_train)
y_pred_enr = enr.predict(X_test)

In [None]:
knr = KNeighborsRegressor()
knr.fit(X_train,y_train)
y_pred_knr = knr.predict(X_test)

In [None]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train,y_train)
y_pred_dtr = dtr.predict(X_test)

In [None]:
svr = SVR()
svr.fit(X_train,y_train)
y_pred_svr = svr.predict(X_test)

In [None]:
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
  
y_pred_rfr = rfr.predict(X_test)

In [None]:
r2_score(y_test, y_pred_rfr)

In [None]:
np.sqrt(mean_squared_error(y_test,y_pred_rfr))

In [None]:
model_eval = pd.DataFrame(index=["Linear Regression"], columns=["r2 score", "RMSE"])

Preparing the dataframe for the PowerBI dashboard

In [None]:
#one row per model: R^2 and RMSE on the test set, stored as strings with 8 decimals
predictions_by_model = {
    "Linear Regression": y_pred_lr,
    "Ridge Regression": y_pred_rr,
    "Lasso Regression": y_pred_lsr,
    "Elastic Net Regression": y_pred_enr,
    "K-nearest Neighbors Regression": y_pred_knr,
    "Decision Tree Regression": y_pred_dtr,
    "Support Vector Machine Regression": y_pred_svr,
    "Random Forest Regression": y_pred_rfr,
}

for label, predicted in predictions_by_model.items():
    # assigning to a label that is not in the index yet appends a new row
    model_eval.loc[label, "r2 score"] = "{:.8f}".format(r2_score(y_test, predicted))
    model_eval.loc[label, "RMSE"] = "{:.8f}".format(np.sqrt(mean_squared_error(y_test, predicted)))

#moving the model names from the index into a regular column
model_eval.reset_index(inplace=True)

In [None]:
#checking results in dataframe ready to pipe to PowerBI 
model_eval

# 7. Conclusion

As we can see, the best results were delivered by Random Forest. The best possible R^2 score is 1; I managed to reach 0.96 without much tuning, so the score could probably be improved further.

The RMSE is 1.85, which means the Random Forest predictions deviate from the true temperature by roughly 1.85 °C (root-mean-square error).

A big disadvantage of Random Forest, and also of Decision Tree, is slow training: training all the models took about 20 minutes, and 90% of that time was spent on these two.