<a href="https://colab.research.google.com/github/shengchishih/Machine-Learning-and-Deep-Learning/blob/main/Machine%20Learning-Rain_Prediction_in_Australia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About The Dataset

The original source of the data is Australian Government's Bureau of Meteorology and the latest data can be gathered from http://www.bom.gov.au/climate/dwo/.

The dataset used here has extra columns such as 'RainToday', and our target is 'RainTomorrow'. It was obtained from the Rattle package at https://bitbucket.org/kayontoga/rattle/src/master/data/weatherAUS.RData

This dataset contains observations of weather metrics for each day from 2008 to 2017.

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Remote location of the weather data CSV, split across lines for readability.
file_path = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/'
    'Weather_Data.csv'
)

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

(3271, 22)

# Data Preprocessing



In [None]:
# One-hot encode the categorical predictors: each listed column is replaced by
# one indicator column per category value.
categorical_columns = ['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
df_sydney_processed = pd.get_dummies(df, columns=categorical_columns)

Next, we replace the values of the 'RainTomorrow' column, changing it from a categorical column to a binary column. We do not use the get_dummies method because we would end up with two columns for 'RainTomorrow', which we do not want, since 'RainTomorrow' is our target.

In [None]:
# Encode the target as 0/1. The replacement is restricted to 'RainTomorrow' so
# that a literal 'No'/'Yes' in any other column can never be rewritten by
# accident, and the result is assigned back instead of mutating the frame with
# inplace=True. Re-running the cell is harmless: values already mapped to 0/1
# are left unchanged by replace().
df_sydney_processed['RainTomorrow'] = df_sydney_processed['RainTomorrow'].replace({'No': 0, 'Yes': 1})

# Training Data and Test Data

In [None]:
#Now, we set our 'features' or x values and our Y or target variable.

# Drop the raw date string and cast everything to float in one non-mutating
# chain. errors='ignore' keeps the cell re-runnable: on a second execution
# 'Date' is already gone and the drop becomes a no-op instead of a KeyError.
df_sydney_processed = (
    df_sydney_processed
    .drop(columns='Date', errors='ignore')
    .astype(float)
)

# Predictors are every column except the target; the target is kept as a Series.
features = df_sydney_processed.drop(columns='RainTomorrow')
Y = df_sydney_processed['RainTomorrow']

# Linear Regression

Q1) Use the train_test_split function to split the features and Y dataframes with a test_size of 0.2 and the random_state set to 10.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    Y,
    random_state=10,
    test_size=0.2,
)


Q2) Create and train a Linear Regression model called LinearReg using the training data (x_train, y_train).

In [None]:
# Linear regression model fitted on the training split.
LinearReg = LinearRegression()
LinearReg.fit(X=X_train, y=Y_train)

Q3) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [None]:
# Continuous predictions of the linear model for the held-out rows.
predictions_Linear = LinearReg.predict(X=X_test)

Q4) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [None]:
# Error metrics of the linear model on the held-out split. The scikit-learn
# helpers replace the hand-rolled numpy expressions, and their import lives in
# the notebook's import cell rather than in the middle of the analysis.
LinearRegression_MAE = mean_absolute_error(Y_test, predictions_Linear)
LinearRegression_MSE = mean_squared_error(Y_test, predictions_Linear)
LinearRegression_R2 = r2_score(Y_test, predictions_Linear)

Q5) Show the MAE, MSE, and R2 in a tabular format using data frame for the linear model.

In [None]:
# One-column table of the linear model's metrics, indexed by metric name.
pd.DataFrame(
    data=[LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2],
    index=['MAE_Linear', 'MSE_Linear', 'R2_Linear'],
)

Unnamed: 0,0
MAE_Linear,0.256309
MSE_Linear,0.115719
R2_Linear,0.427138


# KNN

Q6) Create and train a KNN model called KNN using the training data (x_train, y_train) with the n_neighbors parameter set to 4

In [None]:
# k-nearest-neighbours classifier (k = 4) fitted on the training split.
KNN = KNeighborsClassifier(n_neighbors=4)
KNN.fit(X=X_train, y=Y_train)

Q7) Now use the predict method on the testing data (x_test) and save it to the array predictions

In [None]:
# Class labels predicted by KNN for the held-out rows.
predictions_KNN = KNN.predict(X=X_test)

Q8) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [None]:
# Classification metrics for KNN on the held-out split.
KNN_Accuracy_Score = accuracy_score(y_true=Y_test, y_pred=predictions_KNN)
KNN_JaccardIndex = jaccard_score(y_true=Y_test, y_pred=predictions_KNN)
KNN_F1_Score = f1_score(y_true=Y_test, y_pred=predictions_KNN)

# One-column table of the three scores, indexed by metric name.
pd.DataFrame(
    data=[KNN_Accuracy_Score, KNN_JaccardIndex, KNN_F1_Score],
    index=['Accuracy Score_KNN', 'Jaccard Index_KNN', 'F1 Score_KNN'],
)

Unnamed: 0,0
Accuracy Score_KNN,0.818321
Jaccard Index_KNN,0.425121
F1 Score_KNN,0.59661


# Decision Tree

Q9) Create and train a Decision Tree model called Tree using the training data (x_train, y_train)

In [None]:
# Depth-limited decision tree with a fixed seed, fitted on the training split.
Tree = DecisionTreeClassifier(random_state=35, max_depth=4)
Tree.fit(X=X_train, y=Y_train)

Q10) Now use the predict method on the testing data (x_test) and save it to the array predictions

In [None]:
# Class labels predicted by the decision tree for the held-out rows.
predictions_Tree = Tree.predict(X=X_test)

Q11) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [None]:
# Classification metrics for the decision tree on the held-out split.
Tree_Accuracy_Score = accuracy_score(y_true=Y_test, y_pred=predictions_Tree)
Tree_JaccardIndex = jaccard_score(y_true=Y_test, y_pred=predictions_Tree)
Tree_F1_Score = f1_score(y_true=Y_test, y_pred=predictions_Tree)

# One-column table of the three scores, indexed by metric name.
pd.DataFrame(
    data=[Tree_Accuracy_Score, Tree_JaccardIndex, Tree_F1_Score],
    index=['Accuracy Score_Tree', 'Jaccard Index_Tree', 'F1 Score_Tree'],
)

Unnamed: 0,0
Accuracy Score_Tree,0.819847
Jaccard Index_Tree,0.470852
F1 Score_Tree,0.640244


# Logistic Regression

Q12) Use the train_test_split function to split the features and Y dataframes with a test_size of 0.2 and the random_state set to 1

In [None]:
# Second, independent 80/20 split (seed 1), stored under lowercase names so it
# does not overwrite the X_train/X_test/Y_train/Y_test split made earlier.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    Y,
    random_state=1,
    test_size=0.2,
)

Q13) Create and train a LogisticRegression model called LR using the training data (x_train, y_train) with the solver parameter set to liblinear

In [None]:
# Logistic regression with the liblinear solver and C=0.01, fitted on the
# lowercase (seed 1) training split.
LR = LogisticRegression(solver='liblinear', C=0.01)
LR.fit(X=x_train, y=y_train)

Q14) Now, use the predict and predict_proba methods on the testing data (x_test) and save it as 2 arrays predictions and predict_proba.

In [None]:
# Predict on x_test, the test half of the split LR was trained on
# (random_state=1). The uppercase X_test belongs to the earlier random_state=10
# split: its rows do not line up with y_test, and some of them were part of
# LR's training data, so scoring those predictions against y_test is invalid.
predict_proba = LR.predict_proba(x_test)

predictions_LR = LR.predict(x_test)

Q15) Using the predictions, predict_proba and the y_test dataframe calculate the value for each metric using the appropriate function.

In [None]:
# Classification metrics for logistic regression, scored against y_test.
LR_Accuracy_Score = accuracy_score(y_true=y_test, y_pred=predictions_LR)
LR_JaccardIndex = jaccard_score(y_true=y_test, y_pred=predictions_LR)
LR_F1_Score = f1_score(y_true=y_test, y_pred=predictions_LR)
# Log loss is computed from the predicted class probabilities, not the labels.
LR_Log_Loss = log_loss(y_test, predict_proba)

# One-column table of the four scores, indexed by metric name.
pd.DataFrame(
    data=[LR_Accuracy_Score, LR_JaccardIndex, LR_F1_Score, LR_Log_Loss],
    index=['Accuracy Score_LR', 'Jaccard Score_LR', 'F1 Score_LR', 'Log Loss_LR'],
)

Unnamed: 0,0
Accuracy Score_LR,0.624427
Jaccard Score_LR,0.133803
F1 Score_LR,0.236025
Log Loss_LR,0.933752


# SVM

Q16) Create and train a SVM model called SVM using the training data (x_train, y_train).

In [None]:
# RBF-kernel support vector classifier. The kernel is distance based, so the
# features are standardised first; fitted on the raw, unscaled columns the
# model scored a Jaccard index and F1 of 0.0 (it never predicted the positive
# class). Wrapping scaler and classifier in one pipeline means SVM.predict()
# applies the scaling learned from x_train to whatever it is given.
SVM = make_pipeline(preprocessing.StandardScaler(), svm.SVC(kernel='rbf'))
SVM.fit(x_train, y_train)

Q17) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [None]:
# Class labels predicted by the SVM for the lowercase (seed 1) test split.
predictions_SVM = SVM.predict(X=x_test)

Q18) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [None]:
# Classification metrics for the SVM, scored against y_test.
SVM_Accuracy_Score = accuracy_score(y_true=y_test, y_pred=predictions_SVM)
SVM_JaccardIndex = jaccard_score(y_true=y_test, y_pred=predictions_SVM)
SVM_F1_Score = f1_score(y_true=y_test, y_pred=predictions_SVM)

# One-column table of the three scores, indexed by metric name.
pd.DataFrame(
    data=[SVM_Accuracy_Score, SVM_JaccardIndex, SVM_F1_Score],
    index=['Accuracy Score_SVM', 'Jaccard Score_SVM', 'F1 Score_SVM'],
)

Unnamed: 0,0
Accuracy Score_SVM,0.722137
Jaccard Score_SVM,0.0
F1 Score_SVM,0.0


Q19) Show the Accuracy, Jaccard Index, F1-Score, and LogLoss in a tabular format using a data frame for all of the above models.

In [None]:
# Keep every metric label next to its value in a single mapping, so a label
# and its number cannot drift out of alignment the way two separately
# maintained parallel lists can.
results = {
    'MAE_Linear': LinearRegression_MAE,
    'MSE_Linear': LinearRegression_MSE,
    'R2_Linear': LinearRegression_R2,
    'Accuracy Score_KNN': KNN_Accuracy_Score,
    'Jaccard Index_KNN': KNN_JaccardIndex,
    'F1 Score_KNN': KNN_F1_Score,
    'Accuracy Score_Tree': Tree_Accuracy_Score,
    'Jaccard Index_Tree': Tree_JaccardIndex,
    'F1 Score_Tree': Tree_F1_Score,
    'Accuracy Score_LR': LR_Accuracy_Score,
    'Jaccard Score_LR': LR_JaccardIndex,
    'F1 Score_LR': LR_F1_Score,
    'Log Loss_LR': LR_Log_Loss,
    'Accuracy Score_SVM': SVM_Accuracy_Score,
    'Jaccard Score_SVM': SVM_JaccardIndex,
    'F1 Score_SVM': SVM_F1_Score,
}

# `score` and `value` remain available as plain lists under their original names.
score = list(results)
value = list(results.values())

# Leave the frame as the cell's last expression so the notebook renders it as
# a table instead of the plain-text dump that print() produces.
pd.DataFrame(value, score)

                            0
MAE_Linear           0.256309
MSE_Linear           0.115719
R2_Linear            0.427138
Accuracy Score_KNN   0.818321
Jaccard Index_KNN    0.425121
F1 Score_KNN         0.596610
Accuracy Score_Tree  0.819847
Jaccard Index_Tree   0.470852
F1 Score_Tree        0.640244
Accuracy Score_LR    0.624427
Jaccard Score_LR     0.133803
F1 Score_LR          0.236025
Log Loss_LR          0.933752
Accuracy Score_SVM   0.722137
Jaccard Score_SVM    0.000000
F1 Score_SVM         0.000000
