#### In this project, we use the following algorithms to build your models:

1.  Linear Regression
2.  KNN
3.  Decision Trees
4.  Logistic Regression
5.  SVM

#### The results are reported as the accuracy of each classifier, using the following metrics when these are applicable:

1. Accuracy Score
2. Jaccard Index
3. F1-Score
4. LogLoss
5. Mean Absolute Error
6. Mean Squared Error
7. R2-Score

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

In [2]:
path='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv'
df = pd.read_csv(path)

In [3]:
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [26]:
df.describe(include = "object")

Unnamed: 0,Date,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
count,3271,3271,3271,3271,3271,3271
unique,3271,16,16,16,2,2
top,1/15/2012,W,W,E,No,No
freq,1,1425,1260,624,2422,2422


In [5]:
# Data Processing

# One Hot Encoding
df_sydney_processed = pd.get_dummies(data=df, 
                                     columns=['RainToday', 'WindGustDir', 
                                              'WindDir9am', 'WindDir3pm'])

In [13]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

# Training Data and Test Data

In [31]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [32]:
df_sydney_processed = df_sydney_processed.astype(float)

In [33]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

### Linear Regression

#### Q1) Use the `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `10`.

In [35]:
x_train, x_test, y_train, y_test = train_test_split(features,
                                                    Y,
                                                    test_size = 0.2,
                                                    random_state = 10)

#### Q2) Create and train a Linear Regression model called LinearReg using the training data ( `x_train` ,  `y_train` ).

In [39]:
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)

LinearRegression()

#### Q3) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [42]:
prediction = LinearReg.predict(x_test)

#### Q4) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [47]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

LinearRegression_MAE = mean_absolute_error(y_test, prediction)
LinearRegression_MSE = mean_squared_error(y_test, prediction)
LinearRegression_R2 = r2_score(y_test, prediction)

In [80]:
Report = pd.DataFrame(columns = ["MAE", "MSE", "R2"])
Report.loc[0, "MAE"]=LinearRegression_MAE
Report.loc[0, "MSE"]=LinearRegression_MSE
Report.loc[0, "R2"]=LinearRegression_R2
Report.index = ["Linear Regression"]
Report

Unnamed: 0,MAE,MSE,R2
Linear Regression,0.256319,0.115721,0.427128


# KNN

#### Q6) Create and train a KNN model called KNN using the training data (x_train, y_train) with the n_neighbors parameter set to 4.¶

In [73]:
k = 4
KNN = KNeighborsClassifier(n_neighbors = k).fit(x_train,y_train)


#### Q7) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [74]:
prediction_KNN = KNN.predict(x_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Q8) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.

In [75]:
KNN_Accuracy_Score = metrics.accuracy_score(y_test, prediction_KNN)
KNN_JaccardIndex = jaccard_score(y_test, prediction_KNN)
KNN_F1_Score = f1_score(y_test, prediction_KNN)

In [87]:
Report_KNN = pd.DataFrame()
Report_KNN.loc[0, "Accuracy"]=KNN_Accuracy_Score
Report_KNN.loc[0, "Jaccard"]=KNN_JaccardIndex
Report_KNN.loc[0, "F1_Score"]=KNN_F1_Score
Report_KNN.index = ["KNN"]
Report_KNN


Unnamed: 0,Accuracy,Jaccard,F1_Score
KNN,0.818321,0.425121,0.59661


## Decision Tree
#### Q9) Create and train a Decision Tree model called Tree using the training data (`x_train`, `y_train`).

In [89]:
Tree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

In [90]:
Tree.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

#### Q10) Now use the predict method on the testing data (x_test) and save it to the array predictions.

In [91]:
predict_Tree = Tree.predict(x_test)

#### Q11) Using the predictions and the y_test dataframe calculate the value for each metric using the appropriate function.¶

In [92]:
Tree_Accuracy_Score = metrics.accuracy_score(y_test, predict_Tree)
Tree_JaccardIndex = jaccard_score(y_test, predict_Tree)
Tree_F1_Score = f1_score(y_test, predict_Tree)

In [93]:
Report_Tree = pd.DataFrame()
Report_Tree.loc[0, "Accuracy"]=Tree_Accuracy_Score
Report_Tree.loc[0, "Jaccard"]=Tree_JaccardIndex
Report_Tree.loc[0, "F1_Score"]=Tree_F1_Score
Report_Tree.index = ["Decision Tree"]
Report_Tree

Unnamed: 0,Accuracy,Jaccard,F1_Score
Decision Tree,0.818321,0.480349,0.648968


## Logistic Regression

#### Q12) Use the `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `1`.

#### Q13) Create and train a LogisticRegression model called LR using the training data (`x_train`, `y_train`) with the `solver` parameter set to `liblinear`.

#### Q14) Now, use the `predict` and `predict_proba` methods on the testing data (`x_test`) and save it as 2 arrays `predictions` and `predict_proba`.

#### Q15) Using the `predictions`, `predict_proba` and the `y_test` dataframe calculate the value for each metric using the appropriate function.

In [94]:
x_train, x_test, y_train, y_test = train_test_split(features,
                                                    Y,
                                                    test_size = 0.2,
                                                    random_state = 1)

In [96]:
LR = LogisticRegression(solver='liblinear').fit(x_train,y_train)

In [98]:
LR_predictions = LR.predict(x_test)
LR_proba = LR.predict_proba(x_test)

In [99]:
LR_Accuracy_Score = metrics.accuracy_score(y_test, LR_predictions)
LR_JaccardIndex = jaccard_score(y_test, LR_predictions)
LR_F1_Score = f1_score(y_test, LR_predictions)
LR_Log_Loss = log_loss(y_test, LR_proba)

In [104]:
Report_LR = pd.DataFrame()
Report_LR.loc[0, "Accuracy"]=LR_Accuracy_Score
Report_LR.loc[0, "Jaccard"]=LR_JaccardIndex
Report_LR.loc[0, "F1_Score"]=LR_F1_Score
Report_LR.loc[0, "Log-Loss"]=LR_Log_Loss
Report_LR.index = ["Logistic Regression"]
Report_LR

Unnamed: 0,Accuracy,Jaccard,F1_Score,Log-Loss
Logistic Regression,0.836641,0.509174,0.674772,0.381008


### SVM

#### Q16) Create and train a SVM model called SVM using the training data (`x_train`, `y_train`).

#### Q17) Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.

#### Q18) Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [100]:
SVM = svm.SVC(kernel='rbf')
SVM.fit(x_train, y_train) 

SVC()

In [101]:
svm_predictions = SVM.predict(x_test)

In [102]:
SVM_Accuracy_Score = metrics.accuracy_score(y_test, svm_predictions)
SVM_JaccardIndex = jaccard_score(y_test, svm_predictions)
SVM_F1_Score = f1_score(y_test, svm_predictions)

In [105]:
Report_SVM = pd.DataFrame()
Report_SVM.loc[0, "Accuracy"]=SVM_Accuracy_Score
Report_SVM.loc[0, "Jaccard"]=SVM_JaccardIndex
Report_SVM.loc[0, "F1_Score"]=SVM_F1_Score
Report_SVM.index = ["SVM"]
Report_SVM

Unnamed: 0,Accuracy,Jaccard,F1_Score
SVM,0.722137,0.0,0.0


#### Q19) Show the Accuracy,Jaccard Index,F1-Score and LogLoss in a tabular format using data frame for all of the above models.

In [108]:
Report = Report_KNN.append(Report_Tree)
Report = Report.append(Report_LR)
Report = Report.append(Report_SVM)

Report

Unnamed: 0,Accuracy,Jaccard,F1_Score,Log-Loss
KNN,0.818321,0.425121,0.59661,
Decision Tree,0.818321,0.480349,0.648968,
Logistic Regression,0.836641,0.509174,0.674772,0.381008
SVM,0.722137,0.0,0.0,
