In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd "./drive/My Drive/Data Mining Project"

Mounted at /content/drive
/content/drive/My Drive/Data Mining Project


In [None]:
import numpy as np 
import pandas as pd 
import glob
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix


from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Print numpy arrays with 3 decimals, without scientific notation for small
# values, and with lines up to 1000 characters wide before wrapping.
np.set_printoptions(precision=3,suppress=True,linewidth=1000)

In [None]:
## define function for metrics
def return_metrics(y_true, y_pred):
    """Print a classification report and confusion-matrix summary figures.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    int
        Always ``0``; the function is used for its printed output.

    Notes
    -----
    Only the top-left 2x2 corner of the confusion matrix is read, i.e. the
    function is written for a binary problem in which both classes occur.
    """
    print(classification_report(y_true,y_pred))

    cm = confusion_matrix(y_true,y_pred)

    # (message template, (row, column)) for each cell that gets reported.
    reported_cells = (
        ('TN : True Negative {}', (0, 0)),
        ('FP : False Positive {}', (0, 1)),
        ('FN : False Negative {}', (1, 0)),
        ('TP : True Positive {}', (1, 1)),
    )
    for template, (row, col) in reported_cells:
        print(template.format(cm[row, col]))

    total = np.sum(cm)
    on_diagonal = np.sum([cm[0,0], cm[1,1]])
    print('Accuracy Rate: {}'.format(np.divide(on_diagonal, total)))
    off_diagonal = np.sum([cm[0,1], cm[1,0]])
    print('Misclassification Rate: {}'.format(np.divide(off_diagonal, total)))

    return 0

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
## function for model fits

def fit_models(train_model_data,train_model_labels,test_model_data,test_model_labels,cv,metrics_array):
  """Fit five classifiers, report their scores and fill ``metrics_array``.

  For each model (KNN, decision tree, Gaussian naive Bayes, logistic
  regression, random forest) the function:

  1. fits the model on the training data,
  2. prints the cross-validated F1 scores obtained on the training data,
  3. predicts the test data and prints the report from ``return_metrics``,
  4. stores per-class precision, recall and F1 on the test data.

  Parameters
  ----------
  train_model_data, train_model_labels
      Training features and labels.
  test_model_data, test_model_labels
      Held-out features and labels used for the printed report and for
      the values written into ``metrics_array``.
  cv
      Cross-validation splitter (or fold count) passed to ``cross_val_score``.
  metrics_array : numpy.ndarray
      Modified in place. Needs at least 10 rows and 3 columns: model ``i``
      writes rows ``2*i`` and ``2*i + 1`` (one row per class); the columns
      are precision, recall and F1. Two rows are written per model, so the
      labels are expected to contain exactly two classes.

  Returns
  -------
  None
  """
  # Order matters: the position in this list decides which rows of
  # metrics_array a model writes to. Every estimator that accepts a seed gets
  # the same one so repeated runs give the same numbers.
  models = [
      ("KNN", KNeighborsClassifier(n_neighbors=10)),
      ("DTC", DecisionTreeClassifier(random_state = 42)),
      ("NB", GaussianNB()),
      ("LOG REG", LogisticRegression(random_state = 42)),
      ("RAND FOREST", RandomForestClassifier(random_state = 42)),
  ]

  for position, (tag, model) in enumerate(models):
    model.fit(train_model_data,train_model_labels)
    scores = cross_val_score(model, train_model_data, train_model_labels, scoring='f1', cv=cv, n_jobs=-1)
    print("\n" + tag,scores)

    pred_labels = model.predict(test_model_data)
    return_metrics(test_model_labels,pred_labels)

    # Two consecutive rows per model, one for each class.
    rows = slice(2 * position, 2 * position + 2)
    metrics_array[rows,0] = precision_score(test_model_labels,pred_labels,average=None)
    metrics_array[rows,1] = recall_score(test_model_labels,pred_labels,average=None)
    metrics_array[rows,2] = f1_score(test_model_labels,pred_labels,average=None)

  print('\n METRICS ARRAY \n')
  print(metrics_array)

In [None]:
# Complete dataset; its shape is printed for reference.
full_data = pd.read_csv('weatherAUS.csv')
print(full_data.shape)

# The validation split is appended to the training split; the test split
# stays separate.
val_data = pd.read_csv('val.csv')
train_data = pd.concat([pd.read_csv('train.csv'), val_data])
test_data = pd.read_csv('test.csv')
print(train_data.shape, test_data.shape, sep='\n')

(145460, 23)
(106644, 23)
(35549, 23)


In [None]:
# Preview the first rows of the combined train + validation frame.
train_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2016-01-21,Tuggeranong,16.1,33.0,0.0,,,N,43.0,W,W,4.0,19.0,54.0,36.0,1010.7,1007.0,,,23.8,31.7,No,Yes
1,2012-06-02,Penrith,12.0,16.5,1.2,,,E,15.0,,NNW,0.0,2.0,100.0,99.0,,,,,13.8,15.1,Yes,Yes
2,2016-05-17,Hobart,9.7,16.9,1.6,4.4,6.1,NNW,56.0,NNE,SW,24.0,11.0,66.0,44.0,1001.4,1005.4,6.0,5.0,10.9,15.7,Yes,No
3,2012-04-17,Adelaide,12.8,29.7,0.0,3.6,,SE,24.0,S,W,4.0,7.0,47.0,22.0,1023.1,1019.6,,,20.6,28.9,No,No
4,2013-08-27,Bendigo,6.6,19.3,0.0,1.4,,N,30.0,ESE,NNW,9.0,15.0,88.0,51.0,1022.1,1017.8,,,10.9,18.8,No,No


In [None]:
# GET TRAIN DATA IN SHAPE
## drop rows with nan
train_data.dropna(inplace=True)

## Convert date column to datetime type
train_data['Date'] = pd.to_datetime(train_data['Date'])

## convert object columns to labels
def encode_labels(data,c1,label_encoder=None,fit=True):
    """Replace column ``c1`` of ``data`` with integer label codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column is overwritten in place.
    c1 : str
        Name of the column to encode. Values are cast to ``str`` first.
    label_encoder : sklearn.preprocessing.LabelEncoder, optional
        Encoder to use. When omitted a new encoder is created and fitted on
        this column (the historical behaviour of this helper).
    fit : bool, default True
        ``True`` fits ``label_encoder`` on this column before encoding.
        ``False`` reuses the mapping the encoder already learned, which is
        what keeps train and test codes consistent.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``label_encoder`` is given, or (raised by
        scikit-learn) if the column holds a value the encoder never saw.
    """
    if label_encoder is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted label_encoder")
        label_encoder = preprocessing.LabelEncoder()

    values = data[c1].astype(str)
    if fit:
        data[c1] = label_encoder.fit_transform(values)
    else:
        data[c1] = label_encoder.transform(values)
    return data

## fit one encoder per object column on the TRAIN data only; the same
## encoders are reused for the test data so identical categories get
## identical integer codes in both sets
label_encoders = {}
column_names = list(train_data.columns)
for column_name in column_names:
  if train_data[column_name].dtype == object:
    label_encoders[column_name] = preprocessing.LabelEncoder()
    train_data = encode_labels(train_data,column_name,label_encoders[column_name])

## split features / labels; Date and Location are not used as features
train_data_x = train_data.drop(columns=['RainTomorrow','Date','Location'])
train_labels = train_data['RainTomorrow']


# GET TEST DATA IN SHAPE
## drop rows with nan
test_data.dropna(inplace=True)

## Convert date column to datetime type
test_data['Date'] = pd.to_datetime(test_data['Date'])

## convert object columns to labels with the encoders fitted on train
column_names_test = list(test_data.columns)
for column_name in column_names_test:
  if test_data[column_name].dtype == object:
    if column_name in label_encoders:
      test_data = encode_labels(test_data,column_name,label_encoders[column_name],fit=False)
    else:
      # Column was not an object column in train: encode it on its own.
      test_data = encode_labels(test_data,column_name)

## split features / labels; Date and Location are not used as features
test_data_x = test_data.drop(columns=['RainTomorrow','Date','Location'])
test_labels = test_data['RainTomorrow']


## Standardise the features: statistics come from train only and are
## applied unchanged to test
scaler = StandardScaler()
train_data_x_scaled = scaler.fit_transform(train_data_x)
test_data_x_scaled = scaler.transform(test_data_x)


print('TRAIN: ',train_data_x_scaled.shape,train_labels.shape)
print('TEST: ',test_data_x_scaled.shape,test_labels.shape)

In [None]:
# no imputation

# 10 rows x 3 columns of zeros; fit_models fills this array in place.
metrics_array = np.zeros(shape=(10, 3))
train_model_data, train_model_labels = train_data_x_scaled, train_labels
test_model_data, test_model_labels = test_data_x_scaled, test_labels
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(
    train_model_data,
    train_model_labels,
    test_model_data,
    test_model_labels,
    cv,
    metrics_array,
)



KNN [0.6   0.6   0.569 0.585 0.593 0.584 0.594 0.592 0.595 0.577 0.585 0.595 0.579 0.604 0.579]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     11116
           1       0.67      0.52      0.59      3124

    accuracy                           0.84     14240
   macro avg       0.77      0.73      0.74     14240
weighted avg       0.83      0.84      0.83     14240

TN : True Negative 10328
FP : False Positive 788
FN : False Negative 1494
TP : True Positive 1630
Accuracy Rate: 0.839747191011236
Misclassification Rate: 0.16025280898876404

DTC [0.544 0.542 0.536 0.555 0.544 0.538 0.55  0.545 0.54  0.542 0.553 0.555 0.542 0.552 0.555]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87     11116
           1       0.54      0.56      0.55      3124

    accuracy                           0.80     14240
   macro avg       0.71      0.71      0.71     14240
weighted avg       0.80      0.8