In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; later cells read the CSV files by bare filename.
%cd "./drive/My Drive/Data Mining Project"

Mounted at /content/drive
/content/drive/My Drive/Data Mining Project


In [None]:
import numpy as np 
import pandas as pd 
import glob
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix


from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Print arrays with 3 decimals, without scientific notation, and with very wide lines
# so that score vectors are shown on a single row.
np.set_printoptions(precision=3,suppress=True,linewidth=1000)

In [None]:
## define function for metrics
def return_metrics(y_true, y_pred):
    """Print a classification report followed by confusion-matrix counts and rates.

    The four counts are read from the top-left 2x2 corner of the confusion
    matrix (row = true label, column = predicted label), i.e. the function
    is written for a two-class problem.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    int
        Always 0; everything of interest is printed.
    """
    print(classification_report(y_true, y_pred))

    cm = confusion_matrix(y_true, y_pred)

    # Caption -> cell of the matrix; cells are looked up one at a time while printing.
    captions = (
        ('TN : True Negative', (0, 0)),
        ('FP : False Positive', (0, 1)),
        ('FN : False Negative', (1, 0)),
        ('TP : True Positive', (1, 1)),
    )
    for caption, cell in captions:
        print(f'{caption} {cm[cell]}')

    total = cm.sum()
    correct = cm[0, 0] + cm[1, 1]
    print(f'Accuracy Rate: {correct / total}')
    wrong = cm[0, 1] + cm[1, 0]
    print(f'Misclassification Rate: {wrong / total}')

    return 0

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
## function for model fits

def fit_models(train_model_data,train_model_labels,test_model_data,test_model_labels,cv,metrics_array):
  """Fit five classifiers, cross-validate them on the training set and score them on the test set.

  For each model, in the order KNN, decision tree, Gaussian naive Bayes,
  logistic regression, random forest, the function:

  1. fits the model on the training data,
  2. prints the cross-validated F1 scores (``scoring='f1'``) on the training data,
  3. predicts the test data and prints the report from ``return_metrics``,
  4. stores the per-class precision, recall and F1 (``average=None``) in
     ``metrics_array``.

  Parameters
  ----------
  train_model_data, train_model_labels
      Training features and labels.
  test_model_data, test_model_labels
      Held-out features and labels used for the printed report and the
      stored metrics.
  cv
      Cross-validation splitter (or anything ``cross_val_score`` accepts as ``cv``).
  metrics_array : numpy array with at least 10 rows and 3 columns
      Filled in place. Model ``i`` writes rows ``2*i`` and ``2*i + 1`` (one row
      per class, in the order returned with ``average=None``); columns are
      precision, recall, F1.

  Raises
  ------
  ValueError
      If ``metrics_array`` is too small to hold the results (checked before
      any model is trained).
  """
  # (printed label, estimator) in reporting order; the position in this list
  # decides which pair of rows of metrics_array a model writes to.
  models = [
      ("KNN", KNeighborsClassifier(n_neighbors=10)),
      # seeded like the other stochastic models so that reruns build the same tree
      ("DTC", DecisionTreeClassifier(random_state = 42)),
      ("NB", GaussianNB()),
      ("LOG REG", LogisticRegression(random_state = 42)),
      ("RAND FOREST", RandomForestClassifier(random_state = 42)),
  ]

  # Fail before the expensive training instead of after it.
  shape = np.shape(metrics_array)
  if len(shape) < 2 or shape[0] < 2 * len(models) or shape[1] < 3:
    raise ValueError(
        'metrics_array must have at least {} rows and 3 columns, got shape {}'.format(2 * len(models), shape))

  # (column of metrics_array, scoring function)
  scorers = ((0, precision_score), (1, recall_score), (2, f1_score))

  for position, (label, model) in enumerate(models):
    model.fit(train_model_data,train_model_labels)
    # cross_val_score works on clones, so the fitted model above is left untouched
    scores = cross_val_score(model, train_model_data, train_model_labels, scoring='f1', cv=cv, n_jobs=-1)
    print("\n" + label,scores)

    pred_labels = model.predict(test_model_data)
    return_metrics(test_model_labels,pred_labels)

    rows = slice(2 * position, 2 * position + 2)
    for column, scorer in scorers:
      metrics_array[rows,column] = scorer(test_model_labels,pred_labels,average=None)

  print('\n METRICS ARRAY \n')
  print(metrics_array)

In [None]:
# Complete dataset; only its shape is reported here.
full_data = pd.read_csv('weatherAUS.csv')
print(full_data.shape)

# Pre-made splits. The validation rows are appended to the training rows,
# so models are trained on train + validation.
train_data, val_data = (pd.read_csv(name) for name in ('train.csv', 'val.csv'))
train_data = pd.concat([train_data, val_data])
test_data = pd.read_csv('test.csv')

print(train_data.shape, test_data.shape, sep='\n')

(145460, 23)
(106644, 23)
(35549, 23)


In [None]:
# Preview the first rows of the combined train + validation frame.
train_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2016-01-21,Tuggeranong,16.1,33.0,0.0,,,N,43.0,W,W,4.0,19.0,54.0,36.0,1010.7,1007.0,,,23.8,31.7,No,Yes
1,2012-06-02,Penrith,12.0,16.5,1.2,,,E,15.0,,NNW,0.0,2.0,100.0,99.0,,,,,13.8,15.1,Yes,Yes
2,2016-05-17,Hobart,9.7,16.9,1.6,4.4,6.1,NNW,56.0,NNE,SW,24.0,11.0,66.0,44.0,1001.4,1005.4,6.0,5.0,10.9,15.7,Yes,No
3,2012-04-17,Adelaide,12.8,29.7,0.0,3.6,,SE,24.0,S,W,4.0,7.0,47.0,22.0,1023.1,1019.6,,,20.6,28.9,No,No
4,2013-08-27,Bendigo,6.6,19.3,0.0,1.4,,N,30.0,ESE,NNW,9.0,15.0,88.0,51.0,1022.1,1017.8,,,10.9,18.8,No,No


In [None]:
from sklearn.impute import SimpleImputer, KNNImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Imputers: median for numerical columns, most frequent value for categorical
# columns, and a 5-nearest-neighbour imputer as the alternative strategy.
simp_median = SimpleImputer(strategy = 'median')
simp_mode = SimpleImputer(strategy = 'most_frequent')
knn_imp = KNNImputer(n_neighbors = 5)

# Resamplers used to balance the training classes.
over_sm = SMOTE(sampling_strategy = 'minority', random_state = 1)
# Seeded like SMOTE above so the randomly kept majority rows are the same on every run.
under = RandomUnderSampler(sampling_strategy = 1, random_state = 1)

def impute(X, imp, num_cols, cat_cols):
  """Fit a pair of imputers on the numerical and categorical columns of X.

  Despite the name, nothing is transformed here: the caller applies
  ``transform`` afterwards.

  Parameters
  ----------
  X : DataFrame
      Data the imputers learn their fill values from.
  imp : sequence
      ``imp[0]`` is fitted on ``X[num_cols]``, ``imp[1]`` on ``X[cat_cols]``.
  num_cols, cat_cols : list
      Column names of the numerical / categorical features.

  Returns
  -------
  The same ``imp`` object that was passed in, with both imputers fitted.
  """
  numeric_imputer, categorical_imputer = imp[0], imp[1]
  numeric_imputer.fit(X[num_cols])
  categorical_imputer.fit(X[cat_cols])
  return imp

# GET TRAIN DATA IN SHAPE

## Remove the Date and Location columns from every split (in place)
for _split in (train_data, val_data, test_data):
  _split.drop(['Date', 'Location'], axis = 1, inplace = True)

# Partition the feature columns (everything except the target) by dtype.
# Both lists are built from sets, so their element order is arbitrary.
_features = train_data.drop('RainTomorrow', axis = 1)
obj = _features.select_dtypes(include = ['O'])
num_cols = list(set(_features.columns) - set(obj.columns))
obj_cols = list(set(obj.columns))

## convert object columns to labels
def encode_labels(data,c1):
    """Replace column ``c1`` of ``data`` with integer codes, keeping missing values missing.

    The distinct non-missing values are converted to strings, sorted, and
    numbered 0..k-1 in that order. Missing entries stay NaN instead of being
    stringified into a spurious ``'nan'`` category, so a later imputation
    step can still fill them.

    Parameters
    ----------
    data : DataFrame
        Frame to modify; the column is overwritten in place.
    c1 : str
        Name of the column to encode.

    Returns
    -------
    DataFrame
        The same ``data`` object. The column is integer-typed when it had no
        missing values, otherwise float (codes plus NaN).

    Notes
    -----
    Codes are derived only from the frame passed in. Two frames encoded by
    separate calls get matching codes only if the column holds the same set
    of values in both.
    """
    values = data[c1]
    present = values.notna().to_numpy()

    # np.unique sorts the distinct strings and, with return_inverse, gives each
    # element's position in that sorted list, which is the code we want.
    _, codes = np.unique(values.to_numpy()[present].astype(str), return_inverse=True)

    if present.all():
        data[c1] = codes
    else:
        with_gaps = np.full(len(values), np.nan)
        with_gaps[present] = codes
        data[c1] = with_gaps

    return data

## label-encode every object-dtype column of the training frame (target included)
column_names = train_data.columns.tolist()
for column_name in column_names:
  if train_data[column_name].dtype != object:
    continue
  train_data = encode_labels(train_data,column_name)

## separate the training features from the target column
train_labels = train_data['RainTomorrow']
train_data_x = train_data.drop(columns = 'RainTomorrow')

## same treatment for the test frame
## NOTE: each encode_labels call derives its codes from the frame it is given,
## so train and test codes agree only if a column holds the same set of values in both.
column_names_test = test_data.columns.tolist()
for column_name in column_names_test:
  if test_data[column_name].dtype != object:
    continue
  test_data = encode_labels(test_data,column_name)

## separate the test features from the target column
test_labels = test_data['RainTomorrow']
test_data_x = test_data.drop(columns = 'RainTomorrow')

## impute using median for numerical data and mode for categorical data
# Work on copies. A plain assignment would only create a second name for
# train_data_x / test_data_x, so filling the columns below would also fill the
# originals and leave the KNN imputer further down with nothing to impute.
train_data_x_imp = train_data_x.copy()
test_data_x_imp = test_data_x.copy()

# Fill values are learned from the training features only and then applied to both splits.
imp = impute(train_data_x, [simp_median, simp_mode], num_cols, obj_cols)
train_data_x_imp[num_cols] = imp[0].transform(train_data_x[num_cols])
test_data_x_imp[num_cols] = imp[0].transform(test_data_x[num_cols])

train_data_x_imp[obj_cols] = imp[1].transform(train_data_x[obj_cols])
test_data_x_imp[obj_cols] = imp[1].transform(test_data_x[obj_cols])

# Oversample the minority class (SMOTE) on the median/mode-imputed training data
train_data_x_over, train_labels_over = over_sm.fit_resample(train_data_x_imp, train_labels)

## Undersample the majority class on the median/mode-imputed training data
train_data_x_under, train_labels_under = under.fit_resample(train_data_x_imp, train_labels)

## Impute using the KNN imputer: fitted on the training features, then applied to the test features
## NOTE(review): this only has an effect if train_data_x / test_data_x still contain NaNs at this
## point - verify they are independent of train_data_x_imp / test_data_x_imp.
train_data_x_imp_knn = knn_imp.fit_transform(train_data_x)
test_data_x_imp_knn = knn_imp.transform(test_data_x)

## Oversample the minority class (SMOTE) on the KNN-imputed training data
train_data_x_over_knn, train_labels_over_knn = over_sm.fit_resample(train_data_x_imp_knn, train_labels)

## Undersample the majority class on the KNN-imputed training data
train_data_x_under_knn, train_labels_under_knn = under.fit_resample(train_data_x_imp_knn, train_labels)


# Shapes of the feature frames and label vectors (train, then test)
print('TRAIN: ',train_data_x.shape,train_labels.shape)
print('TEST: ',test_data_x.shape,test_labels.shape)

# Shapes after KNN imputation
print('TRAIN: ',train_data_x_imp_knn.shape,train_labels.shape)
print('TEST: ',test_data_x_imp_knn.shape,test_labels.shape)

# Oversampled training set (median/mode imputation) and its class counts, then the undersampled one
print('TRAIN: ',train_data_x_over.shape,train_labels_over.shape)
print(train_labels_over.value_counts())
print('TRAIN: ',train_data_x_under.shape,train_labels_under.shape)


In [None]:
# Median/mode imputation, original class balance
# Scaling statistics are learned from the training features only.
scaler = StandardScaler().fit(train_data_x_imp)
train_data_x_scaled = scaler.transform(train_data_x_imp)
test_data_x_scaled = scaler.transform(test_data_x_imp)

train_model_data, train_model_labels = train_data_x_scaled, train_labels
test_model_data, test_model_labels = test_data_x_scaled, test_labels

# 10 rows = 5 models x 2 classes; columns = precision, recall, f1 (filled in by fit_models)
metrics_array = np.zeros((10, 3))
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(train_model_data, train_model_labels, test_model_data, test_model_labels, cv, metrics_array)



KNN [0.567 0.576 0.567 0.567 0.569 0.573 0.588 0.562 0.577 0.563 0.575 0.568 0.57  0.556 0.567]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90     27580
           1       0.68      0.49      0.57      7969

    accuracy                           0.83     35549
   macro avg       0.77      0.71      0.73     35549
weighted avg       0.82      0.83      0.82     35549

TN : True Negative 25712
FP : False Positive 1868
FN : False Negative 4062
TP : True Positive 3907
Accuracy Rate: 0.8331879940364004
Misclassification Rate: 0.16681200596359955

DTC [0.522 0.531 0.533 0.522 0.524 0.516 0.536 0.536 0.53  0.52  0.535 0.523 0.536 0.534 0.519]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86     27580
           1       0.52      0.54      0.53      7969

    accuracy                           0.78     35549
   macro avg       0.69      0.70      0.69     35549
weighted avg       0.79      0

In [None]:
# Median/mode imputation, minority class oversampled with SMOTE
# NOTE: fit_models cross-validates on this already-resampled training set, so the
# validation folds contain synthetic samples and the CV F1 is not comparable with
# the score on the untouched test set.
scaler = StandardScaler().fit(train_data_x_over)
train_data_x_scaled = scaler.transform(train_data_x_over)
test_data_x_scaled = scaler.transform(test_data_x_imp)

train_model_data, train_model_labels = train_data_x_scaled, train_labels_over
test_model_data, test_model_labels = test_data_x_scaled, test_labels

# 10 rows = 5 models x 2 classes; columns = precision, recall, f1 (filled in by fit_models)
metrics_array = np.zeros((10, 3))
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(train_model_data, train_model_labels, test_model_data, test_model_labels, cv, metrics_array)





KNN [0.864 0.862 0.862 0.86  0.86  0.859 0.859 0.863 0.865 0.862 0.861 0.86  0.863 0.861 0.862]
              precision    recall  f1-score   support

           0       0.91      0.77      0.83     27580
           1       0.48      0.75      0.59      7969

    accuracy                           0.76     35549
   macro avg       0.70      0.76      0.71     35549
weighted avg       0.82      0.76      0.78     35549

TN : True Negative 21187
FP : False Positive 6393
FN : False Negative 2021
TP : True Positive 5948
Accuracy Rate: 0.7633126107626094
Misclassification Rate: 0.23668738923739063

DTC [0.84  0.84  0.838 0.839 0.838 0.835 0.841 0.84  0.845 0.837 0.839 0.839 0.839 0.84  0.841]
              precision    recall  f1-score   support

           0       0.87      0.83      0.85     27580
           1       0.50      0.58      0.54      7969

    accuracy                           0.78     35549
   macro avg       0.69      0.71      0.69     35549
weighted avg       0.79      0

In [None]:
# Median/mode imputation, majority class randomly undersampled
# NOTE: fit_models cross-validates on this already-resampled (balanced) training set,
# so the CV F1 is not comparable with the score on the untouched test set.
scaler = StandardScaler().fit(train_data_x_under)
train_data_x_scaled = scaler.transform(train_data_x_under)
test_data_x_scaled = scaler.transform(test_data_x_imp)

train_model_data, train_model_labels = train_data_x_scaled, train_labels_under
test_model_data, test_model_labels = test_data_x_scaled, test_labels

# 10 rows = 5 models x 2 classes; columns = precision, recall, f1 (filled in by fit_models)
metrics_array = np.zeros((10, 3))
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(train_model_data, train_model_labels, test_model_data, test_model_labels, cv, metrics_array)



KNN [0.765 0.765 0.765 0.765 0.764 0.771 0.763 0.769 0.765 0.77  0.767 0.767 0.774 0.765 0.764]
              precision    recall  f1-score   support

           0       0.92      0.78      0.84     27580
           1       0.50      0.76      0.60      7969

    accuracy                           0.77     35549
   macro avg       0.71      0.77      0.72     35549
weighted avg       0.82      0.77      0.79     35549

TN : True Negative 21488
FP : False Positive 6092
FN : False Negative 1928
TP : True Positive 6041
Accuracy Rate: 0.7743959042448452
Misclassification Rate: 0.22560409575515486

DTC [0.716 0.715 0.717 0.712 0.72  0.717 0.71  0.714 0.713 0.723 0.717 0.717 0.72  0.721 0.711]
              precision    recall  f1-score   support

           0       0.90      0.71      0.79     27580
           1       0.42      0.71      0.53      7969

    accuracy                           0.71     35549
   macro avg       0.66      0.71      0.66     35549
weighted avg       0.79      0

In [None]:
# KNN imputation, original class balance
# Scaling statistics are learned from the training features only.
scaler = StandardScaler().fit(train_data_x_imp_knn)
train_data_x_scaled = scaler.transform(train_data_x_imp_knn)
test_data_x_scaled = scaler.transform(test_data_x_imp_knn)

train_model_data, train_model_labels = train_data_x_scaled, train_labels
test_model_data, test_model_labels = test_data_x_scaled, test_labels

# 10 rows = 5 models x 2 classes; columns = precision, recall, f1 (filled in by fit_models)
metrics_array = np.zeros((10, 3))
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(train_model_data, train_model_labels, test_model_data, test_model_labels, cv, metrics_array)


KNN [0.567 0.576 0.567 0.567 0.569 0.573 0.588 0.562 0.577 0.563 0.575 0.568 0.57  0.556 0.567]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90     27580
           1       0.68      0.49      0.57      7969

    accuracy                           0.83     35549
   macro avg       0.77      0.71      0.73     35549
weighted avg       0.82      0.83      0.82     35549

TN : True Negative 25712
FP : False Positive 1868
FN : False Negative 4062
TP : True Positive 3907
Accuracy Rate: 0.8331879940364004
Misclassification Rate: 0.16681200596359955

DTC [0.521 0.534 0.538 0.526 0.523 0.523 0.533 0.533 0.529 0.524 0.532 0.525 0.539 0.53  0.525]
              precision    recall  f1-score   support

           0       0.87      0.86      0.86     27580
           1       0.52      0.54      0.53      7969

    accuracy                           0.79     35549
   macro avg       0.69      0.70      0.70     35549
weighted avg       0.79      0

In [None]:
# KNN imputation, minority class oversampled with SMOTE
# NOTE: fit_models cross-validates on this already-resampled training set, so the
# validation folds contain synthetic samples and the CV F1 is not comparable with
# the score on the untouched test set.
scaler = StandardScaler().fit(train_data_x_over_knn)
train_data_x_scaled = scaler.transform(train_data_x_over_knn)
test_data_x_scaled = scaler.transform(test_data_x_imp_knn)

train_model_data, train_model_labels = train_data_x_scaled, train_labels_over_knn
test_model_data, test_model_labels = test_data_x_scaled, test_labels

# 10 rows = 5 models x 2 classes; columns = precision, recall, f1 (filled in by fit_models)
metrics_array = np.zeros((10, 3))
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(train_model_data, train_model_labels, test_model_data, test_model_labels, cv, metrics_array)




KNN [0.868 0.866 0.867 0.864 0.866 0.865 0.863 0.868 0.869 0.866 0.868 0.865 0.866 0.866 0.866]
              precision    recall  f1-score   support

           0       0.91      0.78      0.84     27580
           1       0.49      0.74      0.59      7969

    accuracy                           0.77     35549
   macro avg       0.70      0.76      0.71     35549
weighted avg       0.82      0.77      0.78     35549

TN : True Negative 21389
FP : False Positive 6191
FN : False Negative 2038
TP : True Positive 5931
Accuracy Rate: 0.7685166952656897
Misclassification Rate: 0.2314833047343104

DTC [0.846 0.843 0.84  0.839 0.844 0.843 0.845 0.84  0.849 0.84  0.841 0.848 0.847 0.844 0.844]
              precision    recall  f1-score   support

           0       0.87      0.84      0.85     27580
           1       0.50      0.56      0.53      7969

    accuracy                           0.78     35549
   macro avg       0.69      0.70      0.69     35549
weighted avg       0.79      0.

In [None]:
# KNN imputation, majority class randomly undersampled
# NOTE: fit_models cross-validates on this already-resampled (balanced) training set,
# so the CV F1 is not comparable with the score on the untouched test set.
scaler = StandardScaler().fit(train_data_x_under_knn)
train_data_x_scaled = scaler.transform(train_data_x_under_knn)
test_data_x_scaled = scaler.transform(test_data_x_imp_knn)

train_model_data, train_model_labels = train_data_x_scaled, train_labels_under_knn
test_model_data, test_model_labels = test_data_x_scaled, test_labels

# 10 rows = 5 models x 2 classes; columns = precision, recall, f1 (filled in by fit_models)
metrics_array = np.zeros((10, 3))
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

fit_models(train_model_data, train_model_labels, test_model_data, test_model_labels, cv, metrics_array)


KNN [0.761 0.767 0.767 0.766 0.768 0.765 0.762 0.768 0.763 0.766 0.763 0.769 0.772 0.767 0.765]
              precision    recall  f1-score   support

           0       0.92      0.78      0.84     27580
           1       0.50      0.76      0.60      7969

    accuracy                           0.78     35549
   macro avg       0.71      0.77      0.72     35549
weighted avg       0.82      0.78      0.79     35549

TN : True Negative 21561
FP : False Positive 6019
FN : False Negative 1936
TP : True Positive 6033
Accuracy Rate: 0.776224366367549
Misclassification Rate: 0.22377563363245098

DTC [0.717 0.709 0.718 0.719 0.71  0.715 0.717 0.721 0.704 0.72  0.711 0.714 0.718 0.708 0.706]
              precision    recall  f1-score   support

           0       0.90      0.72      0.80     27580
           1       0.43      0.72      0.54      7969

    accuracy                           0.72     35549
   macro avg       0.66      0.72      0.67     35549
weighted avg       0.79      0.