In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd "./drive/My Drive/Data Mining Project"

Mounted at /content/drive
/content/drive/My Drive/Data Mining Project


In [None]:
import numpy as np 
import pandas as pd 
import glob
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix


from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# NumPy display settings for the whole notebook: three decimals, no scientific
# notation for small values, and a very wide line so 2-D arrays are not wrapped.
np.set_printoptions(
    linewidth=1000,
    suppress=True,
    precision=3,
)

In [None]:
## define function for metrics
def return_metrics(y_true, y_pred):
    """Print a classification report followed by confusion-matrix counts and rates.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    int
        Always ``0``; the function is used for its printed output.

    Notes
    -----
    The TN/FP/FN/TP lines and both rates read only the top-left 2x2 corner of
    the confusion matrix, i.e. the first two classes in the matrix's label
    order, so they are meaningful for two-class problems only.
    """
    print(classification_report(y_true, y_pred))

    cm = confusion_matrix(y_true, y_pred)

    # (message template, row, column) for each cell of the 2x2 corner; looked
    # up one at a time so output is produced in the same order as before.
    cell_lines = (
        ('TN : True Negative {}', 0, 0),
        ('FP : False Positive {}', 0, 1),
        ('FN : False Negative {}', 1, 0),
        ('TP : True Positive {}', 1, 1),
    )
    for template, row, col in cell_lines:
        print(template.format(cm[row, col]))

    total = cm.sum()
    on_diagonal = cm[0, 0] + cm[1, 1]
    off_diagonal = cm[0, 1] + cm[1, 0]
    print('Accuracy Rate: {}'.format(np.divide(on_diagonal, total)))
    print('Misclassification Rate: {}'.format(np.divide(off_diagonal, total)))

    return 0

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
## function for model fits

def _evaluate_model(name, model, first_row, train_model_data, train_model_labels,
                    test_model_data, test_model_labels, cv, metrics_array):
  """Fit one classifier, cross-validate it on the training data and score it on the test data.

  The per-class precision, recall and F1 on the test set are written into
  ``metrics_array[first_row:first_row + 2, 0:3]`` (one row per class, so two
  classes are assumed). Returns ``(mean, std)`` of the cross-validated F1 scores.
  """
  model.fit(train_model_data, train_model_labels)
  # cross_val_score works on clones of `model`, so the estimator fitted on the
  # full training set above is the one used for the test-set predictions below.
  scores = cross_val_score(model, train_model_data, train_model_labels, scoring='f1', cv=cv, n_jobs=-1)
  mean_score, std_score = np.mean(scores), np.std(scores)
  print("\n" + name, scores, "\n", mean_score, std_score)

  pred_labels = model.predict(test_model_data)
  rows = slice(first_row, first_row + 2)
  # average=None yields one value per class, filling the two rows of this model.
  metrics_array[rows, 0] = precision_score(test_model_labels, pred_labels, average=None)
  metrics_array[rows, 1] = recall_score(test_model_labels, pred_labels, average=None)
  metrics_array[rows, 2] = f1_score(test_model_labels, pred_labels, average=None)
  return mean_score, std_score


def fit_models(train_model_data,train_model_labels,test_model_data,test_model_labels,cv,metrics_array):
  """Train five classifiers and report cross-validated and held-out metrics.

  Models, in order: KNN (k=10), decision tree, Gaussian naive Bayes, logistic
  regression and random forest. Each one is fitted on the training data,
  cross-validated on the training data with F1 scoring, and evaluated on the
  test data.

  Parameters
  ----------
  train_model_data, train_model_labels
      Training features and labels.
  test_model_data, test_model_labels
      Held-out features and labels used for the per-class metrics.
  cv
      Cross-validation splitter (or fold count) forwarded to ``cross_val_score``.
  metrics_array : numpy.ndarray
      2-D array with at least 10 rows and 3 columns, filled in place. Rows
      ``2*i`` and ``2*i + 1`` hold the two per-class values for model ``i``;
      columns are precision, recall, F1.

  Raises
  ------
  ValueError
      If ``metrics_array`` is too small to hold the results. This is checked
      up front so the failure does not occur after several models have
      already been trained.

  Notes
  -----
  Prints the CV scores per model, the flat list
  ``[mean_0, std_0, mean_1, std_1, ...]`` and the filled ``metrics_array``.
  Returns ``None``. The decision tree is now seeded like the other seedable
  models so repeated runs give the same numbers.
  """
  models = [
      ("KNN", KNeighborsClassifier(n_neighbors=10)),
      ("DTC", DecisionTreeClassifier(random_state=42)),
      ("NB", GaussianNB()),
      ("LOG REG", LogisticRegression(random_state=42)),
      ("RAND FOREST", RandomForestClassifier(random_state=42)),
  ]

  required_rows = 2 * len(models)
  shape = np.shape(metrics_array)
  if len(shape) != 2 or shape[0] < required_rows or shape[1] < 3:
    raise ValueError(
        "metrics_array must be 2-D with at least {} rows and 3 columns, got shape {}".format(required_rows, shape))

  list_scores = []
  for position, (name, model) in enumerate(models):
    mean_score, std_score = _evaluate_model(
        name, model, 2 * position,
        train_model_data, train_model_labels,
        test_model_data, test_model_labels,
        cv, metrics_array)
    list_scores.extend([mean_score, std_score])

  print(list_scores)
  print('\n METRICS ARRAY \n')
  print(metrics_array)

In [None]:
# Load the complete dataset and the pre-made splits. The validation split is
# appended to the training split, so only train/test are used from here on.
full_data = pd.read_csv('weatherAUS.csv')
val_data = pd.read_csv('val.csv')
test_data = pd.read_csv('test.csv')
train_data = pd.concat([pd.read_csv('train.csv'), val_data])

# Report (rows, columns) for the full data, the merged training data and the test data.
for frame in (full_data, train_data, test_data):
    print(frame.shape)

(145460, 23)
(106644, 23)
(35549, 23)


In [None]:
from sklearn.impute import SimpleImputer, KNNImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Re-read everything from disk so this cell always starts from pristine frames,
# even when it is re-run after the preprocessing further down has modified
# train_data / test_data.
full_data = pd.read_csv('weatherAUS.csv')
print(full_data.shape)
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
# Fold the validation split into the training split. ignore_index gives the
# merged frame a fresh 0..n-1 index instead of repeating row labels from both files.
train_data = pd.concat([train_data,val_data], ignore_index=True)
test_data = pd.read_csv('test.csv')
print(train_data.shape)
print(test_data.shape)

# Imputers: median for numeric columns, most frequent value for categorical
# columns, and a 5-nearest-neighbour imputer as an alternative strategy.
simp_median = SimpleImputer(strategy = 'median')
simp_mode = SimpleImputer(strategy = 'most_frequent')
knn_imp = KNNImputer(n_neighbors = 5)

# Resamplers for class imbalance. Both are seeded so the resampled training
# sets are reproducible (the under-sampler was previously unseeded).
over_sm = SMOTE(sampling_strategy = 'minority', random_state = 1)
under = RandomUnderSampler(sampling_strategy = 1, random_state = 1)

def impute(X, imp, num_cols, cat_cols):
  """Fit a pair of imputers on the numeric and categorical columns of ``X``.

  ``imp[0]`` is fitted on ``X[num_cols]`` and ``imp[1]`` on ``X[cat_cols]``.
  Nothing is transformed here; the same ``imp`` object is returned so the
  caller can apply the fitted imputers to any frame.
  """
  numeric_part = X[num_cols]
  imp[0].fit(numeric_part)

  categorical_part = X[cat_cols]
  imp[1].fit(categorical_part)

  return imp

# GET TRAIN DATA IN SHAPE

# Columns excluded from modelling. Each one is removed from both frames when it
# is present in the training frame.
feature_to_remove_list = ['MinTemp','MaxTemp','Rainfall','WindGustDir','WindDir9am','Sunshine','WindSpeed3pm','WindSpeed9am',
                'Humidity9am','Pressure9am','Cloud9am','Temp9am','RainToday','Location']
for feature in feature_to_remove_list:

  feature_names = list(train_data.columns)
  print(feature,feature_names)
  if feature in feature_names:
    print(feature)
    train_data = train_data.drop(feature,axis=1)
    # errors='ignore' so a column that is absent from the test frame does not abort the cell.
    test_data = test_data.drop(columns=feature, errors='ignore')

print(train_data.shape)
print(test_data.shape)

## Remove date column (and Location, in case it is still present)
# 'Location' is already in feature_to_remove_list, so by this point it has been
# dropped; an unconditional drop would raise KeyError. errors='ignore' makes
# the removal safe whether or not the column still exists, and reassigning
# instead of inplace=True keeps the cell re-runnable.
identifier_cols = ['Date', 'Location']
train_data = train_data.drop(columns=identifier_cols, errors='ignore')
test_data = test_data.drop(columns=identifier_cols, errors='ignore')

#fetching the numerical and categorical columns
feature_frame = train_data.drop('RainTomorrow', axis = 1)
obj = feature_frame.select_dtypes(include = ['O'])
# Preserve the frame's own column order (set arithmetic gave an arbitrary order
# that could differ between runs).
obj_cols = list(obj.columns)
num_cols = [col for col in feature_frame.columns if col not in obj_cols]
print(obj_cols,num_cols)
## convert object columns to labels
def encode_labels(data,c1):
    """Replace column ``c1`` of ``data`` with integer codes and return ``data``.

    A new ``LabelEncoder`` is fitted on every call. Values are cast to ``str``
    first, so missing values are stringified as well and receive their own
    code rather than staying missing. The column is overwritten on the frame
    that was passed in; the same frame object is returned.
    """
    from sklearn.preprocessing import LabelEncoder

    as_text = data[c1].astype(str)
    data[c1] = LabelEncoder().fit_transform(as_text)
    return data

# Label-encode every object-typed column of the training frame. The target
# 'RainTomorrow' is part of the frame here, so it is encoded too when it is
# stored as text.
column_names = list(train_data.columns)
for column_name in column_names:
  if train_data[column_name].dtype == object:
    train_data = encode_labels(train_data,column_name)

## get labels and drop label column from the training frame
train_data_x = train_data.drop('RainTomorrow', axis = 1)
train_labels = train_data['RainTomorrow']

## convert object columns to labels
# NOTE(review): encode_labels fits a fresh encoder on each call, so the integer
# codes of train and test agree only if both splits contain the same set of
# values in each encoded column — confirm for this dataset.
column_names_test = list(test_data.columns)
for column_name in column_names_test:
  if test_data[column_name].dtype == object:
    test_data = encode_labels(test_data,column_name)

## get labels and drop label column from the test frame
test_data_x = test_data.drop('RainTomorrow', axis = 1)
test_labels = test_data['RainTomorrow']

## impute using median for numerical data and mode for categorical data
# Work on copies. Plain assignment would alias the un-imputed frames, so the
# column assignments below would also fill in train_data_x / test_data_x and
# leave nothing for the KNN imputer further down to do.
train_data_x_imp = train_data_x.copy()
test_data_x_imp = test_data_x.copy()

# Imputers are fitted on the training features only and then applied to both splits.
imp = impute(train_data_x, [simp_median, simp_mode], num_cols, obj_cols)
train_data_x_imp[num_cols] = imp[0].transform(train_data_x[num_cols])
test_data_x_imp[num_cols] = imp[0].transform(test_data_x[num_cols])

train_data_x_imp[obj_cols] = imp[1].transform(train_data_x[obj_cols])
test_data_x_imp[obj_cols] = imp[1].transform(test_data_x[obj_cols])

# oversampling the minority class, median/mode imputation
train_data_x_over, train_labels_over = over_sm.fit_resample(train_data_x_imp, train_labels)

## undersampling the majority class, median/mode imputation
train_data_x_under, train_labels_under = under.fit_resample(train_data_x_imp, train_labels)

## impute using KNN Imputer
# train_data_x / test_data_x still contain their missing values here, so this
# is a genuine KNN imputation. NOTE(review): KNNImputer computes pairwise
# distances and may be slow on a training set of this size.
train_data_x_imp_knn = knn_imp.fit_transform(train_data_x)
test_data_x_imp_knn = knn_imp.transform(test_data_x)

# ## oversampling the minority class knn imputation
train_data_x_over_knn, train_labels_over_knn = over_sm.fit_resample(train_data_x_imp_knn, train_labels)

# ##undersampling the majority class knn imputation
train_data_x_under_knn, train_labels_under_knn = under.fit_resample(train_data_x_imp_knn, train_labels)


print('TRAIN: ',train_data_x.shape,train_labels.shape)
print('TEST: ',test_data_x.shape,test_labels.shape)

print('TRAIN: ',train_data_x_imp_knn.shape,train_labels.shape)
print('TEST: ',test_data_x_imp_knn.shape,test_labels.shape)

# KNN imputation
# Standardise using statistics from the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
train_data_x_scaled = scaler.fit_transform(train_data_x_imp_knn)
test_data_x_scaled = scaler.transform(test_data_x_imp_knn)

# 5 models x 2 classes = 10 rows; columns are precision, recall, F1.
metrics_array = np.zeros((10,3))
train_model_data = train_data_x_scaled
train_model_labels = train_labels
test_model_data = test_data_x_scaled
test_model_labels = test_labels
# Single repetition of 3-fold CV with a fixed seed for reproducible folds.
cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=1)

fit_models(train_model_data,train_model_labels,test_model_data,test_model_labels,cv,metrics_array)