In [None]:
# Load your datasets
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import json
import warnings
import os
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir("/content/drive/MyDrive/")

Mounted at /content/drive


In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/project_ALDA/ECONet/train.csv")
df_train = df_train.sample(frac=1)
df_test = pd.read_csv("/content/drive/MyDrive/project_ALDA/ECONet/test.csv")

# Feature Transformation

In [None]:
df_train["Ob"] = pd.to_datetime(df_train["Ob"],format="%m/%d/%Y %H:%M")

In [None]:
df_test["Ob"] = pd.to_datetime(df_test["Ob"],format="%m/%d/%Y %H:%M")

#Splitting the dataset into Month, Day, Hour and Minute
df_train["Month"] = df_train["Ob"].dt.month
df_train["Day"] = df_train["Ob"].dt.day
df_train["Hour"] = df_train["Ob"].dt.hour
df_train["Min"] = df_train["Ob"].dt.minute

df_test["Month"] = df_test["Ob"].dt.month
df_test["Day"] = df_test["Ob"].dt.day
df_test["Hour"] = df_test["Ob"].dt.hour
df_test["Min"] = df_test["Ob"].dt.minute

In [None]:
df_train

Unnamed: 0,Station,Ob,value,measure,target,R_flag,I_flag,Z_flag,B_flag,Month,Day,Hour,Min
2397804,FRYI,2021-03-31 10:14:00,0.297,sm,False,0,-1,-1,1,3,31,10,14
1349567,CHAP,2021-06-01 21:22:00,0.000,ws06,False,0,0,2,-1,6,1,21,22
4455029,PLYM,2021-01-30 03:52:00,-3.282,blackglobetemp,False,0,-1,-1,-1,1,30,3,52
5709619,TAYL,2021-02-15 23:34:00,0.595,sm,False,3,-1,-1,0,2,15,23,34
941936,BUCK,2021-09-20 08:25:00,0.532,sm,False,0,-1,-1,2,9,20,8,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4074654,NEWL,2021-12-14 13:12:00,0.185,sm,False,0,-1,-1,1,12,14,13,12
3109547,LAKE,2021-06-10 16:08:00,27.630,sr,False,3,0,-1,-1,6,10,16,8
2857891,JEFF,2021-09-25 11:28:00,1588.000,par,False,4,0,-1,-1,9,25,11,28
5790176,UNCA,2021-06-22 16:56:00,0.235,sm,False,0,-1,-1,1,6,22,16,56


# Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
x_train = df_train[['value', 'R_flag', 'I_flag', 'Z_flag', 'B_flag','measure','Station','Month','Day','Hour','Min','target']]
y_train = df_train["target"]
x_train,x_val,y_train,y_val=train_test_split(x_train, y_train, test_size=0.2,random_state=1)
print(len(x_train), len(y_train))


5274619 5274619


In [None]:
#Normalization of training set

# allScalers = {}
# dataframeslst = []
# for station in stations:
#   for measure in allMeasures:
#     filtered_df = x_train[(x_train.measure == measure) & (x_train.Station == station)]
#     # print(station + ' ' + measure + ' - ',filtered_df.size)
#     if filtered_df.size == 0:
#       continue
#     scaler = MinMaxScaler()
#     filtered_df.value = scaler.fit_transform(filtered_df[['value']])
#     dataframeslst.append(filtered_df)
#     allScalers[station+'-'+measure] = scaler

In [None]:
#Normalization of testing set

# dataframeslst_val = []
# for station in stations:
#   for measure in allMeasures:
#     filtered_df_val = x_val[(x_val.measure == measure) & (x_val.Station == station)]
#     # print(station + ' ' + measure + ' - ',filtered_df.size)
#     if filtered_df_val.size == 0 :
#       continue
#     if(str(station+'-'+measure) not in allScalers):
#       print(str(station+'-'+measure))
#       print(len(filtered_df_val))
#       dataframeslst_val.append(filtered_df_val)
#       continue;
#     scaler = allScalers[station+'-'+measure]
#     filtered_df_val.value = scaler.transform(filtered_df_val[['value']])
#     dataframeslst_val.append(filtered_df_val)


In [None]:
# # Try to ignore normalization
df_train_scaled = x_train
df_val_scaled = x_val

# One hot encoding

In [None]:
#Transforming the One hot encoding on station and measure attributes

one_hot_station = pd.get_dummies(df_train_scaled['Station'])
one_hot_measure = pd.get_dummies(df_train_scaled['measure'])

df_train_processed = df_train_scaled.join(one_hot_station)
df_train_processed = df_train_processed.join(one_hot_measure)

one_hot_station = pd.get_dummies(df_val_scaled['Station'])
one_hot_measure = pd.get_dummies(df_val_scaled['measure'])

df_val_processed = df_val_scaled.join(one_hot_station)
df_val_processed = df_val_processed.join(one_hot_measure)

# Sampling

In [None]:
# Here we have used multiple techniques like 
# SMOTE
# SMOTEENN
# SMOTETomek
# RandomUnderSampler
# CondensedNearestNeighbour
# OneSidedSelection

#But the best AUC-PR result came out when no sampling is provided

from imblearn.over_sampling import SMOTE 
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule
from imblearn.under_sampling import OneSidedSelection
print(Counter(y_train))

df_train_final = df_train_processed.drop(['Station','measure','target'],axis = 1)
# df_train_final = df_train_scaled[['value', 'R_flag', 'I_flag', 'Z_flag', 'B_flag']]
# sm = SMOTE(random_state=1)
# x_train_sm, y_train_sm = sm.fit_resample(df_train_final, y_train)
# smt = SMOTETomek(random_state=42)
# x_train_sm, y_train_sm = smt.fit_resample(df_train_final, y_train)

# ncr = NeighbourhoodCleaningRule(random_state=42) 
# x_train_sm, y_train_sm = ncr.fit_resample(df_train_final, y_train) 


# rus = RandomUnderSampler(random_state=42, sampling_strategy = 0.5)
# x_train_sm, y_train_sm = rus.fit_resample(df_train_final, y_train)


x_train_sm = df_train_final
y_train_sm = y_train


# print(Counter(y_train_sm))

# oss = OneSidedSelection(random_state=42)
# x_train_sm, y_train_sm = oss.fit_resample(x_train_sm, y_train_sm)

print(len(x_train_sm), len(y_train_sm))

print(Counter(y_train_sm))

Counter({False: 5086350, True: 188269})
5274619 5274619
Counter({False: 5086350, True: 188269})


In [None]:
df_val_final = df_val_processed.drop(['Station','measure','target'],axis = 1)
print(len(x_train_sm))
print(len(y_train_sm))
print(len(df_val_final))
print(len(y_val))

5274619
5274619
1318655
1318655


In [None]:
x_train_sm, df_val_final = x_train_sm.align(df_val_final, join='inner', axis=1)  # inner join

# Random Forest (10-100)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report
for i in range(10,101,20):
    print("Value of i", i)
    random_forst_model = RandomForestClassifier(n_estimators=i, random_state=1)
    saved_model_RF = 'random_forest_model_20.sav'
    # pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
    random_forst_model.fit(x_train_sm, y_train_sm)
    y_pred = random_forst_model.predict(df_val_final)
    print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
    print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
    print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
    print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
    print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
    print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred, zero_division = 1))
    print("------------------------------------------------------------------------")

Value of i 10
F1 Score: 0.9946021118609605
Accuracy Score: 0.9992613685914815
Precision score:  0.9979885171832589
Recall score:  0.9912650443761564
Accuracy Score: 0.9992613685914815
Confusion Matrix: [[1271399     157]
 [    817   46282]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271556
        True       1.00      0.98      0.99     47099

    accuracy                           1.00   1318655
   macro avg       1.00      0.99      0.99   1318655
weighted avg       1.00      1.00      1.00   1318655

------------------------------------------------------------------------
Value of i 30
F1 Score: 0.9952953245129154
Accuracy Score: 0.9993554038016009
Precision score:  0.9980417261037201
Recall score:  0.9925814205200235
Accuracy Score: 0.9993554038016009
Confusion Matrix: [[1271399     157]
 [    693   46406]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271556
        T

# Random Forest (100)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report
random_forst_model = RandomForestClassifier(n_estimators=100, random_state=1)
saved_model_RF = 'random_forest_model_20.sav'
# pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
random_forst_model.fit(x_train_sm, y_train_sm)
y_pred = random_forst_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred, zero_division = 1))
print("------------------------------------------------------------------------")

F1 Score: 0.995768732133641
Accuracy Score: 0.9994221384668469
Precision score:  0.9981239156426818
Recall score:  0.9934374575829741
Accuracy Score: 0.9994221384668469
Confusion Matrix: [[1271600     152]
 [    610   46293]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271752
        True       1.00      0.99      0.99     46903

    accuracy                           1.00   1318655
   macro avg       1.00      0.99      1.00   1318655
weighted avg       1.00      1.00      1.00   1318655

------------------------------------------------------------------------


# Random Forest (200)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report
random_forst_model = RandomForestClassifier(n_estimators=200, random_state=1)
saved_model_RF = 'random_forest_model_20.sav'
# pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
random_forst_model.fit(x_train_sm, y_train_sm)
y_pred = random_forst_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred, zero_division = 1))
print("------------------------------------------------------------------------")

F1 Score: 0.9956442978590581
Accuracy Score: 0.9994046964520666
Precision score:  0.998002131968464
Recall score:  0.993310433978589
Accuracy Score: 0.9994046964520666
Confusion Matrix: [[1271552     163]
 [    622   46318]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271715
        True       1.00      0.99      0.99     46940

    accuracy                           1.00   1318655
   macro avg       1.00      0.99      1.00   1318655
weighted avg       1.00      1.00      1.00   1318655

------------------------------------------------------------------------


# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import pickle
from sklearn.metrics import classification_report
gradient_boosting_model = GradientBoostingClassifier(n_estimators=300, random_state=1, learning_rate=1.0,)
# pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
gradient_boosting_model.fit(x_train_sm, y_train_sm)
y_pred = gradient_boosting_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred, zero_division = 1))
print("------------------------------------------------------------------------")

F1 Score: 0.9635301658174719
Accuracy Score: 0.994645301462475
Precision score:  0.9368048104738054
Recall score:  0.9940953289290946
Accuracy Score: 0.994645301462475
Confusion Matrix: [[1264801    6755]
 [    306   46793]]
              precision    recall  f1-score   support

       False       1.00      0.99      1.00   1271556
        True       0.87      0.99      0.93     47099

    accuracy                           0.99   1318655
   macro avg       0.94      0.99      0.96   1318655
weighted avg       1.00      0.99      0.99   1318655

------------------------------------------------------------------------


# Balanced Random Forest Classifier

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
import pickle
from sklearn.metrics import classification_report
BalancedRandomForestClassifier_model = BalancedRandomForestClassifier()
# pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
BalancedRandomForestClassifier_model.fit(x_train_sm, y_train_sm)
y_pred = BalancedRandomForestClassifier_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred, zero_division = 1))
print("------------------------------------------------------------------------")

F1 Score: 0.9726799758199793
Accuracy Score: 0.9960558296142661
Precision score:  0.950805365312211
Recall score:  0.997001062057814
Accuracy Score: 0.9960558296142661
Confusion Matrix: [[1266607    5108]
 [     93   46847]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271715
        True       0.90      1.00      0.95     46940

    accuracy                           1.00   1318655
   macro avg       0.95      1.00      0.97   1318655
weighted avg       1.00      1.00      1.00   1318655

------------------------------------------------------------------------


# Random Forest (1-9)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report
for i in range(1,10,2):
    print("Value of i", i)
    random_forst_model = RandomForestClassifier(n_estimators=i, random_state=1)
    saved_model_RF = 'random_forest_model_20.sav'
    # pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
    random_forst_model.fit(x_train_sm, y_train_sm)
    y_pred = random_forst_model.predict(df_val_final)
    print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
    print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
    print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
    print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
    print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
    print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred, zero_division = 1))
    print("------------------------------------------------------------------------")

Value of i 1
F1 Score: 0.9892799818057991
Accuracy Score: 0.9985242538799004
Precision score:  0.9896507187945156
Recall score:  0.9889098509898371
Accuracy Score: 0.9985242538799004
Confusion Matrix: [[1270620     936]
 [   1010   46089]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271556
        True       0.98      0.98      0.98     47099

    accuracy                           1.00   1318655
   macro avg       0.99      0.99      0.99   1318655
weighted avg       1.00      1.00      1.00   1318655

------------------------------------------------------------------------
Value of i 3
F1 Score: 0.9930924402156509
Accuracy Score: 0.999052064414119
Precision score:  0.9950241287628352
Recall score:  0.9911769613628018
Accuracy Score: 0.999052064414119
Confusion Matrix: [[1271121     435]
 [    815   46284]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1271556
        True 

# KNN

In [None]:
#This code is commented as it took too much of time to run

# from sklearn.neighbors import KNeighborsClassifier
# import pickle
# from sklearn.metrics import classification_report

# KNN_Classifier_1 = KNeighborsClassifier(n_neighbors=1)
# KNN_Classifier_1.fit(x_train_sm, y_train_sm)
# # saved_model_KNN = 'KNN_smote.sav'
# # pickle.dump(KNN_Classifier_1, open(saved_model_KNN, 'wb'))
# y_pred_KNN = KNN_Classifier_1.predict(df_val_final)
# print('F1 Score:', metrics.f1_score(y_val, y_pred_KNN, average='macro'))
# print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_KNN))
# print('Precision score: ', metrics.precision_score(y_val, y_pred_KNN, average='macro'))
# print('Recall score: ', metrics.recall_score(y_val, y_pred_KNN, average='macro'))
# print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_KNN))
# print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred_KNN))
# print(classification_report(y_val, y_pred_KNN, zero_division = 1))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

reg = LogisticRegression(random_state=0).fit(x_train_sm, y_train_sm)
y_pred_KNN = reg.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred_KNN, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_KNN))
print('Precision score: ', metrics.precision_score(y_val, y_pred_KNN, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred_KNN, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_KNN))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred_KNN))
print(classification_report(y_val, y_pred_KNN, zero_division = 1))

F1 Score: 0.7968016700911662
Accuracy Score: 0.9787798931487007
Precision score:  0.9435570886636494
Recall score:  0.7257940802275427
Accuracy Score: 0.9787798931487007
Confusion Matrix: [[1269288    2192]
 [  25790   21385]]
              precision    recall  f1-score   support

       False       0.98      1.00      0.99   1271480
        True       0.91      0.45      0.60     47175

    accuracy                           0.98   1318655
   macro avg       0.94      0.73      0.80   1318655
weighted avg       0.98      0.98      0.98   1318655



# SVM

In [None]:
from sklearn import svm
clf = svm.SVC(C=10, kernel='rbf', gamma = 1, class_weight='balanced',max_iter=1000)
clf.fit(x_train_sm, y_train_sm)
y_pred_KNN = clf.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred_KNN, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_KNN))
print('Precision score: ', metrics.precision_score(y_val, y_pred_KNN, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred_KNN, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_KNN))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred_KNN))
print(classification_report(y_val, y_pred_KNN, zero_division = 1))

F1 Score: 0.5593981627162983
Accuracy Score: 0.9664756892439645
Precision score:  0.8918449442776004
Recall score:  0.5367485354644445
Accuracy Score: 0.9664756892439645
Confusion Matrix: [[1270972     780]
 [  43427    3476]]
              precision    recall  f1-score   support

       False       0.97      1.00      0.98   1271752
        True       0.82      0.07      0.14     46903

    accuracy                           0.97   1318655
   macro avg       0.89      0.54      0.56   1318655
weighted avg       0.96      0.97      0.95   1318655



# XGBoost

In [None]:
from xgboost import XGBClassifier
# XGBClassifier(learning_rate=1, n_estimators=1300,eta = 0.7, max_depth= 3,  objective= 'multi:softprob',  num_class= 3)
XGB_model = XGBClassifier(learning_rate=1, n_estimators=100)
XGB_model.fit(x_train_sm, y_train_sm)
# pickle.dump(XGB_model, open(saved_model_XGB, 'wb'))
y_pred_XGB = XGB_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred_XGB, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_XGB))
print('Precision score: ', metrics.precision_score(y_val, y_pred_XGB, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred_XGB, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_XGB))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred_XGB))
print(classification_report(y_val, y_pred_XGB, zero_division = 1))

F1 Score: 0.6537817271274008
Accuracy Score: 0.9228661022026231
Precision score:  0.6165591188691671
Recall score:  0.7566951059807931
Accuracy Score: 0.9228661022026231
Confusion Matrix: [[1189731   81825]
 [  19888   27211]]
              precision    recall  f1-score   support

       False       0.98      0.94      0.96   1271556
        True       0.25      0.58      0.35     47099

    accuracy                           0.92   1318655
   macro avg       0.62      0.76      0.65   1318655
weighted avg       0.96      0.92      0.94   1318655



# Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
df_val_final = df_val_processed.drop(['Station','measure','target'],axis = 1)

# XGBClassifier(learning_rate=1, n_estimators=1300,eta = 0.7, max_depth= 3,  objective= 'multi:softprob',  num_class= 3)
df_train_final, df_val_final = df_train_final.align(df_val_final, join='inner', axis=1)  # inner join

isolation_model=IsolationForest(n_estimators=100,max_samples='auto',contamination=0.03, random_state=42)

isolation_model.fit(df_train_final, y_train)
# pickle.dump(XGB_model, open(saved_model_XGB, 'wb'))
y_pred_XGB = isolation_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred_XGB, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_XGB))
print('Precision score: ', metrics.precision_score(y_val, y_pred_XGB, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred_XGB, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_XGB))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred_XGB))
print(classification_report(y_val, y_pred_XGB, zero_division = 1))

F1 Score: 0.01982873968452153
Accuracy Score: 0.029916088741937807
Precision score:  0.010279055418150588
Recall score:  0.27944718349767655
Accuracy Score: 0.029916088741937807
Confusion Matrix: [[      0       0       0]
 [  31780       0 1239819]
 [   7607       0   39449]]
              precision    recall  f1-score   support

          -1       0.00      1.00      0.00         0
           0       1.00      0.00      0.00   1271599
           1       0.03      0.84      0.06     47056

    accuracy                           0.03   1318655
   macro avg       0.34      0.61      0.02   1318655
weighted avg       0.97      0.03      0.00   1318655



# AdaBoost

In [None]:
# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier
ada_model = AdaBoostClassifier(n_estimators=100, random_state=1)
ada_model.fit(x_train_sm, y_train_sm)
# saved_model_XGB = 'XGB_smote.sav'
# pickle.dump(XGB_model, open(saved_model_XGB, 'wb'))
y_pred_XGB = ada_model.predict(df_val_final)
print('F1 Score:', metrics.f1_score(y_val, y_pred_XGB, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_XGB))
print('Precision score: ', metrics.precision_score(y_val, y_pred_XGB, average='macro'))
print('Recall score: ', metrics.recall_score(y_val, y_pred_XGB, average='macro'))
print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred_XGB))
print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred_XGB))
print(classification_report(y_val, y_pred_XGB, zero_division = 1))

F1 Score: 0.8520582136000665
Accuracy Score: 0.9817101516317763
Precision score:  0.8977469866292793
Recall score:  0.8164118370594116
Accuracy Score: 0.9817101516317763
Confusion Matrix: [[1264496    7103]
 [  17015   30041]]
              precision    recall  f1-score   support

       False       0.99      0.99      0.99   1271599
        True       0.81      0.64      0.71     47056

    accuracy                           0.98   1318655
   macro avg       0.90      0.82      0.85   1318655
weighted avg       0.98      0.98      0.98   1318655



# Predicting labels on test.csv

The attribute I will predict is: target

In [None]:
X_train = df_train[['value', 'R_flag', 'I_flag', 'Z_flag', 'B_flag','measure','Station','Month','Min','Day','Hour','target']]
Y_train = df_train["target"]

X_test = df_test[['value', 'R_flag', 'I_flag', 'Z_flag', 'B_flag','measure','Station','Month','Min','Day','Hour']]


In [None]:
#Applying One Hot Encoding on Test Dataset
one_hot_station = pd.get_dummies(X_train['Station'])
one_hot_measure = pd.get_dummies(X_train['measure'])

X_train_processed = X_train.join(one_hot_station)
X_train_processed = X_train_processed.join(one_hot_measure)

one_hot_station = pd.get_dummies(X_test['Station'])
one_hot_measure = pd.get_dummies(X_test['measure'])

X_test_processed = X_test.join(one_hot_station)
X_test_processed = X_test_processed.join(one_hot_measure)

In [None]:
#Dropping unused and predicted attributes
X_train_final = X_train_processed.drop(['Station','measure','target'],axis = 1)
X_test_final = X_test_processed.drop(['Station','measure'],axis = 1)

In [None]:
#Same sampling technique is applied which was used during training the model.

from imblearn.over_sampling import SMOTE 
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler 
# from imblearn.under_sampling import     OneSidedSelection

print(Counter(Y_train))


# x_train_sm = df_train_final
# y_train_sm = y_train



# rus = RandomUnderSampler(random_state=42, sampling_strategy = 0.1)
# X_train_final, Y_train = rus.fit_resample(X_train_final, Y_train)
# print(Counter(Y_train))

# oss = OneSidedSelection(random_state=42, n_jobs=4)
# ncl = NeighbourhoodCleaningRule(n_neighbors=1)

# X_train_final, Y_train = oss.fit_resample(X_train_final, Y_train)
# X_train_temp, Y_train_temp = ncl.fit_resample(X_train_final, Y_train)

print(len(X_train_final), len(Y_train))

print(Counter(Y_train))

Counter({False: 6358102, True: 235172})
6593274 6593274
Counter({False: 6358102, True: 235172})


In [None]:
X_train_final, X_test_final = X_train_final.align(X_test_final, join='inner', axis=1)  # inner join

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report

random_forst_model = RandomForestClassifier(n_estimators=100, random_state=1)
saved_model_RF = 'random_forest_model_20.sav'
# pickle.dump(random_forst_model, open(saved_model_RF, 'wb'))
random_forst_model.fit(X_train_final, Y_train)
y_pred_random = random_forst_model.predict_proba(X_test_final)
# print('F1 Score:', metrics.f1_score(y_val, y_pred, average='macro'))
# print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
# print('Precision score: ', metrics.precision_score(y_val, y_pred, average='macro'))
# print('Recall score: ', metrics.recall_score(y_val, y_pred, average='macro'))
# print('Accuracy Score:', metrics.accuracy_score(y_val, y_pred))
# print('Confusion Matrix:', metrics.confusion_matrix(y_val, y_pred))
# print(classification_report(y_val, y_pred, zero_division = 1))

In [None]:
df_test_target_True = 0
df_test_target_False = 0

y_pred_final =np.round(y_pred_random[:,1], 0)
# y_pred_balanced1 =np.round(y_pred_balanced, 0)

# y_pred_final = (y_pred_random1 + y_pred_balanced1)/2
for x in y_pred_final:
  if int(x) == 1:
    df_test_target_True = df_test_target_True + 1
  else:
    df_test_target_False = df_test_target_False + 1

print((df_test_target_True))
print((df_test_target_False))

62844
1793262


In [None]:
y_pred_final

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
62844/(62844+1793262)
62844+1793262

1856106

In [None]:
pd.DataFrame(y_pred_random[:,1], columns=['target']).to_csv('/content/drive/MyDrive/project_ALDA/ECONet/predictions.csv', index=False)