In [None]:
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer, precision_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

### Suppressing scikit-learn's undefined-metric warnings

In [None]:
import warnings

from sklearn.exceptions import UndefinedMetricWarning

# Silence UndefinedMetricWarning for the rest of the session; the filter is process-wide,
# so it affects every cell executed after this one.
warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)

In [None]:
# Load the dataset from data.csv (the path is relative to the kernel's working directory).
data = pd.read_csv("data.csv")
# Show the loaded frame as the cell output.
data

### Splitting the Timestamp column into Month, Day, Hour, and Minute columns

In [None]:
# Convert Timestamp to datetimes, then derive the calendar/clock parts from the converted
# column. assign() evaluates its keyword arguments in order, so every lambda after the
# first one already sees the converted Timestamp.
data = data.assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"]),
    Month=lambda frame: frame["Timestamp"].dt.month,
    Day=lambda frame: frame["Timestamp"].dt.day,
    Hour=lambda frame: frame["Timestamp"].dt.hour,
    Minute=lambda frame: frame["Timestamp"].dt.minute,
)
data

In [None]:
# Build the working frame without "Unnamed: 0" and without the raw Timestamp, which has
# already been expanded into Month/Day/Hour/Minute. drop() returns a new frame, so `data`
# itself keeps both columns.
new_data = data.drop(columns=["Unnamed: 0", "Timestamp"])
new_data.columns

In [None]:
# Keep only fully populated rows: a single NaN anywhere in a row removes that row.
new_data = new_data.dropna(axis="index", how="any")
new_data

In [None]:
# List the columns that remain after the column drop and the NaN-row removal above.
new_data.columns

In [None]:
# Display the working frame.
new_data

### Checking whether there are any NaN values

In [None]:
# A notebook cell only renders its last expression, so the distinct target labels are
# printed explicitly; as a bare expression on the first line they were silently discarded.
print(new_data["y"].unique())
# Per-column flag: True when the column still contains at least one NaN.
new_data.isnull().any()

In [None]:
# Display the working frame again after the NaN check.
new_data

In [None]:
# NaN rows were already removed by the earlier dropna() cell and new_data has only been
# displayed since then, so this repeat leaves the frame unchanged.
new_data = new_data.dropna()

In [None]:



# Target column and "everything except the target" as an initial feature matrix.
# X and y are both reassigned by the feature-selection cell that follows.
y = new_data["y"]
X = new_data.drop(columns=["y"])


### Selecting top 6 features based on highest correlation with the target column

In [None]:
from sklearn.preprocessing import StandardScaler

# Fixed list of the six feature columns used by every model below.
features = ["R_VALUE", "USFLUX", "TOTFZ", "TOTUSJH", "TOTBSQ", "TOTUSJZ"]

scaler = StandardScaler()

# NOTE(review): the scaler is fitted on all rows before cross-validation, so each held-out
# fold contributes to the mean/variance it is later scaled with. Fitting the scaler inside
# a Pipeline per fold would avoid this — confirm whether that matters for the reported scores.
X = scaler.fit_transform(new_data[features])

y = new_data["y"]

### Define 5-fold cross-validation

In [None]:
# Splitter shared by every cross-validation call below: five shuffled folds with a fixed
# seed, so each model is evaluated on the same partitions.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


### Precision, recall, and F-score are averaged across classes using the weighted approach

In [None]:
# Class-weighted precision/recall/F1 scorers plus plain accuracy.
# zero_division=0 makes the handling of undefined metrics (a class with no predicted or no
# true samples in a fold) explicit: the metric is scored as 0, which is the same value the
# default "warn" setting uses, but without emitting UndefinedMetricWarning.
precision_scorer = make_scorer(precision_score, average="weighted", zero_division=0)
recall_scorer = make_scorer(recall_score, average="weighted", zero_division=0)
fScore_scorer = make_scorer(f1_score, average="weighted", zero_division=0)
accuracy_scorer = make_scorer(accuracy_score)

# Decision tree

In [None]:
# A fixed random_state pins how the tree breaks ties between equally good splits, so
# re-running the notebook reproduces the same scores.
dt = DecisionTreeClassifier(max_depth=10, random_state=42)

# A single cross_validate pass fits the tree once per fold and scores all four metrics on
# that same fitted tree. Separate cross_val_score calls refit the model for every metric,
# which is four times the work and, for an unseeded tree, mixes metrics from different trees.
cv_results = cross_validate(
    dt,
    X,
    y,
    cv=kfold,
    scoring={
        "accuracy": accuracy_scorer,
        "precision": precision_scorer,
        "recall": recall_scorer,
        "f_score": fScore_scorer,
    },
)
accuracy_results = cv_results["test_accuracy"]
precision_results = cv_results["test_precision"]
recall_results = cv_results["test_recall"]
fScore_results = cv_results["test_f_score"]

print("Average accuracy: " + str(np.mean(accuracy_results)))
print("Average precision: " + str(np.mean(precision_results)))
print("Average recall: " + str(np.mean(recall_results)))
print("Average f_score: " + str(np.mean(fScore_results)))

# KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)

# Score all four metrics in one cross-validation pass over the shared folds instead of
# refitting the model once per metric with separate cross_val_score calls.
cv_results = cross_validate(
    knn,
    X,
    y,
    cv=kfold,
    scoring={
        "accuracy": accuracy_scorer,
        "precision": precision_scorer,
        "recall": recall_scorer,
        "f_score": fScore_scorer,
    },
)
accuracy_results = cv_results["test_accuracy"]
precision_results = cv_results["test_precision"]
recall_results = cv_results["test_recall"]
fScore_results = cv_results["test_f_score"]

print("Average accuracy: " + str(np.mean(accuracy_results)))
print("Average precision: " + str(np.mean(precision_results)))
print("Average recall: " + str(np.mean(recall_results)))
print("Average f_score: " + str(np.mean(fScore_results)))


# Naive bayes

In [None]:
naive_bayes = GaussianNB()

# Score all four metrics in one cross-validation pass over the shared folds instead of
# refitting the model once per metric with separate cross_val_score calls.
cv_results = cross_validate(
    naive_bayes,
    X,
    y,
    cv=kfold,
    scoring={
        "accuracy": accuracy_scorer,
        "precision": precision_scorer,
        "recall": recall_scorer,
        "f_score": fScore_scorer,
    },
)
accuracy_results = cv_results["test_accuracy"]
precision_results = cv_results["test_precision"]
recall_results = cv_results["test_recall"]
fScore_results = cv_results["test_f_score"]

print("Average accuracy: " + str(np.mean(accuracy_results)))
print("Average precision: " + str(np.mean(precision_results)))
print("Average recall: " + str(np.mean(recall_results)))
print("Average f_score: " + str(np.mean(fScore_results)))

# Logistic Regression

In [None]:
# max_iter is raised to 1000 to give the lbfgs solver more iterations to converge.
lr = LogisticRegression(solver="lbfgs", max_iter=1000)

# Score all four metrics in one cross-validation pass over the shared folds instead of
# refitting the model once per metric with separate cross_val_score calls.
cv_results = cross_validate(
    lr,
    X,
    y,
    cv=kfold,
    scoring={
        "accuracy": accuracy_scorer,
        "precision": precision_scorer,
        "recall": recall_scorer,
        "f_score": fScore_scorer,
    },
)
accuracy_results = cv_results["test_accuracy"]
precision_results = cv_results["test_precision"]
recall_results = cv_results["test_recall"]
fScore_results = cv_results["test_f_score"]

print("Average accuracy: " + str(np.mean(accuracy_results)))
print("Average precision: " + str(np.mean(precision_results)))
print("Average recall: " + str(np.mean(recall_results)))
print("Average f_score: " + str(np.mean(fScore_results)))

# Soft ensemble learning

### Used brute force to find the best weights

In [None]:
# Base learners of the soft-voting ensemble; the weight lists below follow this order.
base_learners = [
    ("lr", lr),
    ("nv", naive_bayes),
    ("knn", knn),
    ("dt", dt),
]

# Set to True to re-run the exhaustive weight search: every combination of weights 1..4
# for the four learners (4**4 = 256 ensembles, each cross-validated), which is slow.
RUN_WEIGHT_SEARCH = False

if RUN_WEIGHT_SEARCH:
    # best_records = [best mean weighted F-score, w_lr, w_nv, w_knn, w_dt]
    best_records = [0, 0, 0, 0, 0]
    for candidate_weights in product(range(1, 5), repeat=len(base_learners)):
        candidate = VotingClassifier(
            estimators=base_learners,
            voting="soft",
            weights=list(candidate_weights),
        )
        candidate_f_score = np.mean(
            cross_val_score(candidate, X, y, cv=kfold, scoring=fScore_scorer)
        )
        # Strict ">" keeps the first combination found when several tie.
        if candidate_f_score > best_records[0]:
            best_records = [candidate_f_score, *candidate_weights]
    print(best_records)

# The search above selected weights [1, 1, 3, 2] as giving the highest weighted F-score.
# NOTE(review): the weights were chosen on the same folds that are used for the scores
# reported below, so those scores are likely optimistic — confirm on held-out data.
ensemble_classifier = VotingClassifier(
    estimators=base_learners,
    voting="soft",
    weights=[1, 1, 3, 2],
)

# One cross_validate pass fits the ensemble once per fold and scores all four metrics on
# the same fitted models, instead of refitting everything once per metric.
cv_results = cross_validate(
    ensemble_classifier,
    X,
    y,
    cv=kfold,
    scoring={
        "accuracy": accuracy_scorer,
        "precision": precision_scorer,
        "recall": recall_scorer,
        "f_score": fScore_scorer,
    },
)
accuracy_results = cv_results["test_accuracy"]
precision_results = cv_results["test_precision"]
recall_results = cv_results["test_recall"]
fScore_results = cv_results["test_f_score"]

print("Average accuracy: " + str(np.mean(accuracy_results)))
print("Average precision: " + str(np.mean(precision_results)))
print("Average recall: " + str(np.mean(recall_results)))
print("Average f_score: " + str(np.mean(fScore_results)))

# Per-fold scores behind the averages above.
print("Accuracy: " + str(accuracy_results))
print("Precision: " + str(precision_results))
print("Recall: " + str(recall_results))
print("F score: " + str(fScore_results))




    
    
