In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt 
import pandas_profiling as pp
import seaborn as sns

# Read data

In [None]:
# Location of the zipped Kaggle training CSV, relative to the notebook.
data_file_path = "../input/predict-west-nile-virus/train.csv.zip"
# Load the whole training set into one DataFrame; every later cell works on `data`.
data = pd.read_csv(data_file_path)
# Preview the first rows of the raw frame.
data.head()

In [None]:
def create_year(data):
    """Return the first '-'-separated field of a date string (the year in 'YYYY-MM-DD')."""
    fields = data.split('-')
    return fields[0]

def create_month(data):
    """Return the second '-'-separated field of a date string (the month in 'YYYY-MM-DD')."""
    fields = data.split('-')
    return fields[1]

def create_day(data):
    """Return the third '-'-separated field of a date string (the day in 'YYYY-MM-DD')."""
    fields = data.split('-')
    return fields[2]

# Split date to day, month, year. Drop date column

In [None]:
# Break the 'Date' string into day / month / year feature columns.
# The helpers return text (e.g. '05'), so cast to int: otherwise the columns
# stay object-dtype strings and every downstream estimator has to rely on
# implicit string-to-number coercion.
data['day'] = data.Date.apply(create_day).astype(int)
data['month'] = data.Date.apply(create_month).astype(int)
data['year'] = data.Date.apply(create_year).astype(int)
# The raw string is redundant once its parts are stored.
data = data.drop(['Date'], axis = 1)
data.head()

# Drop Address and AddressNumberAndStreet - won't use those

In [None]:
# Remove the two address columns that are not used as features.
unused_address_columns = ['Address', 'AddressNumberAndStreet']
data = data.drop(columns=unused_address_columns)
data.head()

# Assign numeric values to categorical features

In [None]:
# Replace each categorical text column with integer codes.
# One encoder object is refitted for every column, so after this cell `lbl`
# only holds the mapping of the last column ('Trap').
lbl = LabelEncoder()
for column in ['Species', 'Street', 'Trap']:
    data[column] = lbl.fit_transform(list(data[column].values))
data.head()

In [None]:
# Render a profiling report of the encoded frame as the cell output.
# NOTE(review): minimal=True presumably selects the library's reduced report mode - confirm in its docs.
pp.ProfileReport(data, minimal = True)

# WnvPresent is our target, everything else is a feature

In [None]:
# Separate the predictors from the label column.
features = data.drop(columns=['WnvPresent'])
target = data['WnvPresent'].values
# Histogram of the raw labels, to show the class balance.
sns.histplot(target)

# SMOTE - Synthetic Minority Oversampling Technique
An algorithm to balance out the data

In [None]:
# Oversample the minority class with synthetic samples.
# Seeded like every other stochastic step in this notebook (random_state = 42),
# so Restart & Run All generates the same synthetic rows each time.
smote = SMOTE(random_state = 42)
x_smote, y_smote = smote.fit_resample(features, target)
# Histogram of the resampled labels, to compare against the raw class balance.
sns.histplot(y_smote)

# Split the data

In [None]:
# Hold out 20% of the ORIGINAL rows before any resampling.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 42, shuffle = True)
# Oversample the training portion only. Resampling before the split leaks the
# held-out rows into training (synthetic points are interpolated from them, and
# the rows themselves can land in the oversampled training set), which inflates
# every score computed on x_test or on a synthetic test set.
xs_train, ys_train = SMOTE(random_state = 42).fit_resample(x_train, y_train)
# Models trained on oversampled data are scored on the same real held-out rows.
xs_test, ys_test = x_test, y_test

# Random forest, different criteria and numbers of estimators
## Using the original dataset

In [None]:
# Grid over split criterion x forest size, fitted on the original training
# split and scored by accuracy on x_test.
criterions = ['gini', 'entropy']
all_results = []  # one row per run: [fitted forest, predictions, accuracy, n_estimators]
for criterion in criterions:
    print("Using", criterion)
    for estimators in range(10, 201, 10):
        print("\t{} estimators".format(estimators), end = " ")
        forest = RandomForestClassifier(
            n_estimators=estimators,
            criterion=criterion,
            random_state=42,
        ).fit(x_train, y_train)
        y_pred = forest.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        print("score = {}".format(score))
        all_results.append([forest, y_pred, score, estimators])

# max() returns the first of several equal maxima, so the winner is the same
# run that a strict "score > best_score" update inside the loop would keep.
best_row = max(all_results, key=lambda row: row[2])
best_forest, best_pred, best_score = best_row[0], best_row[1], best_row[2]
all_results = np.array(all_results, dtype=object)

## Using data after oversampling

In [None]:
# Same criterion x forest-size grid as for the original data, but fitted on
# xs_train / ys_train and scored by accuracy on xs_test / ys_test.
criterions = ['gini', 'entropy']
all_results_smote = []  # one row per run: [fitted forest, predictions, accuracy, n_estimators]
for criterion in criterions:
    print("Using", criterion)
    for estimators in range(10, 201, 10):
        print("\t{} estimators".format(estimators), end = " ")
        forest = RandomForestClassifier(
            n_estimators=estimators,
            criterion=criterion,
            random_state=42,
        ).fit(xs_train, ys_train)
        ys_pred = forest.predict(xs_test)
        score = accuracy_score(ys_test, ys_pred)
        all_results_smote.append([forest, ys_pred, score, estimators])
        print("score = {}".format(score))

# First-best run wins on ties, exactly like a strict ">" comparison in the loop.
best_row_smote = max(all_results_smote, key=lambda row: row[2])
best_forest_smote, best_pred_smote, best_score_smote = best_row_smote[0], best_row_smote[1], best_row_smote[2]
all_results_smote = np.array(all_results_smote, dtype=object)

In [None]:
# Each results array holds 20 gini runs (10..200 estimators) followed by
# 20 entropy runs; column 3 is the estimator count, column 2 the accuracy.
gini_rows = slice(None, 20)
entropy_rows = slice(20, None)
plt.plot(all_results[gini_rows, 3], all_results[gini_rows, 2], label = "Gini")
plt.plot(all_results[entropy_rows, 3], all_results[entropy_rows, 2], label = 'Entropy')
plt.plot(all_results_smote[gini_rows, 3], all_results_smote[gini_rows, 2], label = "Gini - smote")
plt.plot(all_results_smote[entropy_rows, 3], all_results_smote[entropy_rows, 2], label = 'Entropy - smote')
plt.title("Scores on random forest by number of estimators(using smote)")
plt.xlabel("number of estimators")
plt.ylabel("score")
plt.legend()
plt.show()

In [None]:
# Summary of the best forest found on the original data.
print("\t\tOriginal data")
n_trees = len(best_forest.estimators_)
print("Best score using criterion={} with {} estimators".format(best_forest.criterion, n_trees))
print(classification_report(y_test,best_pred))
print("Accuracy score", accuracy_score(y_test, best_pred))
# Confusion matrix shown as a share of all test samples.
cm = confusion_matrix(y_test,best_pred)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')

In [None]:
# Summary of the best forest found on the oversampled data.
print("\t\tafter smote")
n_trees_smote = len(best_forest_smote.estimators_)
print("Best score using criterion={} with {} estimators".format(best_forest_smote.criterion, n_trees_smote))
print(classification_report(ys_test,best_pred_smote))
print("Accuracy score", accuracy_score(ys_test, best_pred_smote))
# Confusion matrix shown as a share of all evaluated samples.
cm = confusion_matrix(ys_test,best_pred_smote)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')


# KNN - different K's
## Original dataset

In [None]:
# Sweep the Minkowski power p and the neighbour count k on the original
# training split, keeping the model with the highest accuracy on x_test.
best_score, best_pred, best_knn = -1, [], None
all_results = []  # one row per run: [k, accuracy]; 29 rows for p=1, then 29 for p=2
for p in [1, 2]:
    print("Using l{}".format(p))
    for k in range(2, 31):
        print("\tUsing k={}".format(k), end = ' ')
        knn = KNeighborsClassifier(n_neighbors=k, p=p).fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        print("score = {}".format(score))
        all_results.append([k, score])
        # Strict comparison: the earliest run wins when scores tie.
        if score > best_score:
            best_score, best_pred, best_knn = score, y_pred, knn
all_results = np.array(all_results, dtype=object)

## Using data after oversampling

In [None]:
# Same p x k sweep as for the original data, fitted on xs_train / ys_train
# and scored by accuracy on xs_test / ys_test.
best_score_smote, best_pred_smote, best_knn_smote = -1, [], None
all_results_smote = []  # one row per run: [k, accuracy]; 29 rows for p=1, then 29 for p=2
for p in [1, 2]:
    print("Using l{}".format(p))
    for k in range(2, 31):
        print("\tUsing k={}".format(k), end = ' ')
        knn = KNeighborsClassifier(n_neighbors=k, p=p).fit(xs_train, ys_train)
        ys_pred = knn.predict(xs_test)
        score = accuracy_score(ys_test, ys_pred)
        print("score = {}".format(score))
        all_results_smote.append([k, score])
        # Strict comparison: the earliest run wins when scores tie.
        if score > best_score_smote:
            best_score_smote, best_pred_smote, best_knn_smote = score, ys_pred, knn
all_results_smote = np.array(all_results_smote, dtype=object)

In [None]:
# Each results array holds 29 rows (k = 2..30) for p=1 followed by 29 rows
# for p=2; column 0 is k, column 1 the accuracy.
l1_rows = slice(None, 29)
l2_rows = slice(29, None)
plt.plot(all_results[l1_rows, 0], all_results[l1_rows, 1], label = "l1")
plt.plot(all_results[l2_rows, 0], all_results[l2_rows, 1], label = 'l2')
plt.plot(all_results_smote[l1_rows, 0], all_results_smote[l1_rows, 1], label = 'l1 - smote')
plt.plot(all_results_smote[l2_rows, 0], all_results_smote[l2_rows, 1], label = 'l2 - smote')
plt.title("Scores on KNN by k value")
plt.xlabel("k")
plt.ylabel("score")
plt.xticks(range(2, 31))
plt.legend()
plt.show()

In [None]:
# Summary of the best KNN model found on the original data.
print("\t\tOriginal data")
print("Best score using l{} with {} neighbors".format(best_knn.p, best_knn.n_neighbors))
print(classification_report(y_test,best_pred))
print("Accuracy score", accuracy_score(y_test, best_pred))
# Confusion matrix shown as a share of all test samples.
cm = confusion_matrix(y_test,best_pred)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')

In [None]:
# Summary of the best KNN model found on the oversampled data.
print("\t\tAfter smote")
print("Best score using l{} with {} neighbors".format(best_knn_smote.p, best_knn_smote.n_neighbors))
print(classification_report(ys_test,best_pred_smote))
print("Accuracy score", accuracy_score(ys_test, best_pred_smote))
# Confusion matrix shown as a share of all evaluated samples.
cm = confusion_matrix(ys_test,best_pred_smote)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')

# SVM
## Original data

In [None]:
# Sweep SVC kernels and gamma values on the original training split and keep
# the model with the highest accuracy on x_test.
kernels = ['rbf', 'poly', 'sigmoid']
best_svm = None
best_score = -1
# Reset here as well, so this cell never reports predictions left over from an
# earlier model's cell.
best_pred = []
all_results = []  # one row per run: [gamma, accuracy, kernel]
for kernel in kernels:
    print("Using kernel", kernel)
    for gamma in [0.0001, 0.001]: ## Bigger number makes the code take forever to run.
        print("\tUsing gamma = {}".format(gamma), end = " ")
        # probability=True is needed for predict_proba later on. Its internal
        # calibration shuffles the data, so seed it like the other models to
        # keep the probabilities reproducible between runs.
        svc = SVC(kernel=kernel, gamma=gamma, probability = True, random_state = 42)
        svc.fit(x_train, y_train)
        y_pred = svc.predict(x_test)
        score = accuracy_score(y_test, y_pred)
        all_results.append([gamma, score, kernel])
        print("score = {}".format(score))
        # Strict comparison: the earliest run wins when scores tie.
        if score > best_score:
            best_pred = y_pred
            best_score = score
            best_svm = svc
all_results = np.array(all_results, dtype=object)

In [None]:
# Summary of the best SVM found on the original data.
print("Best score using kernel={} with gamma = {}".format(best_svm.kernel, best_svm.gamma))
print(classification_report(y_test,best_pred))
print("Accuracy score", accuracy_score(y_test, best_pred))
# Confusion matrix shown as a share of all test samples.
cm = confusion_matrix(y_test,best_pred)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')

## Not using data after oversampling
It takes forever to run, and the results are really, really bad (score ~0.65).

In [None]:
# NOTE: this whole cell is disabled on purpose (SVM sweep on the oversampled data).
# Points to review before re-enabling it:
#   - `best_pred_smote = y_pred` stores `y_pred`, not the `ys_pred` computed in this loop.
#   - rows are appended as [gamma, kernel, score], so column 1 is the kernel name
#     and the score sits in column 2.
#   - it rebinds `all_results`, replacing the results of the active SVM cell.
# kernels = ['rbf', 'poly', 'sigmoid']
# best_svm_smote = None
# best_score_smote = 0
# all_results = []
# for kernel in kernels:
#     print("Using kernel", kernel)
#     for gamma in[0.0001, 0.001, 0.01, 0.1]:
#         print("\tUsing gamma = {}".format(gamma), end = " ")
#         svc =  SVC(kernel=kernel, gamma=gamma, probability = True)
#         svc.fit(xs_train, ys_train)
#         ys_pred = svc.predict(xs_test)
#         score = accuracy_score(ys_test, ys_pred)
#         all_results.append([gamma, kernel, score])
#         print("score = {}".format(score))
#         if score > best_score_smote:
#             best_pred_smote = y_pred
#             best_score_smote = score
#             best_svm_smote = svc
# all_results = np.array(all_results, dtype=object)

In [None]:
# Disabled: would display the SVM result rows in three slices (:3, 3:6, 6:).
# all_results[:3], all_results[3:6], all_results[6:]

In [None]:
# NOTE: disabled plotting cell for the SVM sweep. Points to review before re-enabling it:
#   - the first and third curves are both labelled "gamma(RBF)".
#   - the title, x label and x ticks refer to C, while column 0 is plotted on the x axis.
#   - the row slices (:3, 3:6, 6:) are hard-coded and must match the number of runs per kernel.
# plt.plot(all_results[:3][:, 0], all_results[:3][:, 1], label = "gamma(RBF)")
# plt.plot(all_results[3:6][:, 0], all_results[3:6][:, 1], label = "gamma(Poly)")
# plt.plot(all_results[6:][:, 0], all_results[6:][:, 1], label = "gamma(RBF)")
# plt.title("Scores on SVM by C value(smote only)")
# plt.ylabel("score")
# plt.xticks(range(1, 4))
# plt.xlabel("C")
# plt.legend()
# plt.show()

# Logistic Regression

## Original Data

In [None]:
# Refit logistic regression on the original training split with a growing
# iteration cap. After the loop, `lr` and `y_pred` belong to the last run
# (max_iter = 1000), which is the one the report below describes.
for iterations in range(100, 1001, 100):
    print("maximum {} iterations".format(iterations), end = " ")
    lr = LogisticRegression(random_state=42, max_iter=iterations).fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print("score", score)
print(classification_report(y_test,y_pred))
# Confusion matrix shown as a share of all test samples.
cm = confusion_matrix(y_test,y_pred)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')

## data after oversampling

In [None]:
# Same iteration-cap sweep, fitted on xs_train / ys_train. After the loop,
# `lr` and `ys_pred` belong to the last run (max_iter = 1000); this rebinds
# the `lr` fitted on the original data.
for iterations in range(100, 1001, 100):
    print("maximum {} iterations".format(iterations), end = " ")
    lr = LogisticRegression(random_state=42, max_iter=iterations).fit(xs_train, ys_train)
    ys_pred = lr.predict(xs_test)
    score = accuracy_score(ys_test, ys_pred)
    print("score", score)
# Check that labels and predictions have the same length.
print(len(ys_test), len(ys_pred))
print(classification_report(ys_test,ys_pred))
# Confusion matrix shown as a share of all evaluated samples.
cm = confusion_matrix(ys_test,ys_pred)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')

# Ensemble all models

In [None]:
# Soft-voting ensemble: average the class probabilities of the selected
# models on x_test. `lr` is whichever logistic regression was fitted last.
ensemble = [best_forest_smote, best_knn, lr, best_svm]
pred = [model.predict_proba(x_test) for model in ensemble]
probs = sum(pred)/len(ensemble)
# Class 0 needs a strictly larger mean probability; ties go to class 1.
final_pred = []
for p in probs:
    final_pred.append(0 if p[0] > p[1] else 1)

In [None]:
# Evaluate the ensemble's votes against the held-out labels.
print(classification_report(y_test,final_pred))
print(accuracy_score(y_test, final_pred))
# Confusion matrix shown as a share of all test samples.
cm = confusion_matrix(y_test,final_pred)
sns.heatmap(cm / cm.sum(), annot=True, fmt = '.2%', cmap = 'Blues')