In [20]:
from Ks import *
import pickle as pkl

# load data
# NOTE: unpickling executes code embedded in the file, so only load a
# data.pickle that comes from a trusted source.
# The context manager closes the file handle as soon as the load finishes.
with open("data.pickle", "rb") as f:
    data = pkl.load(f)

In [21]:
import numpy as np

# Build the raw design rows and the two targets from the loaded records.
# Each feature row is [bigCategory, day, goal, backers], in that order.
features = [
    [record.bigCategory, record.day, record.goal, record.backers]
    for record in data
]
# Regression target: the `pleged` attribute of each record.
labels = [record.pleged for record in data]
# Classification target: the `state` attribute of each record.
state = [record.state for record in data]

X = np.asarray(features)
y = np.asarray(labels)
y_state = np.asarray(state)

In [22]:
from sklearn.preprocessing import OneHotEncoder, normalize

# one hot encode category
# Take the raw category column from `features` instead of `X`: `X` is rebound
# to the encoded design matrix later in the notebook, so slicing `X` here
# would encode the wrong column if this cell were executed a second time.
raw_category = np.array(features)[:, [0]]  # shape (n_samples, 1)
enc = OneHotEncoder(handle_unknown='ignore')
# fit_transform returns a sparse matrix; densify it so it can be concatenated
# with the numeric columns.
category = enc.fit_transform(raw_category).toarray()

In [23]:
# max normalize numeric features
# Everything below is derived from the untouched `features` / `labels` lists
# rather than from `X` / `y`, which this notebook rebinds. That keeps the cell
# idempotent: executing it twice yields the same `d`, `y` and `goal`.
raw_features = np.array(features)
raw_pledged = np.array(labels)

# Columns 1 and 3 (day, backers) are each scaled by their own column maximum.
d = normalize(raw_features[:, [1, 3]], axis=0, norm="max")

# Pledged amounts and goals must share ONE scale so that "pledged >= goal"
# still means the same thing after normalisation. Stack them into a single
# column, scale by the joint maximum, then split them apart again.
mid = len(raw_pledged)
joint = np.concatenate((raw_pledged, raw_features[:, 2]), axis=0)
joint = normalize(joint.reshape(-1, 1), axis=0, norm="max").ravel()

y = joint[:mid]
goal = joint[mid:]

In [24]:
# Final design matrix: one-hot category columns, then the two max-normalised
# numeric columns, then the normalised goal as the last column.
X = np.column_stack((category, d, goal))

In [25]:
from sklearn.model_selection import KFold, train_test_split, StratifiedShuffleSplit

# K-fold cross validation
# Seed the shuffle so the folds are the same on every run; without a seed the
# reported metrics change between executions.
kf = KFold(n_splits=5, shuffle=True, random_state=11)

# Stratified 80/20 shuffle splits (stratified on whatever labels are passed to
# `s.split`), seeded for reproducibility.
s = StratifiedShuffleSplit(test_size=0.2, random_state=11)

In [28]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate a K-nearest-neighbour model for several K.
# For every test row the K nearest training rows are looked up and used twice:
#   * regression     - predicted pledge = mean pledged value of the neighbours
#   * classification - (a) majority vote over the neighbours' states
#                      (b) predicted pledge >= the row's own goal
# The fitted neighbour index with the best vote accuracy is pickled to disk.
best_acc = 0

# use odd K to avoid tie
for i in range(11, 20, 2):

    for j, (train_index, test_index) in enumerate(s.split(X, y_state)):
        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        y_train_state, y_test_state = y_state[train_index], y_state[test_index]

        print("Building on K={} fold={}".format(i, j))
        neigh = NearestNeighbors(n_neighbors=i)
        neigh.fit(X_train)

        print("Testing...")
        # n[r] = indices (into the training split) of the i nearest
        # neighbours of test row r; shape (n_test, i).
        n = neigh.kneighbors(X_test, i, return_distance=False)

        # Mean pledged value of each test row's neighbours.
        pledge_predict = y_train[n].mean(axis=1)

        # Success predicted from the pledge estimate: the goal is the last
        # column of the design matrix. Compared as a whole column so that the
        # notebook-level `goal` array is not overwritten by a scalar.
        state_predict_plege = pledge_predict >= X_test[:, -1]

        # Success predicted by strict majority of truthy neighbour states.
        state_predict = np.count_nonzero(y_train_state[n], axis=1) > i / 2

        RMSE = np.sqrt(np.mean(np.square(pledge_predict - y_test)))
        # Pin the label order so the matrix is always 2x2, even when a split
        # happens to contain (or predict) only one class.
        tn, fp, fn, tp = confusion_matrix(
            y_test_state, state_predict, labels=[False, True]
        ).ravel()
        acc1 = accuracy_score(y_test_state, state_predict)
        acc2 = accuracy_score(y_test_state, state_predict_plege)

        # Rows are the predicted class, columns the actual class.
        print("           Positive       Negative")
        print("Positive     {}             {}".format(tp, fp))
        print("Negative     {}             {}".format(fn, tn))

        print(classification_report(y_test_state, state_predict))
        print("Accuracy predicted by neighbor state: {}".format(acc1))
        print("Accuracy predicted by neighbor pleged: {}".format(acc2))
        print("RMSE: {}".format(RMSE))

        # The original compared an undefined name `acc` and never updated
        # `best_acc`. The neighbour-vote accuracy is used as the selection
        # metric here - NOTE(review): confirm this is the intended metric.
        if acc1 > best_acc:
            best_acc = acc1
            # save best classifier
            with open("KNN_clf.pkl", "wb") as clf:
                pkl.dump(neigh, clf)
        # NOTE(review): the two breaks stop after the first fold of the first
        # K (K=11, fold=0). Remove them to run the full sweep.
        break
    break

Building on K=11 fold=0
Testing...
           Positive       Negative
Positive     37534             4279
Negative     3512             63891
              precision    recall  f1-score   support

       False       0.95      0.94      0.94     68170
        True       0.90      0.91      0.91     41046

    accuracy                           0.93    109216
   macro avg       0.92      0.93      0.92    109216
weighted avg       0.93      0.93      0.93    109216

Accuracy predicted by neighbor state: 0.9286642982713156
Accuracy predicted by neighbor pleged: 0.8843942279519484
RMSE: 0.0005264547723505586
