In [1]:
import pandas as pd
import numpy as np
import copy
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
import sklearn
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from the working directory. Further down, column 0 is used
# as the target and the remaining columns as the features.
data = pd.read_csv('amazon.csv')

In [3]:
# Largest value found in any column of `data` (never below 0). It is used later
# to size the per-feature lookup tables with one slot per value in 0..max_value.
column_maxima = [np.max(data[name]) for name in data.columns]
max_value = max([0] + column_maxima)
print(max_value)

312153


In [4]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function now
# lives in `sklearn.model_selection` (available since 0.18). Fall back to the
# old location only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Column 0 is the target, every other column is a feature. A fixed random_state
# keeps the 70/30 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0],
                                                    test_size=0.3, random_state=241)



In [5]:
def display_auc(y_true, y_predict):
    """Print the ROC AUC of the scores and draw the corresponding ROC curve.

    y_true    -- true labels.
    y_predict -- predicted scores, passed to scikit-learn as `y_score`.
    """
    auc_value = sklearn.metrics.roc_auc_score(y_true=y_true, y_score=y_predict)
    print("AUC:", auc_value)

    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(y_true=y_true, y_score=y_predict)
    plt.plot(false_pos_rate, true_pos_rate)
    plt.show()

def get_auc(y_true, y_predict):
    """Return the ROC AUC of scores `y_predict` against labels `y_true`."""
    auc_value = sklearn.metrics.roc_auc_score(y_true=y_true, y_score=y_predict)
    return auc_value

In [None]:
# Plain indicator metric. The original note "22 min" presumably records how
# long this cell takes to run -- TODO confirm.
def indicator(x, y):
    """Return the number of positions at which vectors x and y differ.

    NOTE(review): no definition of `indicator` is visible elsewhere in this
    notebook, so the cell raised NameError on a fresh kernel. This definition
    is inferred from the `(x != y)` term of `smoothed_indicator`; confirm it
    matches the metric that was originally intended.
    """
    return np.sum(x != y)


neigh_with_indicator = sklearn.neighbors.KNeighborsClassifier(n_neighbors=10, algorithm='brute', metric=indicator)
neigh_with_indicator.fit(X_train, y_train)
y_with_indicator = neigh_with_indicator.predict_proba(X_test)

# Keep column 1 of predict_proba as the score used for the ROC curve.
y_with_indicator = y_with_indicator[:, 1]

display_auc(y_true = y_test, y_predict = y_with_indicator)

In [6]:
def unique_with_count(input_array):
    """Return the sorted distinct values of `input_array` and their counts.

    NaN entries are ignored. The input is not modified.

    input_array -- array-like (list, ndarray, pandas Series, ...).

    Returns a tuple (values, counts): `values` holds the distinct non-NaN
    entries in ascending order with the input dtype, and `counts` is an int
    array of the same length with the number of occurrences of each value.
    Empty input yields two empty arrays.
    """
    values = np.asarray(input_array)
    # NaN is the only value that is not equal to itself, so this drops NaNs
    # (and, as a side effect of boolean masking, works on a fresh copy).
    values = values[values == values]
    unique_values, counts = np.unique(values, return_counts=True)
    return (unique_values, np.asarray(counts, dtype=int))


# Per-feature lookup tables, indexed as table[feature_index, category_value].
# Values that never occur in the training column keep the initial 0.
#   f   -- number of training rows holding the value
#   p   -- f / n_rows
#   p_2 -- sum of count*(count-1) / (n_rows*(n_rows-1)) over all values of the
#          same feature whose p is strictly smaller than this value's p
# NOTE(review): the comparison is strict (`<`), as in the original loop, so a
# value's own term is not included -- confirm this is the intended definition.
n_rows, n_features = X_train.shape
table_shape = (n_features, max_value + 1)
p_2 = np.zeros(table_shape)
p = np.zeros(table_shape)
f = np.zeros(table_shape)
for col in range(n_features):
    (unique_value, count) = unique_with_count(X_train.iloc[:, col])
    cur_p = (count + 0.0) / n_rows
    cur_p_2 = count * (count - 1.0) / n_rows / (n_rows - 1)

    # Sort once and use prefix sums instead of re-scanning every value for
    # every value: prefix_p_2[k] is the sum of the k smallest-p terms, and
    # searchsorted(..., side='left') gives, for each value, how many values
    # have a strictly smaller p.
    order = np.argsort(cur_p)
    sorted_p = cur_p[order]
    prefix_p_2 = np.concatenate(([0.0], np.cumsum(cur_p_2[order])))
    sum_p_2 = prefix_p_2[np.searchsorted(sorted_p, cur_p, side='left')]

    f[col][unique_value] = count
    p[col][unique_value] = cur_p
    p_2[col][unique_value] = sum_p_2

# log(f + 1), so unseen values (f == 0) map to 0.
log_f = np.log(f + 1)


In [7]:
def smoothed_indicator(x, y, **kwargs):
    """Smoothed indicator distance between two categorical feature vectors.

    Each feature j contributes 1 when x[j] != y[j], and p_2[j, x[j]] when the
    two values are equal. The contributions are summed over all features.

    x, y   -- 1-D vectors of equal length holding category codes; the codes
              are cast to int and used as column indices into the table.
    kwargs -- must contain 'p_2': a 2-D array indexed [feature, value].

    Returns the distance as a scalar.
    """
    buf = kwargs['p_2']
    x = np.asarray(x)
    y = np.asarray(y)
    # Pair every feature index with that feature's own value. Indexing with
    # buf[:, x] would instead build an (n_features x n_features) matrix that
    # mixes every feature's table into every position.
    feature_idx = np.arange(x.shape[0])
    match_cost = buf[feature_idx, np.asarray(x, int)]
    return np.sum(np.where(x != y, 1.0, match_cost))


In [None]:
# k-NN (k=10, brute force) with the smoothed indicator distance; the p_2 table
# is handed to the metric through metric_params.
neigh_with_smoothed_indicator = KNeighborsClassifier(
    n_neighbors=10,
    algorithm='brute',
    metric=smoothed_indicator,
    metric_params={'p_2': p_2},
)
neigh_with_smoothed_indicator.fit(X_train, y_train)

# Score the test rows (cast to int) and keep column 1 of predict_proba.
y_with_smoothed_indicator = neigh_with_smoothed_indicator.predict_proba(np.asarray(X_test, int))[:, 1]

display_auc(y_true=y_test, y_predict=y_with_smoothed_indicator)

In [8]:
def log_indicator(x, y, **kwargs):
    """Log-frequency weighted mismatch distance between two categorical vectors.

    Each feature j contributes log_f[j, x[j]] * log_f[j, y[j]] when
    x[j] != y[j], and 0 when the values are equal. The contributions are
    summed over all features.

    x, y   -- 1-D vectors of equal length holding category codes; the codes
              are cast to int and used as column indices into the table.
    kwargs -- must contain 'log_f': a 2-D array indexed [feature, value].

    Returns the distance as a scalar.
    """
    buf = kwargs['log_f']
    x = np.asarray(x)
    y = np.asarray(y)
    # Look up each feature's weight in that feature's own row. Indexing with
    # buf[:, x] would instead build an (n_features x n_features) matrix that
    # mixes every feature's table into every position.
    feature_idx = np.arange(x.shape[0])
    weight_x = buf[feature_idx, np.asarray(x, int)]
    weight_y = buf[feature_idx, np.asarray(y, int)]
    return np.sum((x != y) * weight_x * weight_y)

In [None]:
# k-NN (k=10, brute force) with the log-frequency weighted distance; the log_f
# table is handed to the metric through metric_params.
neigh_with_log_indicator = KNeighborsClassifier(
    n_neighbors=10,
    algorithm='brute',
    metric=log_indicator,
    metric_params={'log_f': log_f},
)
neigh_with_log_indicator.fit(X_train, y_train)

# Score the test rows (cast to int) and keep column 1 of predict_proba.
y_with_log_indicator = neigh_with_log_indicator.predict_proba(np.asarray(X_test, int))[:, 1]

display_auc(y_true=y_test, y_predict=y_with_log_indicator)

In [9]:
def find_optimal_k(X_train, y_train, X_test, y_test, MAXK, metric, metric_params = None):
    """Find the neighbour count k that gives the highest test ROC AUC.

    A single brute-force neighbour search is run, and the score for every k is
    derived from prefixes of each test row's neighbour list. The score for a
    given k is the share of label-1 objects among the k nearest training
    objects.

    X_train, y_train -- training features and labels; labels are assumed to be
                        0/1 (the original code used them as indices into a
                        two-bin histogram).
    X_test, y_test   -- evaluation features and labels.
    MAXK             -- upper bound for k. NOTE(review): as in the original
                        code, k = 1 .. MAXK + 1 is evaluated, i.e. one more
                        than requested -- confirm whether that is intended.
    metric           -- metric passed to KNeighborsClassifier.
    metric_params    -- optional dict of keyword arguments for the metric.

    Returns (max_auc, max_k). If no k reaches an AUC above 0.0 the result is
    (0.0, 0). Ties keep the smallest k.
    """
    n_neighbors = MAXK + 1
    classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                                        algorithm='brute',
                                                        metric=metric,
                                                        metric_params=metric_params)
    classifier.fit(X_train, y_train)
    # Indices into the training set, one row per test object; kneighbors
    # returns the neighbours of each row in order of increasing distance.
    neighbor_idx = classifier.kneighbors(X_test, return_distance=False)

    # Positional label lookup for all neighbours at once (works for a pandas
    # Series as well as for a plain array), then running counts along each row.
    train_labels = np.asarray(y_train)
    neighbor_is_positive = train_labels[neighbor_idx] == 1
    positive_counts = np.cumsum(neighbor_is_positive, axis=1)
    # Column k-1 holds the positive share among the k nearest neighbours.
    positive_share = positive_counts / np.arange(1.0, n_neighbors + 1)

    max_auc = 0.0
    max_k = 0
    for k in range(1, n_neighbors + 1):
        cur_auc = get_auc(y_true=y_test, y_predict=positive_share[:, k - 1])
        if cur_auc > max_auc:
            max_auc = cur_auc
            max_k = k

    return (max_auc, max_k)

In [None]:
# Best (AUC, k) for the log-frequency weighted metric, called with MAXK=15.
log_best_auc, log_best_k = find_optimal_k(X_train, y_train, X_test, y_test, 15,
                                          log_indicator, metric_params={'log_f': log_f})
print( (log_best_auc, log_best_k) )

In [None]:
# Best (AUC, k) for the smoothed indicator metric, called with MAXK=15.
smoothed_best_auc, smoothed_best_k = find_optimal_k(X_train, y_train, X_test, y_test, 15,
                                                    smoothed_indicator, metric_params={'p_2': p_2})
print( (smoothed_best_auc, smoothed_best_k) )

In [None]:
# Best (AUC, k) for the plain indicator metric, called with MAXK=15.
# Requires `indicator` (the metric used by the first k-NN cell) to be defined
# in the kernel before this cell runs.
indicator_best_auc, indicator_best_k = find_optimal_k(X_train, y_train, X_test, y_test, 15,
                                                      indicator)
print( (indicator_best_auc, indicator_best_k) )