In [5]:
import numpy as np
import pandas as pd
from svector import svector
from gensim.models import KeyedVectors
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [6]:
def read_from(textfile):
    """Lazily yield ``(label, words)`` pairs from a tab-separated data file.

    Every non-blank line must consist of a label and a sentence separated by
    exactly one tab character. Blank (whitespace-only) lines are skipped.

    Args:
        textfile: Path of the file to read.

    Yields:
        tuple: ``(1, words)`` when the label is ``"+"`` and ``(-1, words)``
        for any other label, where ``words`` is the sentence split on
        whitespace.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    # The context manager closes the handle as soon as the generator is
    # exhausted or closed, instead of leaving that to garbage collection.
    with open(textfile) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            label, words = stripped.split("\t")
            yield (1 if label == "+" else -1, words.split())
        
def make_vector(words):
    """Build a bag-of-words count vector for one tokenised sentence.

    Args:
        words: Iterable of tokens.

    Returns:
        svector: Vector whose ``'<bias>'`` entry is set to 1 and in which each
        token's entry has been incremented once per occurrence in ``words``.
    """
    counts = svector()
    # The bias is assigned before counting, so a literal '<bias>' token
    # would be added on top of this initial value.
    counts['<bias>'] = 1
    # NOTE(review): `+=` on an unseen key presumes svector supplies a zero
    # default for missing entries -- confirm against the svector module.
    for token in words:
        counts[token] += 1
    return counts

In [11]:
def _load_examples(textfile):
    """Read ``textfile`` and return ``(feature_dicts, labels)`` for its examples."""
    feature_dicts, labels = [], []
    for label, words in read_from(textfile):
        # Convert each vector to a plain dict before it is handed to
        # DictVectorizer.
        feature_dicts.append(dict(make_vector(words)))
        labels.append(label)
    return feature_dicts, labels


def train(trainfile, devfile, k_values=None):
    """Sweep k for a k-NN classifier and print the dev error rate for each k.

    Both files are read with ``read_from`` and turned into bag-of-words
    features with ``make_vector``. The feature space is fitted on the training
    file only; the dev file is projected onto it. For every k a fresh
    ``KNeighborsClassifier`` is fitted on the training data and scored on the
    dev data.

    Only the dev error is computed. Training-set predictions are not made
    because nothing reports them, and predicting on the full training matrix
    for every k would dominate the running time.

    Args:
        trainfile: Path of the training data file.
        devfile: Path of the development data file.
        k_values: Optional iterable of neighbour counts to evaluate.
            Defaults to the odd numbers from 1 to 99.

    Returns:
        None. The shapes of the two feature matrices and one
        ``k = ...<TAB>dev_err = ...`` line per k are printed to stdout.
    """
    if k_values is None:
        k_values = range(1, 101, 2)

    train_dicts, y_train = _load_examples(trainfile)
    dev_dicts, y_dev = _load_examples(devfile)

    # NOTE: sparse=False materialises dense matrices, which is memory-heavy
    # for large vocabularies.
    vectorizer = DictVectorizer(sparse=False)
    X_train = vectorizer.fit_transform(train_dicts)
    X_dev = vectorizer.transform(dev_dicts)
    print(X_train.shape)
    print(X_dev.shape)

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)

        dev_error_rate = 1 - accuracy_score(y_dev, knn.predict(X_dev))
        print(f"k = {k}\tdev_err = {dev_error_rate*100:.2f}")
        
        

In [12]:
# Run the k sweep; both paths are resolved relative to the current working directory.
train(trainfile='train.txt', devfile='dev.txt')

(8000, 15806)
(1000, 15806)
k = 1	dev_err = 42.80
k = 3	dev_err = 40.60
k = 5	dev_err = 42.00
k = 7	dev_err = 40.80
k = 9	dev_err = 40.30
k = 11	dev_err = 40.80
k = 13	dev_err = 41.90
k = 15	dev_err = 43.00
k = 17	dev_err = 44.10
k = 19	dev_err = 44.10
k = 21	dev_err = 43.70
k = 23	dev_err = 43.10
k = 25	dev_err = 43.40
k = 27	dev_err = 42.80
k = 29	dev_err = 44.40
k = 31	dev_err = 44.20
k = 33	dev_err = 44.40
k = 35	dev_err = 44.10
k = 37	dev_err = 43.60
k = 39	dev_err = 42.80
k = 41	dev_err = 43.00
k = 43	dev_err = 42.60
k = 45	dev_err = 43.40
k = 47	dev_err = 43.00
k = 49	dev_err = 41.90
k = 51	dev_err = 43.30
k = 53	dev_err = 42.30
k = 55	dev_err = 42.90
k = 57	dev_err = 43.40
k = 59	dev_err = 43.40
k = 61	dev_err = 43.60
k = 63	dev_err = 43.40
k = 65	dev_err = 43.30
k = 67	dev_err = 44.50
k = 69	dev_err = 44.50
k = 71	dev_err = 44.70
k = 73	dev_err = 44.20
k = 75	dev_err = 44.50
k = 77	dev_err = 45.20
k = 79	dev_err = 44.60
k = 81	dev_err = 44.30
k = 83	dev_err = 44.50
k = 85	dev_