In [None]:
import re
import string

import numpy as np
import pandas as pd
import sklearn as sk
# Submodules are imported explicitly so that `sk.model_selection` and
# `sk.preprocessing` do not depend on being loaded as a side effect of
# other sklearn imports.
import sklearn.model_selection
import sklearn.preprocessing
from scipy import stats
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsClassifier

# Seed for the train/hold-out split so that "Restart & Run All" reproduces
# the same partition (and therefore the same reported errors).
RANDOM_STATE = 42

# Full training file; the cells below read its 'excerpt' and 'target' columns.
df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

# 70% of the rows are used to fit the model, 30% are held out for evaluation.
train_df, test_df = sk.model_selection.train_test_split(
    df, test_size=0.3, random_state=RANDOM_STATE
)

# NOTE: despite the name, valid_df is the competition's test.csv; in the cells
# shown here it is only used to build the submission file.
valid_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")


In [None]:
def cleanAllPunctuation(text):
    """Strip punctuation from ``text`` and flatten it onto one line.

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space), then each newline is replaced by a single space.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Translation table mapping each punctuation character to None (= delete).
    deletePunctuation = str.maketrans(dict.fromkeys(string.punctuation))
    withoutPunctuation = text.translate(deletePunctuation)
    # Splitting on '\n' and re-joining with ' ' swaps every newline for a space.
    return ' '.join(withoutPunctuation.split('\n'))

# Element-wise version that accepts a scalar, sequence or array of strings.
VcleanAllPunctuation = np.vectorize(cleanAllPunctuation)

In [None]:
# Element-wise len(); kept at module level for any code that relies on it.
vecLen = np.vectorize(len)

def averageWordCount(text):
    """Return the mean word length, in characters, of ``text``.

    Punctuation is removed with ``cleanAllPunctuation`` and the result is
    split on runs of whitespace, so repeated spaces do not produce empty
    "words".

    Parameters
    ----------
    text : str or 0-d numpy array holding a str
        The excerpt to measure.

    Returns
    -------
    float
        Average number of characters per word; ``0.0`` when the text
        contains no words.
    """
    # Accept the 0-d array form as well as a plain string.
    if isinstance(text, np.ndarray):
        text = text.item()

    # Plain str.split is used on purpose: np.char.split on a single string
    # yields a 0-d object array wrapping ONE list, so taking len() of its
    # elements gives the number of tokens rather than the per-word lengths.
    words = cleanAllPunctuation(text).split()
    if not words:
        return 0.0
    return float(sum(len(word) for word in words)) / len(words)

# Element-wise version applied to a whole column of excerpts.
VaverageWordCount = np.vectorize(averageWordCount)

def averageSentenceLength(text):
    """Return the average number of words per sentence in ``text``.

    Sentences are counted as the number of '.', '!' and '?' characters in
    the raw text (so an ellipsis counts as three). Words are counted after
    punctuation has been removed with ``cleanAllPunctuation`` and the text
    has been split on runs of whitespace.

    Parameters
    ----------
    text : str
        The excerpt to measure.

    Returns
    -------
    float
        Word count divided by sentence count. Text without any sentence
        terminator is treated as a single sentence instead of raising
        ``ZeroDivisionError``.
    """
    sentenceCount = sum(text.count(terminator) for terminator in '.!?')

    # len() of a plain list of words is used on purpose: np.char.split on a
    # single string returns a 0-d array whose .size is always 1, which would
    # turn this feature into 1 / sentenceCount.
    wordCount = len(cleanAllPunctuation(text).split())

    return float(wordCount) / max(sentenceCount, 1)

# Element-wise version applied to a whole column of excerpts.
VaverageSentenceLength = np.vectorize(averageSentenceLength)




In [None]:
def vectorizeText(text):
    """Turn a 1-D collection of excerpts into a two-column feature matrix.

    Parameters
    ----------
    text : array-like of str
        The excerpts, one per row of the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_excerpts, 2)``: column 0 holds the output of
        ``VaverageWordCount`` and column 1 the output of
        ``VaverageSentenceLength``.
    """
    # Each feature is reshaped into a column vector before being placed
    # side by side.
    featureColumns = [
        VaverageWordCount(text).reshape((-1, 1)),
        VaverageSentenceLength(text).reshape((-1, 1)),
    ]
    return np.concatenate(featureColumns, axis=1)

In [None]:
# Regression targets for the fitting split and the held-out split.
Y_train = train_df['target'].to_numpy()
Y_test = test_df['target'].to_numpy()

# Two-column feature matrices for each data set (see vectorizeText).
X_train = vectorizeText(train_df['excerpt'].to_numpy())
X_test = vectorizeText(test_df['excerpt'])
X_valid = vectorizeText(valid_df['excerpt'])

# The classifier below is only used as a nearest-neighbour index, so every
# training row gets the same dummy label (zeros shaped like Y_train).
fakeClasses = np.zeros_like(Y_train)

#0 mean and unit variance makes the most sense for normalizing our data
# The scaler is fitted on the training features only and then applied to all
# three matrices.
normalizer = sk.preprocessing.StandardScaler().fit(X_train)
X_train, X_test, X_valid = (
    normalizer.transform(features) for features in (X_train, X_test, X_valid)
)

# Index the scaled training features; 14 neighbours are returned per query.
neigh = KNeighborsClassifier(n_neighbors=14)
neigh.fit(X_train, fakeClasses)


def getTrainTarget(index):
    """Return the entry of the module-level ``Y_train`` at position ``index``."""
    target = Y_train[index]
    return target

def getTestTarget(index):
    """Return the entry of the module-level ``Y_test`` at position ``index``."""
    target = Y_test[index]
    return target


# Doubly-vectorized lookups: applied to an array of indices they return an
# array of the same shape holding the corresponding targets.
VVgetTrainTarget, VVgetTestTarget = (
    np.vectorize(np.vectorize(getter)) for getter in (getTrainTarget, getTestTarget)
)

def voteFromNeighbors(neighborTargets):
    """Average the neighbour targets of each query point.

    Parameters
    ----------
    neighborTargets : iterable of numpy.ndarray
        One array of neighbour target values per query point (for example
        the rows of a 2-D array).

    Returns
    -------
    list
        One value per query point: the sum of its neighbour targets divided
        by the number of neighbours.
    """
    votes = []
    for neighborTarget in neighborTargets:
        votes.append(np.sum(neighborTarget) / neighborTarget.size)
    return votes

# kneighbors(..., return_distance=True) gives (distances, indices); only the
# indices (element 1) are used, to look up the neighbours' training targets.

# --- Training split ---
# NOTE(review): the query points here are the same rows the index was fitted
# on, so each row is presumably among its own neighbours and this error is
# likely optimistic -- confirm before relying on it.
kNN = neigh.kneighbors(X_train, return_distance=True)
trainNeighborIdx = kNN[1]
train_predictions = voteFromNeighbors(VVgetTrainTarget(trainNeighborIdx))
trainMse = np.average((train_predictions - Y_train) ** 2)
print('Train MSE:' , trainMse)

# --- Held-out split ---
kNN = neigh.kneighbors(X_test, return_distance=True)
testNeighborIdx = kNN[1]
test_predictions = voteFromNeighbors(VVgetTrainTarget(testNeighborIdx))
testMse = np.average((test_predictions - Y_test) ** 2)
print('Test MSE:' , testMse)

# --- Submission set: predictions only, no targets are compared here ---
kNN = neigh.kneighbors(X_valid, return_distance=True)
validNeighborIdx = kNN[1]
valid_predictions = voteFromNeighbors(VVgetTrainTarget(validNeighborIdx))


In [None]:
# Submission table: the 'id' column of valid_df paired with the predicted target.
submission_df = pd.DataFrame(
    {
        'id': valid_df['id'],
        'target': valid_predictions,
    }
)

In [None]:
# Write the submission without the DataFrame index column.
submissionPath = '/kaggle/working/submission.csv'
submission_df.to_csv(submissionPath, index=False)