In [None]:
import pandas as pd
import sklearn.feature_extraction.text as ft
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
import re

# One-time setup: uncomment if the NLTK stopwords corpus is not installed yet.
# nltk.download('stopwords')


def clean(sen: str) -> str:
    """Normalize a raw comment into lowercase ASCII text with light punctuation.

    Steps, in order: lowercase; drop everything from a URL scheme to the end
    of that line; decode the ``&amp;``, ``&lt;`` and ``&gt;`` HTML entities;
    remove ``@mentions``; discard non-ASCII characters; strip punctuation
    (only ``!``, ``?``, ``.`` and square brackets survive); collapse runs of
    ``!``, ``?`` and ``.`` to a single mark; collapse whitespace.

    Args:
        sen: The raw text to clean.

    Returns:
        The cleaned text, with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    text = sen.lower()

    # Remove a URL together with the remainder of its line. The second
    # pattern additionally catches the truncated scheme "htt://".
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'http?:\/\/.*[\r\n]*', '', text)

    # "&amp;?" is a regex (optional trailing semicolon), so it needs re.sub;
    # str.replace would look for the literal characters "&amp;?" and never match.
    text = re.sub(r'&amp;?', 'and', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')

    # Strip @mentions.
    text = re.sub(r"(?:\@)\w+", '', text)

    # Drop every character that cannot be represented in ASCII.
    text = text.encode("ascii", errors="ignore").decode()

    # The hyphen is escaped on purpose: an unescaped ",-/" is a range that
    # also covers ".", which would erase sentence-ending periods before the
    # collapse step below ever sees them.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse repeated sentence punctuation ("!!!" -> "!", "..." -> ".").
    text = re.sub(r'[!]+', '!', text)
    text = re.sub(r'[?]+', '?', text)
    text = re.sub(r'[.]+', '.', text)

    # Remove apostrophes and parentheses.
    text = re.sub(r"['()]", "", text)

    # Collapse every whitespace run to one space and trim both ends.
    return " ".join(text.split())


# Train a multi-label k-NN classifier on TF-IDF features of the cleaned
# comments, then write per-category probabilities to submission.csv.
train_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

categories = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# scikit-learn validates `stop_words` as 'english', a list, or None; recent
# releases reject a set, so pass the NLTK word list as a list.
stop_words = list(stopwords.words('english'))
vectorizer = ft.TfidfVectorizer(stop_words=stop_words)

X = train_df['comment_text'].apply(clean)
y = train_df[categories]

# The vectorizer is fitted on all training comments before the split.
# NOTE(review): the 20% hold-out (x_test / y_test) is never evaluated below.
x_train, x_test, y_train, y_test = train_test_split(
    vectorizer.fit_transform(X), y, test_size=0.2, random_state=42
)

KNN = KNeighborsClassifier(n_neighbors=8)
KNN.fit(x_train, y_train)

test_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

# Reuse the vocabulary and IDF weights learned from the training comments.
X = vectorizer.transform(test_df['comment_text'].apply(clean))
# With several target columns, predict_proba returns one probability array
# per target, in the same order as `categories`.
y = KNN.predict_proba(X)

df_submit = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

# Column 1 is taken as the positive-class probability; this assumes every
# label column contains both 0 and 1 in the training split.
# NOTE(review): also assumes sample_submission rows are in test.csv order.
for cat, proba in zip(categories, y):
    df_submit[cat] = proba[:, 1]

df_submit.to_csv('submission.csv',index=False)