In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Training comments together with their label columns.
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

# Test comments and the separately distributed test labels file.
test_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_label = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')

# Sample submission; its label columns are overwritten with predictions later on.
df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

In [None]:
# One target Series per label column of the training frame.
y_toxic = train["toxic"]
y_severe = train["severe_toxic"]
y_obscene = train["obscene"]
y_threat = train["threat"]
y_insult = train["insult"]
y_identity = train["identity_hate"]

In [None]:
from unidecode import unidecode
import spacy
import re

# Load the small English spaCy pipeline. The preprocessing in this notebook only
# reads each token's part-of-speech tag and lemma, so the dependency parser and
# the named-entity recognizer are disabled to avoid their per-document cost.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# spaCy's default English stop-word collection, reused by the TF-IDF vectorizers.
stop_words = nlp.Defaults.stop_words

In [None]:
def pre(text):
    """Clean a raw comment and lemmatize its verbs.

    Steps, in order:
      1. Trim leading/trailing whitespace.
      2. Remove URLs and HTML-like tags.
      3. Remove ``@...`` runs up to the next whitespace character.
      4. Pad common punctuation with spaces, then replace every character that
         is not a word character, whitespace or ``?`` with a space.
      5. Run the spaCy pipeline and emit the lemma for tokens tagged ``VERB``
         and the original text for every other token.

    Parameters
    ----------
    text : str
        The comment to clean.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    text = text.strip()
    # URLs and tags must be stripped *before* the punctuation clean-up: that
    # step replaces ':', '/', '.', '<' and '>' with spaces, after which these
    # two patterns could never match. Raw strings avoid invalid-escape warnings.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Drop "@..." runs up to (and including) the next whitespace character.
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    # Surround common punctuation with spaces so it separates from words.
    text = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', text)
    # Keep only word characters, whitespace and question marks.
    text = re.sub(r'[^\w\s\?]', ' ', text)
    # Replace the remaining separators (notably newlines) with spaces.
    text = re.sub(r'([\;\:\|•«\n])', ' ', text)
    doc = nlp(text)
    # Verbs are reduced to their lemma; all other tokens keep their surface form.
    return " ".join(
        token.lemma_ if token.pos_ == "VERB" else token.orth_
        for token in doc
    )

In [None]:
# Transliterate each comment to ASCII, then run the regex/spaCy clean-up,
# for both the training and the test frame.
for frame in (train, test_data):
    frame["comment_text"] = frame["comment_text"].apply(unidecode).apply(pre)

In [None]:
from sklearn import metrics
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [None]:
# Stand-alone unigram TF-IDF vectorizer with L2-normalized rows.
# scikit-learn's vectorizers accept only 'english', a list, or None for
# ``stop_words``; spaCy exposes its stop words as a set, so it is converted to
# a (sorted, hence deterministic) list here.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    use_idf=True,
    norm='l2',
    stop_words=sorted(stop_words),
)

# Cleaned training comments as a NumPy array of strings.
X = train.comment_text.values

vectorizer.fit(X)

In [None]:
# Unfitted template pipeline: TF-IDF features followed by an XGBoost classifier.
# ``stop_words`` is passed as a list because scikit-learn's vectorizers accept
# only 'english', a list, or None for that parameter.
# NOTE(review): TfidfVectorizer already applies TF-IDF weighting, so the
# TfidfTransformer step re-weights its output a second time. Left unchanged to
# keep the feature pipeline as written — confirm whether a plain count
# vectorizer was intended for the 'bow' step.
pipe1 = Pipeline([
    ('bow', TfidfVectorizer(ngram_range=(1, 1), stop_words=sorted(stop_words),
                            max_df=0.5, min_df=2)),
    ('tfid', TfidfTransformer()),
    ('model', XGBClassifier()),
])

# Submission column name -> training target for that label.
targets = {
    'toxic': y_toxic,
    'severe_toxic': y_severe,
    'obscene': y_obscene,
    'threat': y_threat,
    'insult': y_insult,
    'identity_hate': y_identity,
}

# Pipeline.fit returns the pipeline itself, so fitting ``pipe1`` six times in a
# row would leave every name bound to one object trained on the last label
# only. Each label therefore gets its own independent clone of the template.
fitted = {label: clone(pipe1).fit(X, y) for label, y in targets.items()}

# NOTE(review): ``predict`` yields hard 0/1 labels. If the submission is scored
# on probabilities, use ``predict_proba(...)[:, 1]`` instead — confirm against
# the competition's evaluation rules.
predictions = {
    label: model.predict(test_data.comment_text)
    for label, model in fitted.items()
}

# Keep the original per-label names available for any later cells.
pipe_toxic = fitted['toxic']
pipe_severe = fitted['severe_toxic']
pipe_obscene = fitted['obscene']
pipe_threat = fitted['threat']
pipe_insult = fitted['insult']
pipe_identity = fitted['identity_hate']

pred_toxic = predictions['toxic']
pred_severe = predictions['severe_toxic']
pred_obscene = predictions['obscene']
pred_threat = predictions['threat']
pred_insult = predictions['insult']
pred_identity = predictions['identity_hate']

# Overwrite the label columns of the sample submission with the predictions.
for label, pred in predictions.items():
    df[label] = pred

df.to_csv('./submission.csv', index=False)