In [None]:
#imports
import pandas as pd
import spacy

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
#Loading the training set
# Read the zipped CSV and discard the 'id' column in a single expression.
df_train = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
).drop(columns=['id'])

# Raw comment text, before preprocess() is applied to it.
X_train_npp = df_train['comment_text']

# One target Series per label column.
Y_train_toxic = df_train['toxic']
Y_train_severe_toxic = df_train['severe_toxic']
Y_train_obscene = df_train['obscene']
Y_train_threat = df_train['threat']
Y_train_insult = df_train['insult']
Y_train_identity_hate = df_train['identity_hate']

In [None]:
#Loading the test set
# Read the zipped CSV and discard the 'id' column in a single expression.
df_test = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
).drop(columns=['id'])

# Raw comment text, before preprocess() is applied to it.
X_test_npp = df_test['comment_text']

In [None]:
#Preprocessing
def preprocess(conteudo, nlp=None, stop_words=None):
    """Tokenize each text and drop English stop words.

    Parameters
    ----------
    conteudo : iterable of str
        Texts to clean.
    nlp : object, optional
        Anything exposing ``make_doc(text)`` that yields tokens, e.g. an
        already loaded spaCy pipeline. When omitted,
        ``spacy.load('en_core_web_lg')`` is loaded on every call, so pass a
        loaded pipeline to reuse it across several calls.
    stop_words : iterable of str, optional
        Lower-cased words to discard. Defaults to NLTK's English stop-word
        list.

    Returns
    -------
    list of str
        One string per input text, in input order. Every kept token is
        followed by a single space, so non-empty results end with a trailing
        space and a text with no kept tokens yields ``''``.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_lg')
    if stop_words is None:
        stop_words = stopwords.words('english')
    en_stop = set(stop_words)

    conteudo_pp = []
    for item in conteudo:
        # Only the token text is read below, so run the tokenizer alone
        # (make_doc) instead of the full pipeline, whose other annotations
        # would be computed and then thrown away.
        doc = nlp.make_doc(item)
        # The stop-word check is case-insensitive; the original casing of
        # the kept tokens is preserved.
        kept = [str(token) for token in doc if str(token).lower() not in en_stop]
        conteudo_pp.append(''.join(text + ' ' for text in kept))

    return conteudo_pp

In [None]:
#Preprocessing the training texts
X_train_pp = preprocess(X_train_npp)

In [None]:
#Preprocessing the test texts
X_test_pp = preprocess(X_test_npp)

In [None]:
#Vectorizing
vectorizer = TfidfVectorizer(use_idf=True)
# fit_transform learns the vocabulary/IDF weights and builds the training
# matrix in one pass, instead of analysing the training texts twice with
# fit() followed by transform().
X_tfidf_treino = vectorizer.fit_transform(X_train_pp)
# fit() used to return the vectorizer itself; keep that name bound to the
# fitted vectorizer.
tfidf_model = vectorizer
# The test texts are only transformed, using the vocabulary learned above.
X_tfidf_test = tfidf_model.transform(X_test_pp)

In [None]:
#Creating the model
# This single instance is refit once per label in the training cells below.
clf = MultinomialNB()

In [None]:
#Training: toxic
# Refit the shared classifier on the 'toxic' labels, then score the test matrix.
predicts_toxic = clf.fit(X_tfidf_treino, Y_train_toxic).predict_proba(X_tfidf_test)

In [None]:
#Training: severe toxic
# Refit the shared classifier on the 'severe_toxic' labels, then score the test matrix.
predicts_severe_toxic = clf.fit(X_tfidf_treino, Y_train_severe_toxic).predict_proba(X_tfidf_test)

In [None]:
#Training: obscene
# Refit the shared classifier on the 'obscene' labels, then score the test matrix.
predicts_obscene = clf.fit(X_tfidf_treino, Y_train_obscene).predict_proba(X_tfidf_test)

In [None]:
#Training: threat
# Refit the shared classifier on the 'threat' labels, then score the test matrix.
predicts_threat = clf.fit(X_tfidf_treino, Y_train_threat).predict_proba(X_tfidf_test)

In [None]:
#Training: insult
# Refit the shared classifier on the 'insult' labels, then score the test matrix.
predicts_insult = clf.fit(X_tfidf_treino, Y_train_insult).predict_proba(X_tfidf_test)

In [None]:
#Training: identity hate
# Refit the shared classifier on the 'identity_hate' labels, then score the test matrix.
predicts_identity_hate = clf.fit(X_tfidf_treino, Y_train_identity_hate).predict_proba(X_tfidf_test)

In [None]:
#Extracting the probabilities
def extract_prob(predicts_prob):
    """Return the element at index 1 of every row, as a list.

    Parameters
    ----------
    predicts_prob : iterable of indexable rows
        Each row must support ``row[1]``; the training cells pass the
        output of ``predict_proba``.

    Returns
    -------
    list
        The second entry of each row, in row order.
    """
    return [row[1] for row in predicts_prob]

In [None]:
#Converting for submission
# Each predicts_* name is rebound from the raw prediction rows to a Series of
# the column-1 values rounded to one decimal place. Because the names are
# overwritten, this cell is meant to run once after the training cells.
# NOTE(review): one-decimal rounding leaves at most 11 distinct scores per
# label if the inputs are probabilities in [0, 1] — confirm this loss of
# precision is intended.
predicts_toxic = pd.Series([round(float(p), 1) for p in extract_prob(predicts_toxic)])
predicts_severe_toxic = pd.Series([round(float(p), 1) for p in extract_prob(predicts_severe_toxic)])
predicts_obscene = pd.Series([round(float(p), 1) for p in extract_prob(predicts_obscene)])
predicts_threat = pd.Series([round(float(p), 1) for p in extract_prob(predicts_threat)])
predicts_insult = pd.Series([round(float(p), 1) for p in extract_prob(predicts_insult)])
predicts_identity_hate = pd.Series([round(float(p), 1) for p in extract_prob(predicts_identity_hate)])

In [None]:
#Saving for submission
# Start from the sample submission file and overwrite its six label columns
# with the predictions, then write the result without the index.
subm_df = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
).assign(
    toxic=predicts_toxic,
    severe_toxic=predicts_severe_toxic,
    obscene=predicts_obscene,
    threat=predicts_threat,
    insult=predicts_insult,
    identity_hate=predicts_identity_hate,
)
subm_df.to_csv('submission.csv', index=False)