In [None]:
import pandas as pd
import spacy

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the training data
df_train = pd.read_csv(
    '../input/quora-insincere-questions-classification/train.csv'
)

# Show how many rows fall into each value of the `target` column
target_counts = df_train['target'].value_counts()
print(target_counts)

In [None]:
# Balance the training data: start from every row whose target is 1
insincere_mask = df_train['target'] == 1
X_train_insincere = df_train.loc[insincere_mask]

In [None]:
# Sample sincere questions: three rows with target == 0 for every row with target == 1
size = 3 * len(X_train_insincere)

X_train_aux = df_train.loc[df_train['target'] == 0]

# train_test_split is used as a seeded random sampler: the "train" part holds
# `size` rows, and the remainder lands in the *_descartar ("discard") variables,
# which are not used by the cells visible here.
(X_train_sincere, X_descartar,
 Y_train_sincere, Y_descartar) = train_test_split(
    X_train_aux['question_text'],
    X_train_aux['target'],
    train_size=size,
    random_state=42,
)

In [None]:
# Combine the insincere questions with the sampled sincere ones.
# Texts and labels are concatenated in the same order so they stay paired.
text_parts = [X_train_insincere['question_text'], X_train_sincere]
label_parts = [X_train_insincere['target'], Y_train_sincere]

X_train_npp = pd.concat(text_parts)
Y_train = pd.concat(label_parts)

In [None]:
# Load the test data
df_teste = pd.read_csv(
    '../input/quora-insincere-questions-classification/test.csv'
)

# Raw (not yet pre-processed) question text of the test set
X_test_npp = df_teste['question_text']

In [None]:
# Pre-processing
_NLP_CACHE = {}


def _load_nlp(model_name='en_core_web_lg'):
    """Return the named spaCy pipeline, loading it from disk only on first use."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def preprocess(conteudo):
    """Clean an iterable of texts for bag-of-words vectorization.

    Each text is tokenized with spaCy; punctuation tokens, number-like tokens
    and English stop words (NLTK list) are dropped, and the remaining tokens
    are lower-cased.

    Parameters
    ----------
    conteudo : iterable of str
        The raw texts, e.g. a pandas Series of questions.

    Returns
    -------
    list of str
        One string per input text, in input order. Every kept token is
        followed by a single space (so non-empty results end with a space);
        a text with no kept tokens yields ''.
    """
    nlp = _load_nlp()
    en_stop = set(stopwords.words('english'))
    conteudo_pp = []

    for item in conteudo:
        kept = []
        # make_doc runs only the tokenizer. The filters below read token text
        # and lexical flags only, so the rest of the pipeline is not needed.
        for token in nlp.make_doc(item):
            if token.is_punct or token.like_num:
                continue
            word = token.text.lower()
            if word in en_stop:
                continue
            kept.append(word)
        # Keep the "token + space" layout of the previous implementation.
        conteudo_pp.append(''.join(word + ' ' for word in kept))

    return conteudo_pp

In [None]:
# Pre-process the training questions (result: one cleaned string per question)
X_train_pp = preprocess(X_train_npp)

In [None]:
# Pre-process the test questions with the same function used for training
X_test_pp = preprocess(X_test_npp)

In [None]:
# Vectorize: learn the TF-IDF vocabulary and weights from the training text only,
# then reuse the fitted vectorizer on the test text.
vectorizer = TfidfVectorizer(use_idf=True)

# fit_transform learns and transforms in a single pass, instead of analysing
# the whole training corpus twice with fit() followed by transform().
X_tfidf_train = vectorizer.fit_transform(X_train_pp)

# fit() used to return the vectorizer itself under this name; keep it bound.
tfidf_model = vectorizer
X_tfidf_test = tfidf_model.transform(X_test_pp)

In [None]:
# Create the model: a multinomial Naive Bayes classifier with default settings
clf = MultinomialNB()

In [None]:
# Train the classifier on the TF-IDF features and labels of the training set
clf.fit(X_tfidf_train, Y_train)

In [None]:
# Predict a label for every test question
predicts = clf.predict(X_tfidf_test)

In [None]:
# Convert the predictions into the submission file
predicts = pd.Series(predicts)

subm_df = pd.read_csv('../input/quora-insincere-questions-classification/sample_submission.csv')

# Assigning a Series aligns on the index, so a length mismatch would silently
# leave NaN (or drop predictions) in the submission. Fail loudly instead.
if len(predicts) != len(subm_df):
    raise ValueError(
        'prediction count ({}) does not match submission rows ({})'.format(
            len(predicts), len(subm_df)))

# Positional assignment.
# NOTE(review): assumes sample_submission.csv lists its rows in the same order
# as test.csv - confirm.
subm_df['prediction'] = predicts.values
subm_df.to_csv('submission.csv', index=False)