In [None]:
import numpy as np 
import pandas as pd
import os
# Print the full path of every file found beneath the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

In [None]:
# Read the three competition CSVs in one pass:
#   train -> training split, test -> test split,
#   df    -> the sample submission file, reused later as the output frame.
train, test, df = map(
    pd.read_csv,
    (
        '/kaggle/input/quora-insincere-questions-classification/train.csv',
        '/kaggle/input/quora-insincere-questions-classification/test.csv',
        '/kaggle/input/quora-insincere-questions-classification/sample_submission.csv',
    ),
)

In [None]:
from unidecode import unidecode
import spacy
import re

# The code visible in this notebook only reads `pos_`, `lemma_` and `orth_`
# from parsed documents, none of which come from the dependency parser or the
# named-entity recogniser. Disabling those two components leaves the tags and
# lemmas unchanged while avoiding their cost on every call to `nlp`.
# NOTE(review): drop the `disable` list if other code needs `doc.ents`,
# `doc.sents` or dependency arcs.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# spaCy's default English stop-word collection, shared with the vectorizers.
stop_words = nlp.Defaults.stop_words

In [None]:
# Patterns are compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')    # needs a letter after '<' so "3 < 5 > 2" survives
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_MENTION_RE = re.compile(r'@\S*')                # also catches a mention at the very end of the text
_NON_WORD_RE = re.compile(r'[^\w\s?]')           # everything except word chars, whitespace and '?'
_DIGIT_RUN_RE = re.compile(r'[0-9]{2,}')
_MAX_MASK_LEN = 5                                # runs of 5+ digits all collapse to '#####'


def _clean_text(text):
    """Lower-case `text` and apply the regex clean-up; returns single-spaced text."""
    text = text.strip().lower()

    # Markup, URLs and mentions must be removed while their delimiters
    # ('<', '>', ':', '/', '.', '@') are still present; once punctuation has
    # been stripped these patterns can no longer match anything.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)

    # Drop all punctuation except '?', which is kept as a separate token.
    text = _NON_WORD_RE.sub(' ', text)
    text = text.replace('?', ' ? ')

    # Mask digit runs only after punctuation removal, otherwise the '#'
    # placeholders themselves would be stripped as punctuation.
    text = _DIGIT_RUN_RE.sub(
        lambda m: '#' * min(len(m.group()), _MAX_MASK_LEN), text
    )

    # NOTE: no Roman-numeral step here. A whole-string, upper-case-only numeral
    # pattern cannot match text that has already been lower-cased, so such a
    # step would be a no-op; matching lower-case tokens instead would delete
    # ordinary words such as "i" or "mix".

    # Collapse every whitespace run (including newlines) to one space.
    return ' '.join(text.split())


def pre(text):
    """Clean a question string and lemmatise its verbs.

    The text is lower-cased; HTML-like tags, URLs and @mentions are removed;
    punctuation other than '?' is dropped; runs of two or more digits are
    replaced by the same number of '#' characters (capped at five). The
    cleaned text is then passed through the module-level `nlp` pipeline:
    tokens tagged ``VERB`` are replaced by their lemma, every other token is
    kept verbatim.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" for empty input).
        How the '#' placeholders are split into tokens is decided by `nlp`.
    """
    doc = nlp(_clean_text(text))
    return " ".join(
        tok.lemma_ if tok.pos_ == "VERB" else tok.orth_ for tok in doc
    )

In [None]:
# Run the same two passes over both splits, train first: `unidecode` on every
# question, then the `pre` clean-up on the result. The column is overwritten
# in place.
for frame in (train, test):
    frame["question_text"] = frame.question_text.apply(unidecode)
    frame["question_text"] = frame.question_text.apply(pre)

In [None]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [None]:
# Standalone unigram TF-IDF vectorizer fitted on the training questions.
# `stop_words` is converted to a list because scikit-learn's parameter
# validation accepts only 'english', a list or None there; passing a set is
# rejected by recent releases when the estimator is fitted.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    use_idf=True,
    norm='l2',
    stop_words=sorted(stop_words),
)

# Training inputs (pre-processed question strings) and their labels.
X = train.question_text.values
y = train.target.values

vectorizer.fit(X)

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> gradient-boosted trees.
# The first step must produce raw counts: TfidfVectorizer already includes the
# TF-IDF transform, so following it with TfidfTransformer would apply the IDF
# weighting and normalisation a second time.
# `stop_words` is passed as a list because scikit-learn's parameter validation
# accepts only 'english', a list or None.
pipe1 = Pipeline([
    ('bow', CountVectorizer(ngram_range=(1, 1),
                            stop_words=sorted(stop_words),
                            max_df=0.5,
                            min_df=2)),
    ('tfid', TfidfTransformer()),
    ('model', XGBClassifier()),
])

pipe1.fit(X, y)

prediction = pipe1.predict(test.question_text)

# Write the predictions into a real column. Attribute assignment
# (`df.target = ...`) only updates a column that already exists; otherwise
# pandas sets a plain Python attribute and the saved CSV keeps the sample
# file's placeholder values.
# NOTE(review): falls back to the last column of the sample submission when
# there is no 'target' column - confirm that is the competition's label column
# and that its rows are in the same order as `test`.
label_col = 'target' if 'target' in df.columns else df.columns[-1]
df[label_col] = prediction

df.to_csv('./submission.csv', index=False)