In [None]:
# Standard-library, numeric-stack and scikit-learn names used by the cells below.
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Silences every warning category for the rest of the session, so library
# deprecation notices will not show up in any later cell output.
warnings.filterwarnings("ignore")
import spacy
# NOTE(review): `nlp` is not referenced by any other cell visible here, and the
# bare 'en' name is presumably a spaCy 2.x shortcut link -- confirm the model is
# still needed and resolvable in the target environment.
nlp = spacy.load('en')
# Read the raw training table; the path is relative to the notebook's working directory.
train = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
print(f"{len(train):,} total training datapoints")

In [None]:
# Display the first rows to sanity-check the columns that were loaded.
train.head()

In [None]:
# Discard the identifier column, then move the labels out into their own Series.
# Both steps mutate `train` in place, so this cell can only run once per load:
# a second run fails because 'qid' and 'target' are already gone.
train.drop(columns=['qid'], inplace=True)
all_labels = train.pop('target')

In [None]:
# Label distribution: absolute counts first, then relative frequencies.
print(all_labels.value_counts())
print(all_labels.value_counts(normalize=True))

In [None]:
# Preprocessing: precompiled patterns used by the text-cleaning step.
import re
# One or more consecutive characters that are NOT ASCII letters
# (digits, punctuation, whitespace, non-ASCII characters all qualify).
NON_CHARACTER = re.compile(r'[^A-Za-z]+')
# One or more consecutive digits.
NUMS = re.compile(r'\d+')

In [None]:
# Contraction / misspelling table: lowercase surface form -> expanded replacement.
# Keys are matched as literal substrings by the regex built from this table, so
# every key must be unique (a repeated key silently overrides the earlier entry).
mispell_dict = {"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
# "'d" is ambiguous (would / had); expanded to "would" like every other 'd entry here.
"i'd" : "I would",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
# Generic fallback for any "<word>'re" that has no dedicated entry above.
"'re": " are",
"wasn't": "was not",
"we'll": "we will",
"tryin'":"trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its compiled pattern once; the function below reuses both.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Return ``text`` with each match of ``mispellings_re`` swapped for its ``mispellings`` entry."""
    return mispellings_re.sub(lambda match: mispellings[match.group(0)], text)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; `clean_sentences` calls `l.lemmatize` on every token.
l = WordNetLemmatizer()

In [None]:
def clean_sentences(text):
    """Normalise one raw question string for bag-of-words vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. expand contractions via ``replace_typical_misspell``;
      3. replace every run of digits with the placeholder ``XXX``;
      4. collapse every run of non-letter characters into a single space;
      5. tokenise and lemmatise each token with the module-level lemmatizer ``l``.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    # Contractions must be expanded while the apostrophes are still present:
    # every key in the contraction table contains one, and the non-letter
    # pattern applied below would strip them all.
    text = replace_typical_misspell(text)
    text = NUMS.sub('XXX', text)
    text = NON_CHARACTER.sub(' ', text)
    text = ' '.join([l.lemmatize(word) for word in word_tokenize(text)])
    return text

# Clean the first column of every row and collect the results into one array.
all_texts = np.array(list(map(clean_sentences, train.values[:, 0])))
print("Done!")

In [None]:
# Hold out 20% of the cleaned questions for evaluation; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
# NOTE(review): the split is not stratified on the labels -- confirm that is
# intended given the class proportions printed from `all_labels`.
train_x, test_x, train_y, test_y = train_test_split(
    all_texts, all_labels.values, test_size=0.2, random_state=2019)
print("Test-train split done!")

In [None]:
# Full vocabulary: default CountVectorizer, fitted on the training split only.
vectorizer = CountVectorizer()
bow_train = vectorizer.fit_transform(train_x)

# Reduced vocabulary: unigrams and bigrams, document-frequency bounds, capped at 5000 features.
vectorizer2 = CountVectorizer(
    min_df=0.0001,
    max_df=0.999,
    max_features=5000,
    ngram_range=(1, 2),
)
bow_train2 = vectorizer2.fit_transform(train_x)

for train_matrix in (bow_train, bow_train2):
    print(train_matrix.shape)

# The held-out split is only transformed, never fitted on.
bow_test = vectorizer.transform(test_x)
bow_test2 = vectorizer2.transform(test_x)
print("Done creating Bag-of-Words")

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree trained on the full-vocabulary bag-of-words features.
print("Results of decision tree on full bag-of-words")
# Fixed seed (same value as the train/test split) so the fitted tree, and the
# scores printed below, are identical on every Restart & Run All.
dtc = DecisionTreeClassifier(max_depth=30, random_state=2019)
dtc.fit(bow_train, train_y)

# Scores on the data the tree was fitted on. "%1" is the share of predictions
# equal to 1 (assumes the labels are 0/1 -- verify against `all_labels`).
# ndarray.mean() gives the same ratio as sum()/len() without a Python-level loop.
train_predictions = dtc.predict(bow_train)
train_acc = accuracy_score(train_y, train_predictions)
train_f1 = f1_score(train_y, train_predictions)
print(f"Training accuracy: {train_acc:.2%}, F1: {train_f1:.4f}, %1: {train_predictions.mean():.2%}")

# Same metrics on the held-out split.
test_predictions = dtc.predict(bow_test)
test_acc = accuracy_score(test_y, test_predictions)
test_f1 = f1_score(test_y, test_predictions)
print(f"Testing accuracy:  {test_acc:.2%}, F1: {test_f1:.4f}, %1: {test_predictions.mean():.2%}")

In [None]:
# Decision tree trained on the reduced (5000-feature, uni+bigram) bag-of-words features.
print("Results of decision tree on simplified bag-of-words")
# Fixed seed (same value as the train/test split) so the fitted tree, and the
# scores printed below, are identical on every Restart & Run All.
dtc2 = DecisionTreeClassifier(max_depth=30, random_state=2019)
dtc2.fit(bow_train2, train_y)

# Scores on the data the tree was fitted on. "%1" is the share of predictions
# equal to 1 (assumes the labels are 0/1 -- verify against `all_labels`).
# ndarray.mean() gives the same ratio as sum()/len() without a Python-level loop.
train_predictions = dtc2.predict(bow_train2)
train_acc = accuracy_score(train_y, train_predictions)
train_f1 = f1_score(train_y, train_predictions)
print(f"Training accuracy: {train_acc:.2%}, F1: {train_f1:.4f}, %1: {train_predictions.mean():.2%}")

# Same metrics on the held-out split.
test_predictions = dtc2.predict(bow_test2)
test_acc = accuracy_score(test_y, test_predictions)
test_f1 = f1_score(test_y, test_predictions)
print(f"Testing accuracy:  {test_acc:.2%}, F1: {test_f1:.4f}, %1: {test_predictions.mean():.2%}")
