# The objective of this project is to classify the data into Sincere and Insincere questions
## Feature Extraction Data Techniques Used
### Count Vectorizer with Logistic Regression and Naive Bayes
### Tfidf Vectorizer with Logistic Regression and Naive Bayes
### HashingVectorizer with Logistic Regression


## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import string

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette

%matplotlib inline

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import warnings
warnings.filterwarnings("ignore")

## Data Overview

### Available input files

In [None]:
ls ../input/quora-insincere-questions-classification

### Unzipping the embeddings

In [None]:
from zipfile import ZipFile 
!unzip ../input/quora-insincere-questions-classification/embeddings

In [None]:
!ls /kaggle/working/

### Loading the data into training data and test data

In [None]:
# Kaggle input locations of the competition's CSV files.
train_path = "../input/quora-insincere-questions-classification/train.csv"
test_path = "../input/quora-insincere-questions-classification/test.csv"

# Read both splits into DataFrames, train first and test second.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

### Get the rows and columns of the training and test dataset

In [None]:
# Unpack (rows, columns) once per split, then report them.
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape

print(f"There are {train_rows} Rows and {train_cols} Columns inside train data")
print(f"There are {train_rows} questions in total in the training dataset")
print(f"There are {test_rows} Rows and {test_cols} Columns inside test data")
print(f"There are {test_rows} questions in total in the test dataset")

### Info about the training data and test data

In [None]:
train_data.info()

In [None]:
test_data.info()

### Total number of sincere (0) and Insincere Questions (1)

In [None]:
# Tally how many training rows carry each value of the 'target' column
# (0 = sincere, 1 = insincere) and print the counts.
target_count = train_data['target'].value_counts()
print(target_count)

### Bar chart to plot the target count 

In [None]:
# Rows per target label (recomputed here so the cell can run on its own).
target_count = train_data['target'].value_counts()

# One bar per label; each bar is coloured by its own height.
barchart_data = go.Bar(
    x=target_count.index,
    y=target_count.values,
    marker={
        'color': target_count.values,
        'colorscale': 'Picnic',
        'reversescale': True,
    },
)

layout = go.Layout(title='Target Count', font={'size': 18})

data = [barchart_data]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="TargetCount")



### Pie chart for target distribution

In [None]:
# Percentage share of each target label, drawn as a pie chart.
labels = np.array(target_count.index)
sizes = np.array((target_count / target_count.sum()) * 100)

layout = go.Layout(
    title='Target distribution',
    font={'size': 18},
    width=600,
    height=600,
)
piechart_trace = go.Pie(labels=labels, values=sizes)
data = [piechart_trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="target_distribution")

### Word Cloud before Data Preprocessing

In [None]:
# Word Cloud for all training questions
from wordcloud import WordCloud, STOPWORDS

# custom function for plotting the word cloud
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0,16.0), 
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud for ``text`` on a new matplotlib figure.

    Args:
        text: A single string, or an iterable of strings (for example a
            pandas Series of questions). Every element of an iterable
            contributes to the cloud.
        mask: Forwarded to ``WordCloud(mask=...)``.
        max_words: Forwarded to ``WordCloud(max_words=...)``.
        max_font_size: Forwarded to ``WordCloud(max_font_size=...)``.
        figure_size: ``figsize`` of the matplotlib figure that is created.
        title: Title drawn above the cloud.
        title_size: Font size of the title.
        image_color: Not used by this function; kept so existing calls that
            pass it keep working.
    """
    stopwords = set(STOPWORDS)
    more_stopwords = {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}
    stopwords = stopwords.union(more_stopwords)

    # str() of a pandas Series is its *display* representation: long series
    # are abbreviated to a handful of rows and the index labels and the
    # Name/dtype footer are mixed in. Join the actual values instead so the
    # cloud reflects the whole corpus.
    if isinstance(text, str):
        corpus = text
    else:
        try:
            corpus = " ".join(str(item) for item in text)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            corpus = str(text)

    wordcloud = WordCloud(background_color='black',
                    stopwords = stopwords,
                    max_words = max_words,
                    max_font_size = max_font_size, 
                    random_state = 42,
                    width=800, 
                    height=400,
                    mask = mask)
    wordcloud.generate(corpus)

    plt.figure(figsize=figure_size)

    plt.imshow(wordcloud)
    plt.title(title, fontdict={'size': title_size, 'color': 'black', 
                                  'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
    
plot_wordcloud(train_data["question_text"], title="Word Cloud of Questions")

In [None]:
# Word cloud for sincere questions Before Data Preprocessing
plot_wordcloud(train_data[train_data["target"] == 0]["question_text"], title="Word Cloud of Sincere Questions")

In [None]:
# Word cloud for insincere questions before  Data Preprocessing
plot_wordcloud(train_data[train_data["target"] == 1]["question_text"], title="Word Cloud of Insincere Questions")

### Horizontal bar charts of the most frequent words in both classes


In [None]:
from collections import defaultdict
# Separate the sincere questions from training dataset
train_sincere_data = train_data[train_data["target"] == 0]
train_insincere_data = train_data[train_data["target"] == 1]

# Next step is generating barchart for both the classes
def generate_ngrams(text, n_gram=1):
    """Return the n-grams of ``text`` as space-joined strings.

    The text is lower-cased and split on single spaces; empty pieces and
    pieces found in ``STOPWORDS`` are dropped before the n-grams are formed.

    Args:
        text: The string to process.
        n_gram: Number of consecutive tokens per n-gram.

    Returns:
        A list of strings, one per n-gram, in order of appearance.
    """
    words = []
    for piece in text.lower().split(" "):
        if piece == "" or piece in STOPWORDS:
            continue
        words.append(piece)

    # Zipping the token list against its own shifted copies yields the
    # sliding windows of length n_gram.
    shifted = [words[offset:] for offset in range(n_gram)]
    return [" ".join(window) for window in zip(*shifted)]

# custom function for horizontal bar chart
def horizontal_bar_chart(df, color):
    """Build a horizontal plotly bar trace from a word-frequency table.

    Args:
        df: DataFrame with a "word" column (bar labels) and a "wordcount"
            column (bar lengths).
        color: Marker colour applied to every bar.

    Returns:
        A ``go.Bar`` trace without a legend entry.
    """
    # Both columns are handed to plotly in reverse row order.
    bar_labels = df["word"].values[::-1]
    bar_lengths = df["wordcount"].values[::-1]
    return go.Bar(
        x=bar_lengths,
        y=bar_labels,
        orientation='h',
        showlegend=False,
        marker={'color': color},
    )

# Unigram counts over all sincere questions
sincere_dict = defaultdict(int)
for question in train_sincere_data["question_text"]:
    for gram in generate_ngrams(question):
        sincere_dict[gram] += 1

# Most frequent first: sort ascending by count, then reverse.
sincere_ranked = sorted(sincere_dict.items(), key=lambda pair: pair[1])[::-1]
sincere_dict_sorted = pd.DataFrame(sincere_ranked)
sincere_dict_sorted.columns = ["word", "wordcount"]
trace_sincere = horizontal_bar_chart(sincere_dict_sorted.head(50), 'red')

# Unigram counts over all insincere questions
insincere_dict = defaultdict(int)
for question in train_insincere_data["question_text"]:
    for gram in generate_ngrams(question):
        insincere_dict[gram] += 1

insincere_ranked = sorted(insincere_dict.items(), key=lambda pair: pair[1])[::-1]
insincere_dict_sorted = pd.DataFrame(insincere_ranked)
insincere_dict_sorted.columns = ["word", "wordcount"]
trace_insincere = horizontal_bar_chart(insincere_dict_sorted.head(50), 'red')

# Side-by-side panels: sincere on the left, insincere on the right
fig = subplots.make_subplots(
    rows=1,
    cols=2,
    vertical_spacing=0.04,
    subplot_titles=[
        "Frequent words of sincere questions",
        "Frequent words of insincere questions",
    ],
)
for panel_col, panel_trace in enumerate((trace_sincere, trace_insincere), start=1):
    fig.append_trace(panel_trace, 1, panel_col)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
py.iplot(fig, filename='word-plots')

### Bigram Bar chart plot

In [None]:
# Count every bigram that occurs in the sincere questions
sincere_dict = defaultdict(int)
for text in train_sincere_data["question_text"]:
    for word in generate_ngrams(text, 2):
        sincere_dict[word] += 1

# Most frequent first: sort ascending by count, then reverse
sincere_dict_sorted = pd.DataFrame(sorted(sincere_dict.items(), key=lambda x: x[1])[::-1])
sincere_dict_sorted.columns = ["word", "wordcount"]
trace_sincere = horizontal_bar_chart(sincere_dict_sorted.head(50), 'blue')

# Count every bigram that occurs in the insincere questions
insincere_dict = defaultdict(int)
for text in train_insincere_data["question_text"]:
    for word in generate_ngrams(text, 2):
        insincere_dict[word] += 1
insincere_dict_sorted = pd.DataFrame(sorted(insincere_dict.items(), key=lambda x: x[1])[::-1])
insincere_dict_sorted.columns = ["word", "wordcount"]
trace_insincere = horizontal_bar_chart(insincere_dict_sorted.head(50), 'blue')

# Two panels side by side; the titles name bigrams because that is what
# generate_ngrams(text, 2) produces.
fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Frequent bigrams of sincere questions", 
                                          "Frequent bigrams of insincere questions"])
fig.append_trace(trace_sincere, 1, 1)
fig.append_trace(trace_insincere, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Bigram Count Plots")
py.iplot(fig, filename='word-plots')

### Trigram

In [None]:
# Count every trigram that occurs in the sincere questions
sincere_dict = defaultdict(int)
for text in train_sincere_data["question_text"]:
    for word in generate_ngrams(text, 3):
        sincere_dict[word] += 1

# Most frequent first: sort ascending by count, then reverse
sincere_dict_sorted = pd.DataFrame(sorted(sincere_dict.items(), key=lambda x: x[1])[::-1])
sincere_dict_sorted.columns = ["word", "wordcount"]
trace_sincere = horizontal_bar_chart(sincere_dict_sorted.head(50), 'green')

# Count every trigram that occurs in the insincere questions
insincere_dict = defaultdict(int)
for text in train_insincere_data["question_text"]:
    for word in generate_ngrams(text, 3):
        insincere_dict[word] += 1
insincere_dict_sorted = pd.DataFrame(sorted(insincere_dict.items(), key=lambda x: x[1])[::-1])
insincere_dict_sorted.columns = ["word", "wordcount"]
trace_insincere = horizontal_bar_chart(insincere_dict_sorted.head(50), 'green')

# Two panels side by side; the titles name trigrams because that is what
# generate_ngrams(text, 3) produces.
fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.2,
                          subplot_titles=["Frequent trigrams of sincere questions", 
                                          "Frequent trigrams of insincere questions"])
fig.append_trace(trace_sincere, 1, 1)
fig.append_trace(trace_insincere, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Trigram Count Plots")
py.iplot(fig, filename='word-plots')

## Feature Engineering
### Features that are added
1. Number of words in the text
2. Number of unique words in the text
3. Number of characters in the text
4. Number of stopwords
5. Number of punctuations
6. Number of upper case words
7. Number of title case words
8. Average length of the words

In [None]:
# (new column, per-question function) pairs, listed in the order the columns
# are added. Every function coerces its input with str() before measuring it.
text_features = [
    # Number of words in the text
    ("num_words", lambda q: len(str(q).split())),
    # Number of unique words in the text
    ("num_unique_words", lambda q: len(set(str(q).split()))),
    # Number of characters in the text
    ("num_chars", lambda q: len(str(q))),
    # Number of stopwords in the text
    ("num_stopwords", lambda q: len([w for w in str(q).lower().split() if w in STOPWORDS])),
    # Number of punctuation characters in the text
    ("num_punctuations", lambda q: len([c for c in str(q) if c in string.punctuation])),
    # Number of upper case words in the text
    ("num_words_upper", lambda q: len([w for w in str(q).split() if w.isupper()])),
    # Number of title case words in the text
    ("num_words_title", lambda q: len([w for w in str(q).split() if w.istitle()])),
    # Average length of the words in the text
    ("mean_word_len", lambda q: np.mean([len(w) for w in str(q).split()])),
]

# Add each feature to the training split, then to the test split.
for feature_name, compute in text_features:
    train_data[feature_name] = train_data["question_text"].apply(compute)
    test_data[feature_name] = test_data["question_text"].apply(compute)

### Box plot for truncated features

In [None]:
## Truncate some extreme values for better visuals ##
# Assign the clipped column back to the frame. The previous form,
# train_data[col].loc[mask] = value, is chained assignment: whether it
# reaches train_data depends on pandas returning a view, and it does nothing
# under copy-on-write.
# NOTE: as before, this overwrites the stored feature values in train_data.
train_data['num_words'] = train_data['num_words'].clip(upper=60)
train_data['num_punctuations'] = train_data['num_punctuations'].clip(upper=10)
train_data['num_chars'] = train_data['num_chars'].clip(upper=350)

# One box plot per feature, split by target class, stacked vertically.
f, axes = plt.subplots(3, 1, figsize=(10,20))
sns.boxplot(x='target', y='num_words', data=train_data, ax=axes[0])
axes[0].set_xlabel('Target', fontsize=12)
axes[0].set_title("Number of words in each class", fontsize=15)

sns.boxplot(x='target', y='num_chars', data=train_data, ax=axes[1])
axes[1].set_xlabel('Target', fontsize=12)
axes[1].set_title("Number of characters in each class", fontsize=15)

sns.boxplot(x='target', y='num_punctuations', data=train_data, ax=axes[2])
axes[2].set_xlabel('Target', fontsize=12)
axes[2].set_title("Number of punctuations in each class", fontsize=15)
plt.show()

In [None]:
# Showing that the features are added
train_data.head()

In [None]:
# Information about the training data after feature engineering
train_data.info()

## Data Preprocessing
### Agenda
1. Converting questions to lower case
2. Removing the punctuation marks
3. Cleaning numbers
4. Correcting misspelled words
5. Expanding contractions
6. Removing stop words

### Removing the punctuation marks

In [None]:
# Removing punctuations
punctuation_list =[',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', 
        '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 
        '█', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '¬', '░', '¡', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
        '—', '‹', '─', '▒', '：', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '⋅', '‘', '∞', 
        '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '・', '╦', '╣', '╔', '╗', '▬', '❤', '≤', '‡', '√', '◄', '━', 
        '⇒', '▶', '≥', '╝', '♡', '◊', '。', '✈', '≡', '☺', '✔', '↵', '≈', '✓', '♣', '☎', '℃', '◦', '└', '‟', '～', '！', '○', 
        '◆', '№', '♠', '▌', '✿', '▸', '⁄', '□', '❖', '✦', '．', '÷', '｜', '┃', '／', '￥', '╠', '↩', '✭', '▐', '☼', '☻', '┐', 
        '├', '«', '∼', '┌', '℉', '☮', '฿', '≦', '♬', '✧', '〉', '－', '⌂', '✖', '･', '◕', '※', '‖', '◀', '‰', '\x97', '↺', 
        '∆', '┘', '┬', '╬', '،', '⌘', '⊂', '＞', '〈', '⎙', '？', '☠', '⇐', '▫', '∗', '∈', '≠', '♀', '♔', '˚', '℗', '┗', '＊', 
        '┼', '❀', '＆', '∩', '♂', '‿', '∑', '‣', '➜', '┛', '⇓', '☯', '⊖', '☀', '┳', '；', '∇', '⇑', '✰', '◇', '♯', '☞', '´', 
        '↔', '┏', '｡', '◘', '∂', '✌', '♭', '┣', '┴', '┓', '✨', '\xa0', '˜', '❥', '┫', '℠', '✒', '［', '∫', '\x93', '≧', '］', 
        '\x94', '∀', '♛', '\x96', '∨', '◎', '↻', '⇩', '＜', '≫', '✩', '✪', '♕', '؟', '₤', '☛', '╮', '␊', '＋', '┈', '％', 
        '╋', '▽', '⇨', '┻', '⊗', '￡', '।', '▂', '✯', '▇', '＿', '➤', '✞', '＝', '▷', '△', '◙', '▅', '✝', '∧', '␉', '☭', 
        '┊', '╯', '☾', '➔', '∴', '\x92', '▃', '↳', '＾', '׳', '➢', '╭', '➡', '＠', '⊙', '☢', '˝', '∏', '„', '∥', '❝', '☐', 
        '▆', '╱', '⋙', '๏', '☁', '⇔', '▔', '\x91', '➚', '◡', '╰', '\x85', '♢', '˙', '۞', '✘', '✮', '☑', '⋆', 'ⓘ', '❒', 
        '☣', '✉', '⌊', '➠', '∣', '❑', '◢', 'ⓒ', '\x80', '〒', '∕', '▮', '⦿', '✫', '✚', '⋯', '♩', '☂', '❞', '‗', '܂', '☜', 
        '‾', '✜', '╲', '∘', '⟩', '＼', '⟨', '·', '✗', '♚', '∅', 'ⓔ', '◣', '͡', '‛', '❦', '◠', '✄', '❄', '∃', '␣', '≪', '｢', 
        '≅', '◯', '☽', '∎', '｣', '❧', '̅', 'ⓐ', '↘', '⚓', '▣', '˘', '∪', '⇢', '✍', '⊥', '＃', '⎯', '↠', '۩', '☰', '◥', 
        '⊆', '✽', '⚡', '↪', '❁', '☹', '◼', '☃', '◤', '❏', 'ⓢ', '⊱', '➝', '̣', '✡', '∠', '｀', '▴', '┤', '∝', '♏', 'ⓐ', 
        '✎', ';', '␤', '＇', '❣', '✂', '✤', 'ⓞ', '☪', '✴', '⌒', '˛', '♒', '＄', '✶', '▻', 'ⓔ', '◌', '◈', '❚', '❂', '￦', 
        '◉', '╜', '̃', '✱', '╖', '❉', 'ⓡ', '↗', 'ⓣ', '♻', '➽', '׀', '✲', '✬', '☉', '▉', '≒', '☥', '⌐', '♨', '✕', 'ⓝ', 
        '⊰', '❘', '＂', '⇧', '̵', '➪', '▁', '▏', '⊃', 'ⓛ', '‚', '♰', '́', '✏', '⏑', '̶', 'ⓢ', '⩾', '￠', '❍', '≃', '⋰', '♋', 
        '､', '̂', '❋', '✳', 'ⓤ', '╤', '▕', '⌣', '✸', '℮', '⁺', '▨', '╨', 'ⓥ', '♈', '❃', '☝', '✻', '⊇', '≻', '♘', '♞', 
        '◂', '✟', '⌠', '✠', '☚', '✥', '❊', 'ⓒ', '⌈', '❅', 'ⓡ', '♧', 'ⓞ', '▭', '❱', 'ⓣ', '∟', '☕', '♺', '∵', '⍝', 'ⓑ', 
        '✵', '✣', '٭', '♆', 'ⓘ', '∶', '⚜', '◞', '்', '✹', '➥', '↕', '̳', '∷', '✋', '➧', '∋', '̿', 'ͧ', '┅', '⥤', '⬆', '⋱', 
        '☄', '↖', '⋮', '۔', '♌', 'ⓛ', '╕', '♓', '❯', '♍', '▋', '✺', '⭐', '✾', '♊', '➣', '▿', 'ⓑ', '♉', '⏠', '◾', '▹', 
        '⩽', '↦', '╥', '⍵', '⌋', '։', '➨', '∮', '⇥', 'ⓗ', 'ⓓ', '⁻', '⎝', '⌥', '⌉', '◔', '◑', '✼', '♎', '♐', '╪', '⊚', 
        '☒', '⇤', 'ⓜ', '⎠', '◐', '⚠', '╞', '◗', '⎕', 'ⓨ', '☟', 'ⓟ', '♟', '❈', '↬', 'ⓓ', '◻', '♮', '❙', '♤', '∉', '؛', 
        '⁂', 'ⓝ', '־', '♑', '╫', '╓', '╳', '⬅', '☔', '☸', '┄', '╧', '׃', '⎢', '❆', '⋄', '⚫', '̏', '☏', '➞', '͂', '␙', 
        'ⓤ', '◟', '̊', '⚐', '✙', '↙', '̾', '℘', '✷', '⍺', '❌', '⊢', '▵', '✅', 'ⓖ', '☨', '▰', '╡', 'ⓜ', '☤', '∽', '╘', 
        '˹', '↨', '♙', '⬇', '♱', '⌡', '⠀', '╛', '❕', '┉', 'ⓟ', '̀', '♖', 'ⓚ', '┆', '⎜', '◜', '⚾', '⤴', '✇', '╟', '⎛', 
        '☩', '➲', '➟', 'ⓥ', 'ⓗ', '⏝', '◃', '╢', '↯', '✆', '˃', '⍴', '❇', '⚽', '╒', '̸', '♜', '☓', '➳', '⇄', '☬', '⚑', 
        '✐', '⌃', '◅', '▢', '❐', '∊', '☈', '॥', '⎮', '▩', 'ு', '⊹', '‵', '␔', '☊', '➸', '̌', '☿', '⇉', '⊳', '╙', 'ⓦ', 
        '⇣', '｛', '̄', '↝', '⎟', '▍', '❗', '״', '΄', '▞', '◁', '⛄', '⇝', '⎪', '♁', '⇠', '☇', '✊', 'ி', '｝', '⭕', '➘', 
        '⁀', '☙', '❛', '❓', '⟲', '⇀', '≲', 'ⓕ', '⎥', '\u06dd', 'ͤ', '₋', '̱', '̎', '♝', '≳', '▙', '➭', '܀', 'ⓖ', '⇛', '▊', 
        '⇗', '̷', '⇱', '℅', 'ⓧ', '⚛', '̐', '̕', '⇌', '␀', '≌', 'ⓦ', '⊤', '̓', '☦', 'ⓕ', '▜', '➙', 'ⓨ', '⌨', '◮', '☷', 
        '◍', 'ⓚ', '≔', '⏩', '⍳', '℞', '┋', '˻', '▚', '≺', 'ْ', '▟', '➻', '̪', '⏪', '̉', '⎞', '┇', '⍟', '⇪', '▎', '⇦', '␝', 
        '⤷', '≖', '⟶', '♗', '̴', '♄', 'ͨ', '̈', '❜', '̡', '▛', '✁', '➩', 'ா', '˂', '↥', '⏎', '⎷', '̲', '➖', '↲', '⩵', '̗', '❢', 
        '≎', '⚔', '⇇', '̑', '⊿', '̖', '☍', '➹', '⥊', '⁁', '✢']

In [None]:
def remove_punctuation(text, punctuations=None):
    """Return ``text`` with every listed punctuation mark replaced by a space.

    The previous body replaced each mark with itself, so the text came back
    unchanged. Marks are replaced by a space rather than deleted so that
    words separated only by punctuation ("yes,no") do not fuse together.

    Note that the module-level ``punctuation_list`` contains the apostrophe,
    so contractions such as "don't" are split by this step.

    Args:
        text: The string to clean.
        punctuations: Optional iterable of strings to strip. Defaults to the
            module-level ``punctuation_list``.

    Returns:
        The cleaned string.
    """
    if punctuations is None:
        punctuations = punctuation_list
    for punctuation in punctuations:
        # Skip empty entries: replacing "" would insert a space between
        # every pair of characters.
        if punctuation and punctuation in text:
            text = text.replace(punctuation, ' ')
    return text

### Cleaning numbers

In [None]:
def clean_numbers(text):
    """Mask runs of ASCII digits in ``text`` with '#' placeholders.

    Runs of five or more digits become '#####', and runs of four, three and
    two digits become '####', '###' and '##'. A lone digit is left as is.

    Args:
        text: The string to process.

    Returns:
        The string with digit runs masked.
    """
    # Nothing to mask when the text holds no digit at all.
    if re.search(r'\d', text) is None:
        return text

    # Longest runs first, so each shorter pattern only sees what is left.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_pattern, placeholder in masks:
        text = re.sub(digit_pattern, placeholder, text)
    return text

### Correcting misspelled words

In [None]:
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'bitcoin', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization', 
                'electroneum':'bitcoin','nanodegree':'degree','hotstar':'star','dream11':'dream','ftre':'fire','tensorflow':'framework','unocoin':'bitcoin',
                'lnmiit':'limit','unacademy':'academy','altcoin':'bitcoin','altcoins':'bitcoin','litecoin':'bitcoin','coinbase':'bitcoin','cryptocurency':'cryptocurrency',
                'simpliv':'simple','quoras':'quora','schizoids':'psychopath','remainers':'remainder','twinflame':'soulmate','quorans':'quora','brexit':'demonetized',
                'iiest':'institute','dceu':'comics','pessat':'exam','uceed':'college','bhakts':'devotee','boruto':'anime',
                'cryptocoin':'bitcoin','blockchains':'blockchain','fiancee':'fiance','redmi':'smartphone','oneplus':'smartphone','qoura':'quora','deepmind':'framework','ryzen':'cpu','whattsapp':'whatsapp',
                'undertale':'adventure','zenfone':'smartphone','cryptocurencies':'cryptocurrencies','koinex':'bitcoin','zebpay':'bitcoin','binance':'bitcoin','whtsapp':'whatsapp',
                'reactjs':'framework','bittrex':'bitcoin','bitconnect':'bitcoin','bitfinex':'bitcoin','yourquote':'your quote','whyis':'why is','jiophone':'smartphone',
                'dogecoin':'bitcoin','onecoin':'bitcoin','poloniex':'bitcoin','7700k':'cpu','angular2':'framework','segwit2x':'bitcoin','hashflare':'bitcoin','940mx':'gpu',
                'openai':'framework','hashflare':'bitcoin','1050ti':'gpu','nearbuy':'near buy','freebitco':'bitcoin','antminer':'bitcoin','filecoin':'bitcoin','whatapp':'whatsapp',
                'empowr':'empower','1080ti':'gpu','crytocurrency':'cryptocurrency','8700k':'cpu','whatsaap':'whatsapp','g4560':'cpu','payymoney':'pay money',
                'fuckboys':'fuck boys','intenship':'internship','zcash':'bitcoin','demonatisation':'demonetization','narcicist':'narcissist','mastuburation':'masturbation',
                'trignometric':'trigonometric','cryptocurreny':'cryptocurrency','howdid':'how did','crytocurrencies':'cryptocurrencies','phycopath':'psychopath',
                'bytecoin':'bitcoin','possesiveness':'possessiveness','scollege':'college','humanties':'humanities','altacoin':'bitcoin','demonitised':'demonetized',
                'brasília':'brazilia','accolite':'accolyte','econimics':'economics','varrier':'warrier','quroa':'quora','statergy':'strategy','langague':'language',
                'splatoon':'game','7600k':'cpu','gate2018':'gate 2018','in2018':'in 2018','narcassist':'narcissist','jiocoin':'bitcoin','hnlu':'hulu','7300hq':'cpu',
                'weatern':'western','interledger':'blockchain','deplation':'deflation', 'cryptocurrencies':'cryptocurrency', 'bitcoin':'blockchain cryptocurrency',}

In [None]:
import re
def get_misspelled_dict_and_regex(mispell_dict):
    """Compile one alternation pattern that matches any key of ``mispell_dict``.

    Args:
        mispell_dict: Mapping from a misspelling to its replacement.

    Returns:
        A ``(mispell_dict, compiled_pattern)`` tuple. The dictionary is
        returned unchanged; ``match.group(0)`` of any match of the pattern is
        one of its keys.
    """
    # Regex alternation takes the first alternative that matches, so longer
    # keys go first. Otherwise 'altcoin' would win over 'altcoins' and leave
    # a stray 's' behind.
    ordered_keys = sorted(mispell_dict, key=len, reverse=True)
    if not ordered_keys:
        # An empty alternation would match the empty string everywhere;
        # '(?!)' is a pattern that never matches.
        return mispell_dict, re.compile('(?!)')
    # Escape the keys so characters such as '+' or '.' are taken literally.
    mispell_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))
    return mispell_dict, mispell_re

# Build the lookup table and its compiled pattern once, when the cell runs.
mispellings, mispellings_re = get_misspelled_dict_and_regex(mispell_dict)


def replace_typical_misspell(text):
    """Return ``text`` with each match of the misspelling pattern replaced.

    Every match is looked up in ``mispellings`` and replaced by the value
    stored for it.
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

## Expanding contractions

In [None]:
contraction_dict = {
    "ain't": "is not", 
    "aren't": "are not",
    "can't": "cannot", 
    "'cause": "because", 
    "could've": "could have", 
    "couldn't": "could not", 
    "didn't": "did not",  
    "doesn't": "does not", 
    "don't": "do not", 
    "hadn't": "had not", 
    "hasn't": "has not", 
    "haven't": "have not", 
    "he'd": "he would",
    "he'll": "he will", 
    "he's": "he is", 
    "how'd": "how did", 
    "how'd'y": "how do you", 
    "how'll": "how will", 
    "how's": "how is",  
    "I'd": "I would", 
    "I'd've": "I would have",
    "I'll": "I will", 
    "I'll've": "I will have",
    "I'm": "I am", 
    "I've": "I have", 
    "i'd": "i would", 
    "i'd've": "i would have", 
    "i'll": "i will",  
    "i'll've": "i will have",
    "i'm": "i am", 
    "i've": "i have", 
    "isn't": "is not", 
    "it'd": "it would", 
    "it'd've": "it would have", 
    "it'll": "it will", 
    "it'll've": "it will have",
    "it's": "it is", 
    "let's": "let us", 
    "ma'am": "madam", 
    "mayn't": "may not", 
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have", 
    "must've": "must have", 
    "mustn't": "must not", 
    "mustn't've": "must not have", 
    "needn't": "need not", 
    "needn't've": "need not have",
    "o'clock": "of the clock", 
    "oughtn't": "ought not", 
    "oughtn't've": "ought not have", 
    "shan't": "shall not", 
    "sha'n't": "shall not", "shan't've": "shall not have", 
    "she'd": "she would", "she'd've": "she would have", 
    "she'll": "she will", "she'll've": "she will have", 
    "she's": "she is", "should've": "should have", 
    "shouldn't": "should not", "shouldn't've": "should not have", 
    "so've": "so have","so's": "so as", "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", 
    "that's": "that is", "there'd": "there would", 
    "there'd've": "there would have", "there's": "there is", 
    "here's": "here is","they'd": "they would", "they'd've": "they would have", 
    "they'll": "they will", "they'll've": "they will have", 
    "they're": "they are", "they've": "they have", 
    "to've": "to have", "wasn't": "was not", 
    "we'd": "we would", "we'd've": "we would have", 
    "we'll": "we will", "we'll've": "we will have", 
    "we're": "we are", "we've": "we have", 
    "weren't": "were not", "what'll": "what will", 
    "what'll've": "what will have", "what're": "what are",  
    "what's": "what is", "what've": "what have", "when's": "when is", 
    "when've": "when have", "where'd": "where did", "where's": "where is", 
    "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", 
    "will've": "will have", "won't": "will not", "won't've": "will not have", 
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
    "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
    "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
    "you're": "you are", "you've": "you have"}

In [None]:
def get_contractions_dict_and_regex(contraction_dict):
    """Compile one alternation pattern that matches any key of ``contraction_dict``.

    Args:
        contraction_dict: Mapping from a contraction to its expanded form.

    Returns:
        A ``(contraction_dict, compiled_pattern)`` tuple. The dictionary is
        returned unchanged; ``match.group(0)`` of any match of the pattern is
        one of its keys.
    """
    # Regex alternation takes the first alternative that matches, so longer
    # keys go first. Otherwise "i'd" would win over "i'd've" and produce
    # "i would've" instead of "i would have".
    ordered_keys = sorted(contraction_dict, key=len, reverse=True)
    if not ordered_keys:
        # An empty alternation would match the empty string everywhere;
        # '(?!)' is a pattern that never matches.
        return contraction_dict, re.compile('(?!)')
    # Escape the keys so regex metacharacters are taken literally.
    contraction_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))
    return contraction_dict, contraction_re

# Build the lookup table and its compiled pattern once, when the cell runs.
contractions, contractions_re = get_contractions_dict_and_regex(contraction_dict)


def replace_contractions(text):
    """Return ``text`` with each match of the contraction pattern expanded.

    Every match is looked up in ``contractions`` and replaced by the value
    stored for it.
    """
    return contractions_re.sub(lambda found: contractions[found.group(0)], text)

## Removing stopwords

In [None]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
stopword_list = nltk.corpus.stopwords.words('english')

# Set copy of the stop words for constant-time membership tests. It is built
# once here, so later changes to stopword_list are not reflected in it.
_stopword_set = frozenset(stopword_list)
# One tokenizer shared by every call instead of a new one per question.
_toktok_tokenizer = ToktokTokenizer()


def remove_stopwords(text, is_lower_case=True):
    """Return ``text`` with English stop words removed.

    The text is tokenised with NLTK's ``ToktokTokenizer``, each token is
    stripped of surrounding whitespace, and the tokens that survive are
    joined with single spaces.

    Args:
        text: The string to filter.
        is_lower_case: When True the tokens are compared to the stop-word
            list as they are (the caller has already lower-cased the text).
            When False each token is lower-cased for the comparison only.

    Returns:
        The filtered text.
    """
    tokens = [token.strip() for token in _toktok_tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in _stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in _stopword_set]
    return ' '.join(filtered_tokens)

In [None]:
# Applying all the preprocessing techniques discussed
def clean_questions(x):
    """Run the full text-cleaning pipeline on one question.

    Steps, in order: lower-case, expand contractions, punctuation handling,
    mask digit runs, fix known misspellings, drop stop words, and finally
    strip any apostrophes that are left.

    Args:
        x: The raw question. It is coerced with ``str()`` first, as the
            feature-engineering code does.

    Returns:
        The cleaned question text.
    """
    x = str(x).lower()
    # Contractions are expanded first. Their patterns need the apostrophe
    # and the whole word to be intact, and the later steps (punctuation
    # handling, tokenisation, stop-word removal) break or drop them. Doing
    # it first also means the expansion's own words ("is", "not", ...) go
    # through stop-word removal like the rest of the text.
    x = replace_contractions(x)
    x = remove_punctuation(x)
    x = clean_numbers(x)
    x = replace_typical_misspell(x)
    x = remove_stopwords(x)
    x = x.replace("'","")
    return x

## Applying the preprocessing pipeline to the training and test questions

In [None]:
# Store the cleaned text in a new column; the raw question_text is kept.
train_data['preprocessed_question_text'] = train_data['question_text'].apply(clean_questions)
test_data['preprocessed_question_text'] = test_data['question_text'].apply(clean_questions)

In [None]:
train_data.info()

### Word Cloud Comparison of Insincere Questions (Before and after preprocessing)

In [None]:
# word cloud before preprocessing
plot_wordcloud(train_data[train_data["target"] == 1]["question_text"], title="Word Cloud of Insincere Questions Before Preprocessing")

# word cloud after preprocessing
plot_wordcloud(train_data[train_data["target"] == 1]["preprocessed_question_text"], title="Word Cloud of Insincere Questions After Preprocessing")

## Building Vectorizers and models

In [None]:
import copy
import time
# log_loss is imported from the public sklearn.metrics namespace. The
# sklearn.metrics.classification module it used to come from is private and
# cannot be imported in recent scikit-learn releases.
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer 
from sklearn.naive_bayes import MultinomialNB

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn import metrics

## Count Vectorizer

In [None]:
# Word-level 1- to 3-gram counts; terms seen in fewer than 3 documents are dropped.
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    min_df=3,
    strip_accents='unicode',
    dtype=np.float32,
)

train_questions = train_data['preprocessed_question_text'].values.tolist()
test_questions = test_data['preprocessed_question_text'].values.tolist()

# The vocabulary is learned from the preprocessed training AND test questions;
# each split is then transformed separately.
vectorizer.fit(train_questions + test_questions)
train_vectorizer = vectorizer.transform(train_questions)
test_vectorizer = vectorizer.transform(test_questions)

In [None]:
# For storing the threshold values and f1 score
threshold_list = []
best_f1_score_list = []

### Custom function for building model and finding f1 score

In [None]:
train_y = train_data["target"].values

def buildModel(train_X, train_y, test_X, test_y, test_X2, model_obj):
    """Fit a copy of ``model_obj`` and score two feature sets with it.

    Args:
        train_X: Features the model is fitted on.
        train_y: Labels the model is fitted on.
        test_X: First feature set to score.
        test_y: Not used by this function.
        test_X2: Second feature set to score.
        model_obj: Estimator prototype offering ``fit`` and ``predict_proba``.
            It is deep-copied, so the object passed in is not fitted.

    Returns:
        ``(scores_for_test_X, scores_for_test_X2, fitted_model)`` where each
        score array is column 1 of the ``predict_proba`` output.
    """
    fitted = copy.deepcopy(model_obj)
    fitted.fit(train_X, train_y)
    # Column 1 of predict_proba, for test_X first and test_X2 second.
    positive_scores = [fitted.predict_proba(features)[:, 1] for features in (test_X, test_X2)]
    return positive_scores[0], positive_scores[1], fitted

def best_threshold_function(val_y, pred_val_y):
    """Scan cut-offs 0.10..0.20 (step 0.01) and record the one with the best F1.

    For every candidate threshold the scores in ``pred_val_y`` are binarised
    with ``> threshold`` and compared against ``val_y`` using
    ``metrics.f1_score``. The winning threshold and its F1 score are printed
    and appended to the module-level ``threshold_list`` and
    ``best_f1_score_list``. Nothing is returned.
    """
    # Candidate cut-offs, rounded to 2 decimals to remove float noise from arange
    candidates = [np.round(raw, 2) for raw in np.arange(0.1, 0.201, 0.01)]

    # F1 score obtained at each candidate, in the same order
    scores = [
        metrics.f1_score(val_y, (pred_val_y > cutoff).astype(int))
        for cutoff in candidates
    ]

    # On ties the lowest threshold wins (first occurrence of the maximum)
    best_f1_score = max(scores)
    best_threshold = candidates[scores.index(best_f1_score)]

    print(f"Best F1 Score: {best_f1_score} for threshold {best_threshold}")

    # Record the result for the comparison table
    threshold_list.append(best_threshold)
    best_f1_score_list.append(best_f1_score)



### Logistic Regression

In [None]:
# Logistic Regression on the count-vectorized features.
# A shuffled 5-fold splitter is created, but the loop stops after the first
# fold, so exactly one dev/validation split is trained and scored.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

cv_scores = []                            # log loss of each evaluated fold
pred_full_test = 0                        # running sum of test-set scores
pred_train = np.zeros(len(train_data))    # validation scores, one slot per training row

for dev_index, val_index in kf.split(train_data):
    dev_X = train_vectorizer[dev_index]
    val_X = train_vectorizer[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    pred_val_y, pred_test_y, model = buildModel(
        dev_X,
        dev_y,
        val_X,
        val_y,
        test_vectorizer,
        LogisticRegression(C=5., solver='sag'),
    )

    # Accumulate test scores and store this fold's validation scores
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y

    # Log loss on the validation fold
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))

    # Only the first fold is evaluated
    break

# Tune the decision threshold on the validation fold and record the result
best_threshold_function(val_y, pred_val_y)

### Naive Bayes

In [None]:
# Multinomial Naive Bayes on the count-vectorized features.
# Same protocol as the other experiments: a shuffled 5-fold splitter whose
# loop is cut short after the first fold.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

cv_scores = []                            # log loss of each evaluated fold
pred_full_test = 0                        # running sum of test-set scores
pred_train = np.zeros(len(train_data))    # validation scores, one slot per training row

for dev_index, val_index in kf.split(train_data):
    dev_X = train_vectorizer[dev_index]
    val_X = train_vectorizer[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    pred_val_y, pred_test_y, model = buildModel(
        dev_X,
        dev_y,
        val_X,
        val_y,
        test_vectorizer,
        MultinomialNB(),
    )

    # Accumulate test scores and store this fold's validation scores
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y

    # Log loss on the validation fold
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))

    # Only the first fold is evaluated
    break

# Tune the decision threshold on the validation fold and record the result
best_threshold_function(val_y, pred_val_y)

## TFIDF Vectorizer

In [None]:
# TF-IDF weights over 1- to 3-grams with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3))

# Preprocessed questions as plain Python lists (built once, reused below)
train_corpus = train_data['preprocessed_question_text'].values.tolist()
test_corpus = test_data['preprocessed_question_text'].values.tolist()

# Learn the vocabulary and IDF weights from train + test questions.
# `fit` is used instead of `fit_transform`: the combined train+test matrix was
# never used, so building it only cost time and memory.
vectorizer.fit(train_corpus + test_corpus)

# Sparse tf-idf matrices used by the models below
train_vectorizer = vectorizer.transform(train_corpus)
test_vectorizer = vectorizer.transform(test_corpus)

### Logistic Regression

In [None]:
# Logistic Regression on the tf-idf features.
# A shuffled 5-fold splitter is created, but the loop stops after the first
# fold, so exactly one dev/validation split is trained and scored.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

cv_scores = []                            # log loss of each evaluated fold
pred_full_test = 0                        # running sum of test-set scores
pred_train = np.zeros(len(train_data))    # validation scores, one slot per training row

for dev_index, val_index in kf.split(train_data):
    dev_X = train_vectorizer[dev_index]
    val_X = train_vectorizer[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    pred_val_y, pred_test_y, model = buildModel(
        dev_X,
        dev_y,
        val_X,
        val_y,
        test_vectorizer,
        LogisticRegression(C=5., solver='sag'),
    )

    # Accumulate test scores and store this fold's validation scores
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y

    # Log loss on the validation fold
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))

    # Only the first fold is evaluated
    break

# Tune the decision threshold on the validation fold and record the result
best_threshold_function(val_y, pred_val_y)

### Naive Bayes

In [None]:
# Multinomial Naive Bayes on the tf-idf features.
# Same protocol as the other experiments: a shuffled 5-fold splitter whose
# loop is cut short after the first fold.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

cv_scores = []                            # log loss of each evaluated fold
pred_full_test = 0                        # running sum of test-set scores
pred_train = np.zeros(len(train_data))    # validation scores, one slot per training row

for dev_index, val_index in kf.split(train_data):
    dev_X = train_vectorizer[dev_index]
    val_X = train_vectorizer[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    pred_val_y, pred_test_y, model = buildModel(
        dev_X,
        dev_y,
        val_X,
        val_y,
        test_vectorizer,
        MultinomialNB(),
    )

    # Accumulate test scores and store this fold's validation scores
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y

    # Log loss on the validation fold
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))

    # Only the first fold is evaluated
    break

# Tune the decision threshold on the validation fold and record the result
best_threshold_function(val_y, pred_val_y)

## Hashing Vectorizer

In [None]:
# Hashed 1- to 3-gram features projected into 2**10 = 1024 columns
vectorizer = HashingVectorizer(
    dtype=np.float32,
    strip_accents='unicode', 
    analyzer='word',
    ngram_range=(1, 3),
    n_features=2**10
)

# HashingVectorizer is stateless: it learns nothing from the data, so no
# fitting pass is needed. The previous `fit_transform` over train + test hashed
# the whole corpus and discarded the result; it has been removed.
train_vectorizer = vectorizer.transform(train_data['preprocessed_question_text'].values.tolist())
test_vectorizer = vectorizer.transform(test_data['preprocessed_question_text'].values.tolist())

### Logistic Regression

In [None]:
# Logistic Regression on the hashed features.
# A shuffled 5-fold splitter is created, but the loop stops after the first
# fold, so exactly one dev/validation split is trained and scored.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

cv_scores = []                            # log loss of each evaluated fold
pred_full_test = 0                        # running sum of test-set scores
pred_train = np.zeros(len(train_data))    # validation scores, one slot per training row

for dev_index, val_index in kf.split(train_data):
    dev_X = train_vectorizer[dev_index]
    val_X = train_vectorizer[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    pred_val_y, pred_test_y, model = buildModel(
        dev_X,
        dev_y,
        val_X,
        val_y,
        test_vectorizer,
        LogisticRegression(C=5., solver='sag'),
    )

    # Accumulate test scores and store this fold's validation scores
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y

    # Log loss on the validation fold
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))

    # Only the first fold is evaluated
    break

# Tune the decision threshold on the validation fold and record the result
best_threshold_function(val_y, pred_val_y)

## Comparing all the models

In [None]:

from prettytable import PrettyTable

# Row labels, in the order the five experiments were run above.
# Note: `model` is rebound here from the last fitted estimator to a list of names.
vect = [
    "CountVectorizer", "CountVectorizer",
    "TFIDFVectorizer", "TFIDFVectorizer",
    "HashingVectorizer",
]
model = [
    "Logistic Regression", "Naive Bayes",
    "Logistic Regression", "Naive Bayes",
    "Logistic Regression",
]

# One column per attribute; the score and threshold columns come from the
# lists filled by best_threshold_function.
# NOTE(review): the scores are computed on the validation fold, although the
# column header says "Test".
table = PrettyTable()
table.add_column("Model", model)
table.add_column("Vectorizer", vect)
table.add_column("Test F1-Score", best_f1_score_list)
table.add_column("Best Threshold", threshold_list)

In [None]:
# Render the model/vectorizer comparison table as text
print(table)

## Model Training for best f1 score
### Creating Count vectorizer and model

In [None]:
# Rebuild the count features for the final model: `train_vectorizer` and
# `test_vectorizer` were overwritten by the later vectorizer experiments.
# Bag-of-words counts over 1- to 3-grams; terms seen in fewer than 3 documents are dropped.
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    ngram_range=(1, 3),
    min_df=3,
    dtype=np.float32,
)

# Preprocessed questions as plain Python lists
train_corpus = train_data['preprocessed_question_text'].values.tolist()
test_corpus = test_data['preprocessed_question_text'].values.tolist()

# The vocabulary is learned from the train and test questions together
vectorizer.fit(train_corpus + test_corpus)

# Sparse document-term matrices used by the final model
train_vectorizer = vectorizer.transform(train_corpus)
test_vectorizer = vectorizer.transform(test_corpus)

### Building and running model

In [None]:
# Final model: Logistic Regression on the count-vectorized features.
# A shuffled 5-fold splitter is created, but the loop stops after the first
# fold, so the submission scores come from a single model trained on one dev split.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

cv_scores = []                            # log loss of each evaluated fold
pred_full_test = 0                        # running sum of test-set scores
pred_train = np.zeros(len(train_data))    # validation scores, one slot per training row

for dev_index, val_index in kf.split(train_data):
    dev_X = train_vectorizer[dev_index]
    val_X = train_vectorizer[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    pred_val_y, pred_test_y, model = buildModel(
        dev_X,
        dev_y,
        val_X,
        val_y,
        test_vectorizer,
        LogisticRegression(C=5., solver='sag'),
    )

    # Accumulate test scores and store this fold's validation scores
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y

    # Log loss on the validation fold
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))

    # Only the first fold is evaluated
    break

# Tune the decision threshold on the validation fold and record the result
best_threshold_function(val_y, pred_val_y)

## Result - Output to csv file

In [None]:
# Turn the test-set scores into 0/1 labels.
# NOTE(review): the 0.2 cut-off is hard-coded rather than read from the
# threshold found by best_threshold_function; confirm the two agree.
pred_full_test = (pred_full_test > 0.2).astype(int)

# Submission frame: question id plus predicted label
output = pd.DataFrame(
    {"qid": test_data["qid"].values, "prediction": pred_full_test}
)

# Written to the working directory without the index column
output.to_csv("submission.csv", index=False)