# Table of Contents:

1. Load the Data
2. Quick view of Data
3. EDA
    * Data Visualization
    * Text Preprocessing
    * Word Embeddings
4. Build Basic Linear Model
5. Evaluate the results


In [None]:
import numpy as np 
import pandas as pd 
import os
import time
from tqdm import tqdm
import itertools
import h2o

import lightgbm as lgb

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix

import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.util import ngrams
from collections import Counter
from collections import defaultdict
from spacy.lang.en.stop_words import STOP_WORDS

from scipy.sparse import hstack

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches

from IPython.core.display import display, HTML

import warnings
warnings.filterwarnings("ignore")

plt.style.use('fivethirtyeight')

%matplotlib inline

In [None]:
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## 1. Load the Data

In [None]:
# Read both competition splits; missing values in any column are replaced
# by a single space.
train = pd.read_csv('/kaggle/input/quora-insincere-questions-classification//train.csv')
train = train.fillna(' ')
test = pd.read_csv('/kaggle/input/quora-insincere-questions-classification//test.csv')
test = test.fillna(' ')

In [None]:
def display_df(dfs:list, captions:list):
    """Display tables side by side to save vertical space.

    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, matched to ``dfs`` by position
                  (captions may repeat)

    If the two lists differ in length, the extra items of the longer one
    are ignored. Nothing is returned; the tables are rendered as HTML.
    """
    pieces = []
    # Pair frames and captions positionally. Building a dict keyed by caption
    # would silently drop every table but the last one sharing a caption.
    for df, caption in zip(dfs, captions):
        styled = df.style.set_table_attributes("style='display:inline'").set_caption(caption)
        pieces.append(styled._repr_html_())
        pieces.append("\xa0\xa0\xa0")  # non-breaking spaces keep the tables apart
    display(HTML("".join(pieces)))

## 2. Quick view of Data

In [None]:
# Seeded sampling so the preview shows the same rows on every run
display_df([train.sample(5, random_state=42), test.sample(5, random_state=42)], ['Train', 'Test'])

In [None]:
# Report the (rows, columns) shape of each split
print(f"Train shape: {train.shape} ||  Test shape:{test.shape}")

## 3. EDA
 **Data Visualization**

In [None]:
# Share of each target class, as a percentage of all training rows
train['target'].value_counts(normalize=True).mul(100)

In [None]:
# Bar chart of the number of training rows per target class
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train, x='target', ax=ax);

In [None]:
# One horizontal bar chart per target class with its most frequent non-stop-word tokens
labels = train.target.unique()
# squeeze=False keeps `axes` indexable even when only one class is present
fig, axes = plt.subplots(1, len(labels), figsize=(20,8), squeeze=False)
axes = axes.ravel()
fig.suptitle('Unigram Analysis')

for index, target in enumerate(labels):
    curdf = train[train['target'] == target]
    # Join the questions with a space: a bare str.cat() glues the last word of
    # one question onto the first word of the next and corrupts the counts.
    counter = Counter(curdf.question_text.str.cat(sep=' ').split())
    x = []
    y = []
    # Take the 100 most frequent tokens, then drop the stop words among them
    for word, count in counter.most_common(100):
        if word.lower() not in STOP_WORDS:
            x.append(word)
            y.append(count)
    sns.barplot(ax=axes[index], x=y, y=x)
    axes[index].set_title("Target: " + str(target))

In [None]:
def get_top_bigrams(corpus, n=None):
    """Return the most frequent word bigrams of ``corpus``.

    corpus: iterable of text documents
    n: how many (bigram, count) pairs to keep; None keeps all of them

    The result is a list of (bigram, count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Column totals: occurrences of each bigram over the whole corpus
    totals = doc_term.sum(axis=0)
    ranked = []
    for bigram, column in vectorizer.vocabulary_.items():
        ranked.append((bigram, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

print("Bigram analysis")

# One horizontal bar chart per target class with its 50 most frequent bigrams
labels = train.target.unique()
# squeeze=False keeps `axes` indexable even when only one class is present
fig, axes = plt.subplots(1, len(labels), figsize=(20,8), squeeze=False)
axes = axes.ravel()
fig.suptitle('Bigram analysis')

for index, target in enumerate(labels):
    questions = train[train['target'] == target].question_text
    top_bigrams = get_top_bigrams(questions, n=50)
    # Split the (bigram, count) pairs into two parallel lists for plotting
    x, y = map(list, zip(*top_bigrams))
    sns.barplot(ax=axes[index], x=y, y=x)
    axes[index].set_title("Target: " + str(target))

# plt.show() is what the other plotting cells use; fig.show() expects a GUI backend
plt.show()

## 3. EDA
 **Text Preprocessing**

In [None]:
# Patterns and stop-word list used by text_prepare to clean question text.
# Raw strings hand the backslashes to the regex engine untouched; in a plain
# literal '\[' , '\]' and '\|' are invalid escape sequences that Python warns about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators replaced by a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # every other character is deleted
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string

        return: the lower-cased text with separator symbols turned into
                spaces, remaining unwanted symbols deleted and stop words
                removed, joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(" ", lowered)  # separators -> space
    stripped = BAD_SYMBOLS_RE.sub("", spaced)       # drop characters outside the allowed set

    # split() with no argument also collapses runs of whitespace
    return ' '.join(token for token in stripped.split() if token not in STOPWORDS)

In [None]:
# Clean every training question and store the result in a new column
train['question_text_cleaned'] = train['question_text'].map(text_prepare)

In [None]:
# Both samples use the same seed, so the raw and cleaned tables show the same rows
raw_preview = train[['question_text']].sample(5, random_state=42)
cleaned_preview = train[['question_text_cleaned']].sample(5, random_state=42)
display_df([raw_preview, cleaned_preview], ['Raw Data', 'Cleaned Data'])

## 3. EDA
 **Word Embeddings**
> CountVectorizer  

In [None]:
def cv(data):
    """Fit a default CountVectorizer on ``data``.

    Returns a pair: the matrix produced by fit_transform, followed by the
    fitted vectorizer itself.
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(data), vectorizer

In [None]:
# Cleaned questions and their labels as plain Python lists
list_corpus = train['question_text_cleaned'].tolist()
list_labels = train['target'].tolist()

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    list_corpus,
    list_labels,
    test_size=0.2,
    random_state=40,
)

In [None]:
# Fit the vectorizer on the training split only, then reuse the fitted
# vocabulary to transform the hold-out split
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [None]:
# Peek at the first ten (token, feature index) entries of the fitted vocabulary
list(itertools.islice(count_vectorizer.vocabulary_.items(), 10))

## 4. Build Basic Linear Model

In [None]:
def plot_LSA(test_data, test_labels, savepath="PCA.csv", plot=True):
    """Project ``test_data`` onto two TruncatedSVD components and scatter-plot it.

    test_data: feature matrix accepted by TruncatedSVD
    test_labels: one class label per row of ``test_data``
    savepath: kept for backward compatibility; nothing is written to it
    plot: when False the projection is computed but nothing is drawn

    The points are drawn with pyplot on the current figure, coloured by class.
    Only two colours are defined, so the legend covers the first two classes
    in sorted order. Nothing is returned.
    """
    lsa = TruncatedSVD(n_components=2)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)
    if not plot:
        return

    colors = ['blue', 'red']
    # Sort the classes so each label always lands on the same colour slot;
    # iterating a bare set gives no ordering guarantee.
    classes = sorted(set(test_labels))
    color_mapper = {label: idx for idx, label in enumerate(classes)}
    # Colour by class index rather than by raw label, so labels that are not
    # already 0/1 are handled too.
    color_column = [color_mapper[label] for label in test_labels]
    plt.scatter(lsa_scores[:,0], lsa_scores[:,1], s=8, alpha=.8, c=color_column,
                cmap=matplotlib.colors.ListedColormap(colors))
    # Legend entries come from the actual classes, paired with their colours
    handles = [mpatches.Patch(color=color, label=str(label))
               for label, color in zip(classes, colors)]
    plt.legend(handles=handles, prop={'size': 25})

In [None]:
# Open a square figure and draw the 2-component projection of the training counts on it
fig = plt.figure(figsize=(10, 10))
plot_LSA(X_train_counts, y_train)
plt.show()

In [None]:
# Class-weighted logistic regression trained on the count features
clf = LogisticRegression(
    C=0.5,
    class_weight='balanced',
    solver='sag',
    n_jobs=-1,
    random_state=40,
).fit(X_train_counts, y_train)

# Predicted class labels for the hold-out split
y_predicted_counts = clf.predict(X_test_counts)

In [None]:
def opt_f1(y_test, y_pred):
    """Scan decision thresholds and return the one with the highest F1 score.

    y_test: true binary labels
    y_pred: predicted scores, one per label (list or array)

    Thresholds from 0.1 to 0.5 in steps of 0.01 are tried; the F1 score at
    each threshold and the final optimum are printed.
    Returns the best threshold, or None when no threshold yields F1 above 0.
    """
    # Coerce once so plain Python lists work too: `list > float` raises TypeError
    scores = np.asarray(y_pred)
    opt_prob = None
    f1_max = 0

    # Rounding removes the floating-point noise np.arange leaves on the steps
    for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
        f1 = f1_score(y_test, (scores > thresh).astype(int))
        print('F1 score at threshold {} is {}'.format(thresh, f1))

        # Strict comparison: on ties the lowest threshold wins
        if f1 > f1_max:
            f1_max = f1
            opt_prob = thresh

    print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))
    return opt_prob

In [None]:
# F1 score of the predicted labels on the hold-out split
print("F1 Score: ", f1_score(y_test, y_predicted_counts))

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.viridis):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    cm: 2-D numpy array; rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis
    classes: tick labels, used for both axes
    normalize: when True each row is divided by its row sum and cells are
               printed with two decimals; otherwise cells are printed as integers
    title: figure title
    cmap: colormap passed to imshow

    Returns the matplotlib.pyplot module.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)

    fmt = '.2f' if normalize else 'd'
    # Cells below half the maximum get white text, the others black
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 color="white" if cm[i, j] < thresh else "black", fontsize=30)

    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)
    # tight_layout is a one-off adjustment, so it has to run after the axis
    # labels exist for them to be included in the layout
    plt.tight_layout()

    return plt

## 5. Evaluate the results

In [None]:
# Confusion matrix of the hold-out predictions, drawn as raw counts (normalize=False)
cm = confusion_matrix(y_test, y_predicted_counts)
fig = plt.figure(figsize=(7, 7))
plot = plot_confusion_matrix(cm, classes=['OK','Toxic'], normalize=False, title='Confusion matrix')
plt.show();