# Import

In [None]:
import os
import numpy as np
import pandas as pd
import spacy

import matplotlib.pyplot as plt
import seaborn as sns

sns.set
%matplotlib inline

# Load the dataset

In [None]:
# Both CSV files are read from ../input, relative to the notebook.
input_dir = os.path.join('..', 'input')
df_train, df_test = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Look at the dataset

In [None]:
df_train.head()

In [None]:
df_train.shape, df_test.shape

In [None]:
df_train.info()

The data is clean: there are no NaN values.

In [None]:
# Bar chart of how many rows carry each `target` value.
target_counts = df_train['target'].value_counts()
target_counts.plot(kind='bar');

In [None]:
# Percentage of questions labelled insincere (target == 1), computed from the data
# rather than from hard-coded counts that go stale if the data changes.
# NOTE(review): the previous literal was 80810 / 1225312; the denominator looks like
# the sincere-only count rather than the total number of rows. Confirm that a share
# of all questions is what is intended here.
insincere_ratio = df_train['target'].eq(1).mean() * 100
insincere_ratio

In [None]:
# Text column is the model input (X); the `target` column is the label (y).
X, y = df_train['question_text'], df_train['target']

In [None]:
# Questions whose label is 1.
insincere_mask = y == 1
X_insincere = X[insincere_mask]
X_insincere.head()

We can already notice the troll content within the questions.

In [None]:
# Questions whose label is 0.
sincere_mask = y == 0
X_sincere = X[sincere_mask]
X_sincere.head()

By contrast, the sincere questions look legitimate.

# Preprocessing

## on X

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Tokenize every question with NLTK; the result is one token list per question.
corpus = [word_tokenize(question) for question in X]

In [None]:
# Lowercase each token, keeping the per-question grouping.
lowercase_train = [[tok.lower() for tok in tokens] for tokens in corpus]

In [None]:
# Keep only the tokens made up entirely of alphabetic characters.
alphas = [[tok for tok in tokens if tok.isalpha()] for tokens in lowercase_train]

In [None]:
from nltk.corpus import stopwords
# Stored as a set: the filters that use `stop_words` run one membership test per
# token over the whole corpus, and a list would be scanned linearly each time.
stop_words = set(stopwords.words('english'))

In [None]:
# Drop English stop words from the alphabetic tokens of each question.
train_no_stop = [[tok for tok in tokens if tok not in stop_words] for tokens in alphas]

In [None]:
from nltk.stem import PorterStemmer
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed = [[stemmer.stem(tok) for tok in tokens] for tokens in train_no_stop]

In [None]:
# Re-assemble each stemmed token list into a single space-separated string.
train_clean_str = list(map(' '.join, stemmed))

## Features 

**1)** Number of words

In [None]:
# Alphabetic-token count per question.
nb_words = list(map(len, alphas))

**2)** Number of unique words

In [None]:
# Distinct alphabetic tokens of each question.
alphas_unique = list(map(set, alphas))

In [None]:
# Number of distinct alphabetic tokens per question.
nb_words_unique = list(map(len, alphas_unique))

**3)** Number of characters

In [None]:
# Join the lowercased tokens of each question back into one string.
train_str = list(map(' '.join, lowercase_train))

In [None]:
# Length, in characters, of each re-joined lowercased question.
nb_characters = list(map(len, train_str))

**4)** Number of stopwords

In [None]:
# Alphabetic tokens of each question that are English stop words.
train_stopwords = [[tok for tok in tokens if tok in stop_words] for tokens in alphas]

In [None]:
# Stop-word count per question.
nb_stopwords = list(map(len, train_stopwords))

**5)** Number of punctuations

In [None]:
# Tokens that are not purely alphabetic (punctuation, digits, mixed tokens).
non_alphas = [[tok for tok in tokens if not tok.isalpha()] for tokens in lowercase_train]

In [None]:
# Count of non-alphabetic tokens per question.
nb_punctuation = list(map(len, non_alphas))

**6)** Number of title case words

In [None]:
# Title-cased tokens, taken from the original-case tokenization.
train_title = [[tok for tok in tokens if tok.istitle()] for tokens in corpus]

In [None]:
# Title-cased token count per question.
nb_title = list(map(len, train_title))

# New Dataframe with features

In [None]:
# One-column frame holding the cleaned text of every question.
clean_columns = {'text_clean': train_clean_str}
df_clean = pd.DataFrame(clean_columns)
df_clean.head()

In [None]:
# Wrap every per-question feature list in a pandas Series, under the same names.
(
    nb_words,
    nb_words_unique,
    nb_characters,
    nb_stopwords,
    nb_punctuation,
    nb_title,
) = map(
    pd.Series,
    (nb_words, nb_words_unique, nb_characters, nb_stopwords, nb_punctuation, nb_title),
)

In [None]:
# Attach each feature to the cleaned-text frame under its display name.
feature_columns = {
    "Number of words": nb_words,
    'Number of unique words': nb_words_unique,
    'Number of characters': nb_characters,
    'Number of stopwords': nb_stopwords,
    'Number of punctuations': nb_punctuation,
    'Number of titlecase words': nb_title,
}
df_show = df_clean.assign(**feature_columns)
df_show.head()

In [None]:
# Numeric features only: same frame without the text column.
df_feat = df_show.drop(columns=['text_clean'])
df_feat.head()

In [None]:
df_feat.info()

For now, this is too much data to visualise at once. We'll start with the insincere questions.

## EDA on the X_insincere

In [None]:
from nltk.tokenize import word_tokenize

Let's tokenize our document

In [None]:
%%time
corpus_insincere = [word_tokenize(t) for t in X_insincere]

Lowercase all the words

In [None]:
# Lowercased tokens of each insincere question.
lowercase = [[tok.lower() for tok in tokens] for tokens in corpus_insincere]

Remove stop words

In [None]:
from nltk.corpus import stopwords
# Stored as a set: the stop-word filters test membership once per token, and a
# list would be scanned linearly on every test.
stop_words = set(stopwords.words('english'))

In [None]:
# Insincere tokens with English stop words removed (punctuation is still present).
no_stops = [[tok for tok in tokens if tok not in stop_words] for tokens in lowercase]

Remove non-alphabetic tokens (punctuation, numbers)

In [None]:
# Keep only purely alphabetic tokens.
alphas_insincere = [[tok for tok in tokens if tok.isalpha()] for tokens in no_stops]

Stem the words

In [None]:
from nltk.stem import PorterStemmer
# Porter-stem every cleaned insincere token.
stemmer = PorterStemmer()
stemmed_insincere = [[stemmer.stem(tok) for tok in tokens] for tokens in alphas_insincere]

Show the length of "insincere" sentence 

In [None]:
# Token count per insincere question after stop-word removal.
nb_words_insincere_nostop = list(map(len, no_stops))

Average number of words per insincere question 

In [None]:
# Mean of the per-question token counts computed without stop words.
avg_nostop = np.mean(nb_words_insincere_nostop)
avg_nostop

In [None]:
# Token count per insincere question with stop words kept, and its mean.
nb_words_insincere_stop = list(map(len, lowercase))
avg_stop = np.mean(nb_words_insincere_stop)
avg_stop

In [None]:
np.median(nb_words_insincere_nostop)

In [None]:
np.median(nb_words_insincere_stop)

In [None]:
# Turn both count lists into pandas Series, keeping their names.
nb_words_insincere_stop, nb_words_insincere_nostop = map(
    pd.Series, (nb_words_insincere_stop, nb_words_insincere_nostop)
)

In [None]:
# Single-column frame built from the insincere question Series.
df_insincere = X_insincere.to_frame()

In [None]:
df_insincere.info()

In [None]:
# Put the questions (with their original row labels) next to both word counts,
# then restore the original labels as the index and name the count columns.
insincere_parts = [X_insincere.reset_index(), nb_words_insincere_nostop, nb_words_insincere_stop]
insincere_wide = pd.concat(insincere_parts, axis=1).set_index('index')
count_names = {0: "nb_words_no_stop", 1: 'nb_words_stop'}
df_insincere = insincere_wide.rename(columns=count_names)
df_insincere.head()

In [None]:
#plt.hist(nb_words_insincere_nostop, bins=30)
#plt.hist(nb_words_insincere_stop, bins=30)
# Overlaid histograms (kde=False) of log1p(token count) per insincere question,
# without and with stop words; log1p compresses the large counts.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version before re-running.
sns.distplot(np.log1p(nb_words_insincere_nostop), kde=False, label="No stop")
sns.distplot(np.log1p(nb_words_insincere_stop), kde=False, label="Stop")
plt.legend();

In [None]:
# Density curves only (hist=False) of the raw token counts per insincere question:
# red = stop words kept, blue = stop words removed.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version before re-running.
sns.distplot(nb_words_insincere_stop, hist=False, color='red', label='Stop')
sns.distplot(nb_words_insincere_nostop, hist=False, color='blue', label='No stop')
plt.legend();

The average number of words within the insincere questions is around **11 words** (without the stop_words) and **19 words** with the stop_words.
Let's compare it with the proper questions.

**Counting the ten most common words in the insincere questions**

In [None]:
from collections import Counter, defaultdict

# Tally how often each cleaned token occurs across all insincere questions.
counter = defaultdict(int)
for tokens in alphas_insincere:
    for word in tokens:
        counter[word] += 1

# Counter gives access to most_common(); show the ten most frequent tokens.
c = Counter(counter)
c.most_common(10)

## EDA on the X_sincere

In [None]:
# Tokenize each sincere question with NLTK.
corpus_sincere = [word_tokenize(question) for question in X_sincere]

In [None]:
# Lowercased tokens of each sincere question.
lowercase_sincere = [[tok.lower() for tok in tokens] for tokens in corpus_sincere]

In [None]:
# Sincere tokens with English stop words removed.
no_stop_sincere = [[tok for tok in tokens if tok not in stop_words] for tokens in lowercase_sincere]

In [None]:
# Keep only purely alphabetic tokens.
alphas_sincere = [[tok for tok in tokens if tok.isalpha()] for tokens in no_stop_sincere]

In [None]:
# Token count per sincere question after stop-word removal.
nb_words_sincere_nostop = list(map(len, no_stop_sincere))

In [None]:
avg_words_sincere_nostop = np.mean(nb_words_sincere_nostop)
avg_words_sincere_nostop

In [None]:
# Token count per sincere question with stop words kept, and its mean.
nb_words_sincere_stop = list(map(len, lowercase_sincere))
avg_words_sincere = np.mean(nb_words_sincere_stop)
avg_words_sincere

In [None]:
np.median(nb_words_sincere_nostop)

In [None]:
np.median(nb_words_sincere_stop)

The average number of words within the sincere questions is around **8 words** (without the stop_words) and **14 words** with the stop_words, i.e. fewer than in the insincere questions above.

In [None]:
# Turn both count lists into pandas Series, keeping their names.
nb_words_sincere_stop, nb_words_sincere_nostop = map(
    pd.Series, (nb_words_sincere_stop, nb_words_sincere_nostop)
)

In [None]:
# Single-column frame built from the sincere question Series.
df_sincere = X_sincere.to_frame()

In [None]:
df_sincere.info()

In [None]:
# Put the questions (with their original row labels) next to both word counts,
# then restore the original labels as the index and name the count columns.
sincere_parts = [X_sincere.reset_index(), nb_words_sincere_nostop, nb_words_sincere_stop]
sincere_wide = pd.concat(sincere_parts, axis=1).set_index('index')
count_names = {0: "nb_words_no_stop", 1: 'nb_words_stop'}
df_sincere = sincere_wide.rename(columns=count_names)
df_sincere.head()

**Counting the ten most common words in the sincere questions**

In [None]:
from collections import Counter, defaultdict

# Tally how often each cleaned token occurs across the sincere questions.
# The counts go into `counter_sincere`: the previous loop incremented `counter`
# (the insincere tally) and built `c_sincere` from it, so the reported top-10 mixed
# both classes and `counter_sincere` stayed empty.
counter_sincere = defaultdict(int)
for tokens in alphas_sincere:
    for word in tokens:
        counter_sincere[word] += 1

# Counter gives access to most_common(); show the ten most frequent tokens.
c_sincere = Counter(counter_sincere)
c_sincere.most_common(10)

# Topic Modeling

## Latent Dirichlet Allocation (LDA)

Our goal is to determine the most common topics among the insincere questions.

First, we use our tokenized documents that have been preprocessed.

Then we use Gensim to fit an LDA topic model.

In [None]:
from gensim import corpora
# Token <-> integer id mapping built from the cleaned insincere questions.
dictionary = corpora.Dictionary(documents=alphas_insincere)

In [None]:
# Bag-of-words representation of every cleaned insincere question.
corpus_1 = [dictionary.doc2bow(tokens) for tokens in alphas_insincere]

In [None]:
from gensim.models.ldamodel import LdaModel

In [None]:
%%time
lda_model = LdaModel(
    corpus=corpus_1, id2word=dictionary, num_topics=4, random_state=42)

In [None]:
from pprint import pprint
pprint(lda_model.print_topics())

In [None]:
%%time
lda_model_1 = LdaModel(
    corpus=corpus_1, id2word=dictionary, num_topics=4, random_state=42, iterations=10)

In [None]:
from pprint import pprint
pprint(lda_model_1.print_topics())

In [None]:
import pyLDAvis.gensim

In [None]:
pyLDAvis.enable_notebook()

In [None]:
pyLDAvis.gensim.prepare(lda_model, corpus_1, dictionary)

In [None]:
pyLDAvis.gensim.prepare(lda_model_1, corpus_1, dictionary)

In [None]:
# Top 30 terms of each topic of lda_model_1.
# NOTE(review): per the gensim docs each entry is (list of (weight, word), coherence)
# and the list is ordered by coherence — confirm against the installed version.
weight_topic = lda_model_1.top_topics(corpus=corpus_1, dictionary=dictionary, topn=30)

In [None]:
# Unpack the four topic entries into hand-picked names.
# NOTE(review): the names presumably come from reading the topic words by eye and
# depend on the order top_topics returns — verify after any re-fit.
politic, religion, sex, america = weight_topic

In [None]:
# Word list of the first topic entry. The words are read straight from
# `weight_topic` (element [0] of an entry is its term list, and the word is item
# [1] of each term pair) so the cell can be re-run: the previous version rebound
# `politic` from the topic entry to its own word list and broke on a second run.
politic = [term[1] for term in weight_topic[0][0]]
politic

In [None]:
# Word list of the second topic entry. The words are read straight from
# `weight_topic` (element [0] of an entry is its term list, and the word is item
# [1] of each term pair) so the cell can be re-run: the previous version rebound
# `religion` from the topic entry to its own word list and broke on a second run.
religion = [term[1] for term in weight_topic[1][0]]
religion

In [None]:
# Word list of the third topic entry. The words are read straight from
# `weight_topic` (element [0] of an entry is its term list, and the word is item
# [1] of each term pair) so the cell can be re-run: the previous version rebound
# `sex` from the topic entry to its own word list and broke on a second run.
sex = [term[1] for term in weight_topic[2][0]]
sex

In [None]:
# Word list of the fourth topic entry. The words are read straight from
# `weight_topic` (element [0] of an entry is its term list, and the word is item
# [1] of each term pair) so the cell can be re-run: the previous version rebound
# `america` from the topic entry to its own word list and broke on a second run.
america = [term[1] for term in weight_topic[3][0]]
america

We will now add a new feature that tags a question as insincere when it contains at least 3 words belonging to one of the topics.

In [None]:
y_labeled = []

In [None]:
# Tag a question 1 when at least MIN_TOPIC_WORDS of its tokens appear in any of the
# four topic word lists, otherwise 0.
#
# The list is rebuilt from scratch here, so re-running the cell no longer appends a
# second copy of the labels, and the global `counter` (the word tally used earlier)
# is no longer overwritten with an int.
MIN_TOPIC_WORDS = 3

# One set for all topics: a single O(1) lookup per token instead of four list scans.
topic_words = set(politic) | set(religion) | set(sex) | set(america)

y_labeled = [
    int(sum(word in topic_words for word in doc) >= MIN_TOPIC_WORDS)
    for doc in train_no_stop
]
        

In [None]:
y_labeled[:10]

In [None]:
y_labeled = pd.Series(y_labeled)
y_labeled[:3]

In [None]:
df_feat['y_topic_labeled'] = y_labeled

In [None]:
df_show['y_topic_labeled'] = y_labeled

In [None]:
df_feat.head()

In [None]:
df_feat['y_topic_labeled'].value_counts()

In [None]:
df_show.head()

# Machine Learning

We'll try to combine two models:

1) First, we train models on the raw question text to obtain intermediate predictions.


2) Second, we add those predictions as a new feature in our features dataframe and train ML models on it.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Over our raw questions

## Preprocessing

### TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [None]:
tvec = TfidfVectorizer(stop_words='english')

In [None]:
tf = tvec.fit_transform(X_train)
tf

### CountVectorizer

In [None]:
cvec = CountVectorizer(stop_words='english')

### Truncated SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
svd = TruncatedSVD(n_components=100, random_state=42)

### Preprocessing pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
preprocessing_pipeline = Pipeline([('tvec', tvec), ('svd', svd)])

In [None]:
# Keep the TF-IDF + SVD output instead of throwing the expensive result away, and
# show its shape rather than dumping the whole array as cell output.
X_train_svd = preprocessing_pipeline.fit_transform(X_train)
X_train_svd.shape

## Machine learning models

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

### MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
mnb = MultinomialNB()

In [None]:
# Count vectorizer feeding a multinomial naive Bayes classifier.
mnb_steps = [('vectorizer', cvec), ('mnb', mnb)]
pipe_mnb = Pipeline(steps=mnb_steps)

In [None]:
pipe_mnb.fit(X_train, y_train)

In [None]:
y_pred_mnb = pipe_mnb.predict(X_test)
y_pred_mnb

In [None]:
cm = confusion_matrix(y_test, y_pred_mnb)
cm

In [None]:
cr = classification_report(y_test, y_pred_mnb)
print(cr)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forests are stochastic; fix the seed like the other stochastic steps in
# this notebook (random_state=42) so results are reproducible across runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
#pipe_rf = Pipeline([('vectorizer', tvec), ('rf', rf)])

In [None]:
#pipe_rf.fit(X_train, y_train)

In [None]:
#y_pred = pipe_rf.predict(X_test)
#y_pred

In [None]:
#cm = confusion_matrix(y_test, y_pred)
#cm

In [None]:
#cr = classification_report(y_test, y_pred)
#print(cr)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the vectorizer. `pipe_mnb` holds the
# very same `cvec` object, so fitting `pipe_lr` with it would refit the vectorizer
# inside the already-fitted naive Bayes pipeline.
pipe_lr = Pipeline([('vectorizer', clone(cvec)), ('lr', lr)])

In [None]:
pipe_lr.fit(X_train, y_train)

In [None]:
y_pred_lr = pipe_lr.predict(X_test)
y_pred_lr

In [None]:
cm = confusion_matrix(y_test, y_pred_lr)
cm

In [None]:
cr = classification_report(y_test, y_pred_lr)
print(cr)