# Importing the data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import scipy.stats

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Allow up to 1000 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 1000

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/train.csv")
# Preview the first rows.
df.head()

In [None]:
X = df['question_text']
y = df['target']
X.shape, y.shape

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Quick EDA

In [None]:
df.info()

No NaN values.

In [None]:
# Count the rows whose question text is exactly the empty string.
# Summing the boolean mask gives a count; summing the selected strings
# themselves would concatenate them and never yield a number of rows.
(df['question_text'] == "").sum()

No empty strings.

In [None]:
df['question_text'].shape, df['target'].shape

In [None]:
df['target'].unique()

# Distribution of sincere/insincere questions

In [None]:
# Bar chart of how many questions carry each target value.
# The column is passed by keyword because the meaning of countplot's first
# positional argument is not the same across seaborn versions.
sns.countplot(x=df['target'])
# NOTE(review): the axis shows the ground-truth target, not model predictions;
# consider renaming this label.
plt.xlabel('Predictions');

- 0 -> sincere
- 1 -> insincere

In [None]:
# Class balance: count the questions of each target value, then express each
# count as a percentage of all questions.
total_questions = len(df['question_text'])
sincere_len = len(df['question_text'][df['target'] == 0])
unsincere_len = len(df['question_text'][df['target'] == 1])

purcent_of_sincere = sincere_len / total_questions * 100
purcent_of_unsincere = unsincere_len / total_questions * 100

print("Purcent of sincere: {:.2f}%, {} questions".format(purcent_of_sincere, sincere_len))
print("Purcent of unsincere: {:.2f}%, {} questions".format(purcent_of_unsincere, unsincere_len))

# Difference in question length distribution

In [None]:
# Character length of every sincere question (target == 0).
# A boolean mask selects all matching rows. Looping over
# range(<number of sincere rows>) and using those values as row indices would
# never visit sincere questions located beyond that position in the frame.
sincere_lst_len = df.loc[df['target'] == 0, 'question_text'].str.len().tolist()
sincere_len_mean = np.array(sincere_lst_len).mean()
print("Mean of sincere questions: {:.0f} characters".format(sincere_len_mean))

In [None]:
# Character length of every insincere question (target == 1).
# A boolean mask selects all matching rows. Looping over
# range(<number of insincere rows>) and using those values as row indices would
# only inspect the first rows of the frame and miss every insincere question
# stored after them.
unsincere_lst_len = df.loc[df['target'] == 1, 'question_text'].str.len().tolist()
unsincere_len_mean = np.array(unsincere_lst_len).mean()
print("Mean of unsincere questions: {:.0f} characters".format(unsincere_len_mean))

In [None]:
# Overlay the character-length distributions of both classes on one figure.
question_lengths = df['question_text'].str.len()
s1 = question_lengths[df['target'] == 0]
s2 = question_lengths[df['target'] == 1]
sns.distplot(s1, label='sincere')
sns.distplot(s2, label='unsincere')
plt.title('Lenght Distribution')
plt.legend();

## First word of insincere questions

In [None]:
# First whitespace-delimited word of every insincere question (target == 1).
first_word_unsincere = []
for sentence in df[df['target'] == 1]['question_text']:
    words = sentence.split()
    # A question with no words at all (empty or whitespace-only) is skipped
    # instead of raising IndexError.
    if words:
        first_word_unsincere.append(words[0])

In [None]:
from collections import Counter
# Tally the first words of the insincere questions and list the ten most frequent.
counter_unsincere = Counter(first_word_unsincere)
counter_unsincere.most_common(10)

**No conclusion here**  
**Too many different words**

## First word of sincere questions

In [None]:
# First whitespace-delimited word of every sincere question (target == 0).
first_word_sincere = []
for sentence in df[df['target'] == 0]['question_text']:
    words = sentence.split()
    # A question with no words at all (empty or whitespace-only) is skipped
    # instead of raising IndexError.
    if words:
        first_word_sincere.append(words[0])

In [None]:
from collections import Counter
# Tally the first words of the sincere questions and list the ten most frequent.
counter_sincere = Counter(first_word_sincere)
counter_sincere.most_common(10)

**No conclusion here**  
**Too many different words**

# Preprocessing

### Word Tokenize on lower docs

In [None]:
# Lower-case every training question and split it into word tokens.
tokenized_docs = [word_tokenize(doc.lower()) for doc in X_train]
# Inspect the tokens of the first document.
tokenized_docs[0]

### Alpha Tokenize

In [None]:
# Keep only purely alphabetic tokens; punctuation and tokens containing
# digits are dropped. str.isalpha() already returns a bool, so no comparison
# against True is needed.
alpha_tokens = [[t for t in doc if t.isalpha()] for doc in tokenized_docs]
alpha_tokens[0]

### Stop_words

In [None]:
# Stored as a set so the per-token `in stop_words` checks used during
# filtering are constant-time instead of scanning a list.
stop_words = set(stopwords.words('english'))

In [None]:
no_stop_tokens = [[t for t in doc if t not in stop_words] for doc in alpha_tokens]
no_stop_tokens[0]

### Stemmer

In [None]:
stemmer = PorterStemmer()

In [None]:
stemmed_tokens = [[stemmer.stem(t) for t in doc] for doc in no_stop_tokens]
stemmed_tokens[0]

# Count of stemmed tokens: insincere vs. sincere

In [None]:
# Pair each training question with its stemmed token list while keeping the
# original row labels as the index.
X_temp = (
    X_train.reset_index()
    .assign(temp=stemmed_tokens)
    .set_index('index')
)
X_temp.head()

In [None]:
X_temp = pd.concat([X_temp, y_train], axis=1, sort=False)
X_temp.head()

In [None]:
np_X_temp_index = np.array(X_temp.index)

In [None]:
# Number of stemmed tokens per question, in the frame's row order.
# Iterating the column directly avoids one label-based lookup per row.
lst = [len(tokens) for tokens in X_temp['temp']]

In [None]:
X_temp['count'] = lst
X_temp.head()

In [None]:
mean_count_sincere = X_temp['count'][X_temp['target'] == 0].mean()

In [None]:
print("Mean of preprocessed sincere words: {:.0f}".format(mean_count_sincere))

In [None]:
mean_count_unsincere = X_temp['count'][X_temp['target'] == 1].mean()

In [None]:
print("Mean of preprocessed unsincere words: {:.0f}".format(mean_count_unsincere))

# Latent semantic analysis

In [None]:
# Re-join each stemmed token list into one space-separated string, the input
# format expected by the text vectorizers.
X_train_clean = [" ".join(x_t) for x_t in stemmed_tokens]
# Preview only the first few documents; echoing the whole list would write
# every training question into the cell output.
X_train_clean[:5]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
from sklearn.pipeline import Pipeline
# Latent semantic analysis: TF-IDF weighting followed by truncated SVD.
# NOTE(review): the documents fed to this pipeline were already stop-word
# filtered and stemmed, so stop_words='english' acts on stemmed tokens here;
# confirm this second filter is intended.
vectorizer = TfidfVectorizer(stop_words='english')
# The number of components is left at the library default; the seed makes the
# decomposition reproducible.
svd = TruncatedSVD(random_state=42)
preprocessing_pipe = Pipeline([('vectorizer', vectorizer), ('svd', svd)])

In [None]:
lsa_train = preprocessing_pipe.fit_transform(X_train_clean)
lsa_train.shape

In [None]:
# Scatter the first two LSA components of the first 10000 training documents,
# coloured by class. The labels are passed as a plain array so they pair with
# the LSA rows by position: a Series slice keeps the shuffled original index,
# which seaborn may align against the arrays' default positions and mismatch.
sns.scatterplot(x=lsa_train[:10000, 0], y=lsa_train[:10000, 1], hue=y_train.values[:10000]);

In [None]:
# Table of SVD component weights: one row per component, one column per
# vocabulary term of the fitted TF-IDF vectorizer.
fitted_vectorizer = preprocessing_pipe.named_steps['vectorizer']
# get_feature_names() is gone from recent scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(fitted_vectorizer, 'get_feature_names_out'):
    feature_names = fitted_vectorizer.get_feature_names_out()
else:
    feature_names = fitted_vectorizer.get_feature_names()
# Derive the row labels from the actual number of components rather than
# hard-coding two of them.
component_labels = ['component_{}'.format(i) for i in range(svd.components_.shape[0])]
components = pd.DataFrame(data=svd.components_, columns=feature_names, index=component_labels)
components

In [None]:
# One horizontal bar chart per LSA component showing its ten highest-weighted
# terms, largest at the top.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
for position, axis in enumerate(axes.flat):
    top_terms = components.iloc[position].sort_values(ascending=False).head(10)
    top_terms.sort_values().plot.barh(ax=axis)

# Machine Learning

## Countvectorizer

In [None]:
def cleaning(df):
    """Normalise raw question texts into space-joined stemmed tokens.

    Each document is lower-cased, tokenised with ``word_tokenize``, reduced to
    purely alphabetic tokens, stripped of the entries of the notebook-level
    ``stop_words`` collection, stemmed with the notebook-level ``stemmer`` and
    re-joined with single spaces.

    Parameters
    ----------
    df : iterable of str
        Raw documents. The parameter name is kept for backward compatibility;
        any iterable of strings works, not only a DataFrame column.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.
    """
    # Build the lookup once so each membership test is constant-time whether
    # ``stop_words`` is a list or a set.
    stop_set = set(stop_words)
    cleaned = []
    for doc in df:
        # Single pass per document: filter first, then stem the survivors,
        # without materialising an intermediate list for every stage.
        stems = [
            stemmer.stem(tok)
            for tok in word_tokenize(doc.lower())
            if tok.isalpha() and tok not in stop_set
        ]
        cleaned.append(" ".join(stems))
    return cleaned

In [None]:
# Apply the same cleaning steps to the held-out questions.
X_test_clean = cleaning(X_test)
# Preview only the first few documents; echoing the whole list would write
# every test question into the cell output.
X_test_clean[:5]

## CountVectorizer-Unigrams

In [None]:
# Unigram bag-of-words vectorizer. It is left unfitted here: fitting the
# pipeline it is placed in fits the vectorizer on the training documents, so a
# separate fit would only repeat that work.
cvec_unigram = CountVectorizer(stop_words='english')

In [None]:
mb = MultinomialNB()

In [None]:
pipe = make_pipeline(cvec_unigram, mb)

In [None]:
pipe.fit(X_train_clean, y_train)

In [None]:
pipe.score(X_train_clean, y_train)

In [None]:
pipe.score(X_test_clean, y_test)

In [None]:
y_pred = pipe.predict(X_test_clean)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
scores = cross_val_score(pipe, X_train_clean, y_train, cv=5, scoring='f1')

In [None]:
scores

In [None]:
print("mean: {}".format(scores.mean()))
print("std: {}".format(scores.std()))

## CountVectorizer-bigrams

In [None]:
# Bigram-only bag-of-words vectorizer. It is left unfitted here: fitting the
# pipeline it is placed in fits the vectorizer on the training documents, so a
# separate fit would only repeat that work.
cvec_bigram = CountVectorizer(stop_words='english', ngram_range=(2, 2))

In [None]:
mb = MultinomialNB()

In [None]:
pipe_bi = make_pipeline(cvec_bigram, mb)

In [None]:
pipe_bi.fit(X_train_clean, y_train)

In [None]:
pipe_bi.score(X_train_clean, y_train)

In [None]:
pipe_bi.score(X_test_clean, y_test)

In [None]:
y_pred_bi = pipe_bi.predict(X_test_clean)

In [None]:
confusion_matrix(y_test, y_pred_bi)

In [None]:
print(classification_report(y_test, y_pred_bi))

In [None]:
scores_bi = cross_val_score(pipe_bi, X_train_clean, y_train, cv=5, scoring='f1')

In [None]:
scores_bi

In [None]:
print("mean: {}".format(scores_bi.mean()))
print("std: {}".format(scores_bi.std()))

## CountVectorizer-trigrams

In [None]:
# Trigram-only bag-of-words vectorizer. It is left unfitted here: fitting the
# pipeline it is placed in fits the vectorizer on the training documents, so a
# separate fit would only repeat that work.
cvec_trigram = CountVectorizer(stop_words='english', ngram_range=(3, 3))

In [None]:
mb = MultinomialNB()

In [None]:
pipe_tri = make_pipeline(cvec_trigram, mb)

In [None]:
pipe_tri.fit(X_train_clean, y_train)

In [None]:
pipe_tri.score(X_train_clean, y_train)

In [None]:
pipe_tri.score(X_test_clean, y_test)

In [None]:
y_pred_tri = pipe_tri.predict(X_test_clean)

In [None]:
confusion_matrix(y_test, y_pred_tri)

In [None]:
print(classification_report(y_test, y_pred_tri))

In [None]:
scores_tri = cross_val_score(pipe_tri, X_train_clean, y_train, cv=5, scoring='f1')

In [None]:
scores_tri

In [None]:
print("mean: {}".format(scores_tri.mean()))
print("std: {}".format(scores_tri.std()))

**Best score is obtained with unigrams, `ngram_range=(1, 1)`: F1 score = 0.54**