## Loading dataset

Import libraries needed in this notebook

In [None]:
import pandas as pd 
import numpy as np 
%matplotlib inline
import matplotlib as mp 
import matplotlib.pyplot as plt
import seaborn as sns


Here are the first five rows of the training data

In [None]:
# Read the training split from the Kaggle input directory and preview it.
data_raw = pd.read_csv(
    filepath_or_buffer='/kaggle/input/quora-insincere-questions-classification/train.csv'
)
data_raw.head(n=5)

Dataset attributes and their data types

In [None]:
# Summarise the training frame: columns, non-null counts and dtypes.
data_raw.info()

In [None]:
# Drop every row that contains a missing value. Rebinding the name instead
# of using `inplace=True` follows the pandas-recommended style and leaves the
# originally loaded frame object untouched.
data_raw = data_raw.dropna()

Number of sincere and insincere questions in this dataset

In [None]:
# Rows per label in the `target` column.
data_raw['target'].value_counts()

Let's plot their counts

In [None]:
# Bar chart of the number of rows per label.
sns.countplot(x='target', data=data_raw)

In [None]:
# Share of rows labelled 0 (sincere), expressed as a percentage.
val_counts = data_raw['target'].value_counts()
sincere_q_pc = (val_counts.loc[0] / val_counts.sum()) * 100
print('{}% of questions are sincere while the rest are insincere'.format(sincere_q_pc))

The vast majority of questions in this dataset are sincere and only a small number are insincere 

Let us take a look at the questions in this dataset and try finding out how they have been classified as sincere and insincere.

In [None]:
# Raw question text split by label: target 1 -> insincere, target 0 -> sincere.
insincere_questions = data_raw.loc[data_raw['target'] == 1, 'question_text']
sincere_questions = data_raw.loc[data_raw['target'] == 0, 'question_text']

In [None]:
# Three reproducibly sampled insincere questions.
insincere_questions.sample(n=3, random_state=1).values

Sweet lord, do these questions look pathetic!

These are examples of some of the sincere questions

In [None]:
# Three reproducibly sampled sincere questions.
sincere_questions.sample(n=3, random_state=1).values

## EDA and Visualization

- Before we start analysing text data, it is important to filter out stop words, punctuation, and contractions, and to tokenize every document of text.

- After cleaning up and tokenizing text, we need to convert every word to its root form so analysis and querying of data can be sped up.

### Cleaning of text data and preprocessing

Importing libraries for text clean up

In [None]:
import nltk
import sys
import spacy

#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

from nltk.corpus import stopwords

import string

Function for cleaning text

In [None]:
# Module-level resources used by clean_text.

# Characters treated as punctuation.
punc = set(string.punctuation)
# NLTK's English stop-word list, as a set for fast membership tests.
stop = set(stopwords.words('english'))
# spaCy English pipeline with the parser and NER components switched off.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

def clean_text(text):
    """Normalise one question into a string of space-separated lemmas.

    Steps: lowercase, split on whitespace, strip punctuation characters
    (apostrophes are kept -- presumably so contractions can still match
    entries of the stop-word list), drop stop words and empty tokens, then
    lemmatise with spaCy.

    Relies on the module-level ``nlp``, ``stop`` and ``punc`` objects.

    Args:
        text: raw question string.

    Returns:
        The lemma of every remaining token, joined by single spaces.
    """
    kept_words = []
    for word in text.lower().split():
        stripped = "".join(ch for ch in word if ch == "'" or ch not in punc)
        # A token made only of punctuation (e.g. "?") collapses to "".
        # Keeping it would put consecutive spaces into the sentence handed
        # to spaCy, so empty tokens are dropped along with stop words.
        if stripped and stripped not in stop:
            kept_words.append(stripped)

    doc = nlp(" ".join(kept_words))
    return " ".join(token.lemma_ for token in doc)

Let us take a sample question and run this function on it to see if it works

In [None]:
# Pick one question reproducibly to try the cleaner on.
question = data_raw['question_text'].sample(n=1, random_state=1).values[0]
question

In [None]:
# Run the cleaner on the sampled question to inspect its output.
clean_text(question)

Nice. Our text clean up function works

Now we will clean every row of text data by running this function

In [None]:
# Clean every question; the cast to str guards against non-string cells.
data_raw['clean_text'] = data_raw['question_text'].astype('str').map(clean_text)

### Visualizations

Build word clouds of sincere and insincere questions to find the most frequently occurring words in each.

Import the wordcloud library

In [None]:
from wordcloud import WordCloud, ImageColorGenerator

Function for splitting text into a dictionary of uniquely occurring words and their frequencies

In [None]:
def word_freq_dict(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Args:
        text: string of words separated by whitespace.

    Returns:
        dict mapping each distinct word to its number of occurrences, with
        keys in order of first appearance. Empty input yields an empty dict.
    """
    # Imported locally so this cell does not rely on the notebook's import cell.
    from collections import Counter

    # Counter tallies in a single pass; calling list.count() once per word
    # is quadratic in the number of words.
    return dict(Counter(text.split()))

Function for plotting a word cloud from a word frequency dictionary

In [None]:
def word_cloud_from_frequency(word_freq_dict, title, figure_size=(10,6), cloud=None):
    """Draw a word cloud from a word -> frequency mapping and show it.

    Args:
        word_freq_dict: mapping of word to occurrence count.
        title: title placed above the figure.
        figure_size: (width, height) passed to ``plt.figure``.
        cloud: object providing ``generate_from_frequencies`` to draw with.
            When omitted, the module-level ``wordcloud`` variable is used,
            so that variable must have been assigned before the call.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    if cloud is None:
        # Backward-compatible default: fall back to the notebook global.
        cloud = wordcloud
    cloud.generate_from_frequencies(word_freq_dict)
    plt.figure(figsize=figure_size)
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(title)
    plt.show()

Wordcloud of a random sample of 1000 insincere questions

In [None]:
# join every question as a single sample of text
# Concatenate 1000 reproducibly sampled cleaned insincere questions into one text.
insincere_questions = data_raw.loc[data_raw['target'] == 1, 'clean_text']
insincere_sample = " ".join(insincere_questions.sample(n=1000, random_state=1).values)
insincere_word_freq = word_freq_dict(insincere_sample)

# Canvas for the cloud; word_cloud_from_frequency reads this global.
wordcloud = WordCloud(
    width=5000,
    height=3000,
    max_words=200,
    colormap='Reds',
    background_color='white',
)

word_cloud_from_frequency(
    insincere_word_freq,
    "Most Frequent Words in a sample of 1000 questions flagged insincere",
)

Wordcloud of a random sample of sincere questions

In [None]:
# Concatenate 1000 reproducibly sampled cleaned sincere questions into one text.
sincere_questions = data_raw.loc[data_raw['target'] == 0, 'clean_text']
sincere_sample = " ".join(
    sincere_questions.sample(n=1000, random_state=1).astype('str').values
)
sincere_word_freq = word_freq_dict(sincere_sample)

# Canvas for the cloud; word_cloud_from_frequency reads this global.
wordcloud = WordCloud(
    width=5000,
    height=3000,
    max_words=200,
    colormap='Greens',
    background_color='white',
)

word_cloud_from_frequency(
    sincere_word_freq,
    "Most Frequent Words in a sample of 1000 questions flagged sincere",
)

## Text to vector matrix conversion 

We need to transform text into a matrix of vectors

### Using Bag of words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with its default settings.
bow_converter = CountVectorizer()

In [None]:
# One reproducibly sampled cleaned question, kept as a length-1 array.
sample_question_text = data_raw['clean_text'].sample(n=1, random_state=1).values
sample_question_text

In [None]:
# Learn the vocabulary from the single sampled question and encode it as
# word counts; toarray() densifies the result for display.
sample_count_vectorized_data = bow_converter.fit_transform(sample_question_text)
sample_count_vectorized_data.toarray()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
# tolist() keeps the result a plain list of str, as the old method returned.
if hasattr(bow_converter, 'get_feature_names_out'):
    count_vectorized_data_feature_names = bow_converter.get_feature_names_out().tolist()
else:
    count_vectorized_data_feature_names = bow_converter.get_feature_names()
count_vectorized_data_feature_names

In [None]:
# Size of the vocabulary learned by the bag-of-words vectorizer.
len(count_vectorized_data_feature_names)

### Using Tfidf(term-frequency-inverse-document-frequency) model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer restricted to single words (ngram_range=(1, 1)).
tfidf_converter = TfidfVectorizer(ngram_range=(1,1))

In [None]:
# Fit the TF-IDF vectorizer on the same sampled question and show the
# resulting weights as a dense array.
sample_tfidf_vectorized_data = tfidf_converter.fit_transform(sample_question_text)
sample_tfidf_vectorized_data.toarray()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
# tolist() keeps the result a plain list of str, as the old method returned.
if hasattr(tfidf_converter, 'get_feature_names_out'):
    tfidf_word_feature_names = tfidf_converter.get_feature_names_out().tolist()
else:
    tfidf_word_feature_names = tfidf_converter.get_feature_names()

In [None]:
# Display the vocabulary learned by the TF-IDF vectorizer.
tfidf_word_feature_names

In [None]:
# Size of the vocabulary learned by the TF-IDF vectorizer.
len(tfidf_word_feature_names)

## Building a logistic regression model

### Pipeline with logistic regression and count vectorizer

Define the count vectorizer and logistic regression models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Bag-of-words features feeding a logistic-regression classifier.
count_vectorizer = CountVectorizer()
model = LogisticRegression(random_state=0, C=1)

vectorize_logit_pipeline = Pipeline(steps=[
    ('count_vectorizer', count_vectorizer),
    ('logit', model),
])

- define input and target variables
- split training dataset into train and test sets
- train the model using the feature and target training sets

In [None]:
# input variable
# Model input: the cleaned question text.
X = data_raw.loc[:, 'clean_text']
# Model target: the 0/1 label column.
y = data_raw.loc[:, 'target']

In [None]:
# Hold out 30% of the rows for evaluation. The shuffle seed is fixed so the
# split, and therefore every score computed from it, is the same on each run.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=0
)
vectorize_logit_pipeline.fit(train_X, train_y)

Get the predictions from the model

In [None]:
# Predict labels for the held-out questions with the count-vectorizer pipeline.
predictions = vectorize_logit_pipeline.predict(test_X)

Check the accuracy score

In [None]:
# Fraction of held-out questions labelled correctly.
accuracy_score(y_true=test_y, y_pred=predictions)

Check the f1 score and plot the confusion matrix

In [None]:
from sklearn.metrics import f1_score

# F1 score on the held-out set for the count-vectorizer pipeline.
f1_score(y_true=test_y, y_pred=predictions)

In [None]:
confusion_matrix_logit_cv = confusion_matrix(test_y, predictions)
# fmt='d' prints each cell count as a plain integer; seaborn's default
# annotation format abbreviates large counts to two significant digits.
# scikit-learn puts true labels on rows and predicted labels on columns,
# so the axes are labelled accordingly.
ax = sns.heatmap(
    confusion_matrix_logit_cv,
    annot=True,
    fmt='d',
    xticklabels=['sincere', 'insincere'],
    yticklabels=['sincere', 'insincere'],
)
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1. The classes are named the same way as
# in the report printed for the TF-IDF pipeline, so the two are comparable.
print(classification_report(test_y, predictions, target_names=['sincere', 'insincere']))

### Define a pipeline with logistic regression and tfidf bi-grams vectorizer

In [None]:
# Imported locally: clone is only needed by this cell.
from sklearn.base import clone

# TF-IDF over unigrams and bigrams feeding a logistic-regression classifier.
tfidf_ngrams_converter = TfidfVectorizer(ngram_range=(1,2))
tfidf_ngrams_logit_pipeline = Pipeline([
    ('tfidf_vectorizer', tfidf_ngrams_converter),
    # Use an unfitted copy of `model`. Passing the very same estimator object
    # that vectorize_logit_pipeline holds would make both pipelines share one
    # classifier, so fitting this pipeline would overwrite the coefficients
    # the count-vectorizer pipeline depends on.
    ('logit', clone(model))
])

In [None]:
# Fit the TF-IDF pipeline on the same training split as the first pipeline.
tfidf_ngrams_logit_pipeline.fit(train_X, train_y)

In [None]:
# Predict labels for the held-out questions with the TF-IDF pipeline.
new_predictions = tfidf_ngrams_logit_pipeline.predict(test_X)

In [None]:
# Fraction of held-out questions the TF-IDF pipeline labelled correctly.
accuracy_score(y_true=test_y, y_pred=new_predictions)

In [None]:
# F1 score on the held-out set for the TF-IDF pipeline.
f1_score(y_true=test_y, y_pred=new_predictions)

In [None]:
confusion_matrix_logit_tfidf = confusion_matrix(test_y, new_predictions)
# fmt='d' prints each cell count as a plain integer; seaborn's default
# annotation format abbreviates large counts to two significant digits.
# scikit-learn puts true labels on rows and predicted labels on columns,
# so the axes are labelled accordingly.
ax = sns.heatmap(
    confusion_matrix_logit_tfidf,
    annot=True,
    fmt='d',
    xticklabels=['sincere', 'insincere'],
    yticklabels=['sincere', 'insincere'],
)
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Per-class precision / recall / F1 for the TF-IDF pipeline.
tfidf_report = classification_report(
    y_true=test_y,
    y_pred=new_predictions,
    target_names=['sincere', 'insincere'],
)
print(tfidf_report)

We can observe that when bigram word vector features are included, our logit model gives the best accuracy and F1 scores.

In [None]:
# Read the competition's test split from the Kaggle input directory and preview it.
test_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/quora-insincere-questions-classification/test.csv'
)
test_data.head(n=5)

In [None]:
# Summarise the test frame: columns, non-null counts and dtypes.
test_data.info()

In [None]:
# Apply the same cleaning to the test questions as was applied to the training data.
test_data['clean_text'] = test_data['question_text'].astype('str').map(clean_text)

In [None]:
# Preview the test frame, which now includes the clean_text column.
test_data.head()

In [None]:
# Model input for the final predictions: cleaned test questions.
X_final = test_data['clean_text']

In [None]:
# Predict labels for the test questions with the TF-IDF bigram pipeline.
y_final = tfidf_ngrams_logit_pipeline.predict(X_final)

In [None]:
# Peek at the first five predictions.
y_final[:5]

In [None]:
# Attach the predictions to the test frame as a 'target' column.
test_data['target'] = y_final

In [None]:
# Keep only the id and the predicted label. The explicit copy makes
# result_df independent of test_data, so later modifications of result_df
# are not treated as writes to a slice of test_data.
result_df = test_data[['qid', 'target']].copy()

In [None]:
# Shape the frame for submission: a 'prediction' column indexed by 'qid'.
# Chaining returns a new frame instead of mutating in place a frame that was
# sliced out of test_data, which can raise SettingWithCopyWarning.
result_df = (
    result_df
    .rename(columns={'target': 'prediction'})
    .set_index('qid')
)
result_df.head()

In [None]:
# Write the submission file to the working directory, then print its first
# lines with the shell's `head` to check the output format.
result_df.to_csv('submission.csv')
!head submission.csv