First, let's import required packages. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

Then, we read data and check first couple of rows as well as its size.

In [2]:
trainData = pd.read_csv('../data/raw/train.csv')

As the dataset is relatively large with 1306122 rows and we will have some difficulty if the whole dataset is considered. Therefore, we do random stratified sampling and keep only 10% of data.

In [3]:
trainData = trainData.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=0.10))

In [4]:
trainData.head()

Unnamed: 0,qid,question_text,target
164760,2034f545f912e1307d1c,What are some useful tips for students startin...,0
393424,4d116d58705ed618de29,Would it be illegal for Trump to pardon Manafo...,0
38615,078ced652576347901f6,What is the best remedy to get rid of celiac d...,0
71401,0dff889c225f875bcba0,Is it better to use rd Sharma in class 11?,0
224206,2bd86f5edfa5f801f90c,I want to do research in astronomy. Which qual...,0


From figure above, insincere questions seem to have more words and characters when compared to sincere questions. Next we work on preprocessing questions text by removing extra characters and stopwords and changing all letters to lower case. 

In [5]:
def removeExtraChars(qtext):
    result = [re.sub(r'[^A-Za-z ]+', ' ', sentence) for sentence in qtext]
    return result

In [6]:
def removeStopWords(qtext):
    # Remove stop words
    result = []
    for sentence in qtext:
        words = sentence.split()
        result.append(' '.join([w for w in words if w not in STOPWORDS]))
    return result

In [7]:
def preprocessText(df):
    
    # Remove extra characters
    df['question_text'] = removeExtraChars(df['question_text'])
    
    # Lower case
    df['question_text'] = [sentence.lower() for sentence in df['question_text']]
    
    # Remove stop words
    df['question_text'] = removeStopWords(df['question_text'])
    
    return df

In [8]:
class preprocessTextTransformer():
    
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

In [9]:
pipeline = Pipeline([
    ("preprocess", preprocessTextTransformer(preprocessText))
])

In [10]:
_ = pipeline.fit_transform(trainData)

In [11]:
trainData.head()

Unnamed: 0,qid,question_text,target
164760,2034f545f912e1307d1c,useful tips students starting first semester m...,0
393424,4d116d58705ed618de29,illegal trump pardon manafort order shut,0
38615,078ced652576347901f6,best remedy rid celiac disease,0
71401,0dff889c225f875bcba0,better use rd sharma class,0
224206,2bd86f5edfa5f801f90c,want research astronomy qualifications must,0


After preprocessing question text, we tokenize and stem sentences and calculate document-term matrix (DTM).

In [12]:
#def tokenize(sentence):
#    tokens = [word for word in nltk.word_tokenize(sentence) if len(word)>1]
#    stemmer = PorterStemmer()
#    stems = [stemmer.stem(item) for item in tokens]
#    return stems

#def createDTM(qtext):
#    vectorizer = TfidfVectorizer(tokenizer = tokenize)
#    dtm = vectorizer.fit_transform(qtext)
#    result = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names())
#    return result

In [13]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

In [14]:
features = pipeline.fit_transform(trainData)

In [15]:
features.shape

(3, 3)

In [16]:
features

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>