First, let's import required packages. 

In [51]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

Then, we read data and check first couple of rows as well as its size.

In [39]:
trainData = pd.read_csv('../data/raw/train.csv')

As the dataset is relatively large with 1306122 rows and we will have some difficulty if the whole dataset is considered. Therefore, we do random stratified sampling and keep only 10% of data.

In [40]:
trainData = trainData.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=0.10))

In [41]:
trainData.head()

Unnamed: 0,qid,question_text,target
32534,065e89e416fa7417ce52,Where can I find research papers on linear reg...,0
651377,7f964b3baac64cb51f0d,What would it have to take to start a war betw...,0
1263507,f79c250be3a9332c638e,What is the least common Myers-Briggs personal...,0
945005,b92d68c708b930e732ed,What property does helium have that made it to...,0
1017393,c75d1d9b64ebbaa5bad2,What is your percentage in CBSE class 10 2018?,0


From figure above, insincere questions seem to have more words and characters when compared to sincere questions. Next we work on preprocessing questions text by removing extra characters and stopwords and changing all letters to lower case. 

In [42]:
def removeExtraChars(qtext):
    result = [re.sub(r'[^A-Za-z ]+', ' ', sentence) for sentence in qtext]
    return result

In [43]:
def removeStopWords(qtext):
    # Remove stop words
    result = []
    for sentence in qtext:
        words = sentence.split()
        result.append(' '.join([w for w in words if w not in STOPWORDS]))
    return result

In [44]:
def preprocessText(df):
    
    # Remove extra characters
    df['question_text'] = removeExtraChars(df['question_text'])
    
    # Lower case
    df['question_text'] = [sentence.lower() for sentence in df['question_text']]
    
    # Remove stop words
    df['question_text'] = removeStopWords(df['question_text'])
    
    return df

In [45]:
class preprocessTextTransformer():
    
    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        return self

In [46]:
pipeline = Pipeline([
    ("preprocess", preprocessTextTransformer(preprocessText))
])

In [47]:
_ = pipeline.fit_transform(trainData)

In [48]:
trainData.head()

Unnamed: 0,qid,question_text,target
32534,065e89e416fa7417ce52,find research papers linear regression,0
651377,7f964b3baac64cb51f0d,take start war us china,0
1263507,f79c250be3a9332c638e,least common myers briggs personality type,0
945005,b92d68c708b930e732ed,property helium made used filling balloons,0
1017393,c75d1d9b64ebbaa5bad2,percentage cbse class,0


After preprocessing question text, we tokenize and stem sentences and calculate document-term matrix (DTM).

In [49]:
#def tokenize(sentence):
#    tokens = [word for word in nltk.word_tokenize(sentence) if len(word)>1]
#    stemmer = PorterStemmer()
#    stems = [stemmer.stem(item) for item in tokens]
#    return stems

#def createDTM(qtext):
#    vectorizer = TfidfVectorizer(tokenizer = tokenize)
#    dtm = vectorizer.fit_transform(qtext)
#    result = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names())
#    return result

In [52]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

In [54]:
features = pipeline.fit_transform(trainData)

In [55]:
features.shape

(3, 3)

In [57]:
features[0][0]

<1x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>