In [1]:
import pandas as pd
import numpy as np
from spellchecker import SpellChecker
import multiprocessing as mp
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
train_data = pd.read_csv('Data/quora-insincere-questions-classification/train.csv')

# Print (rows, columns) as a quick sanity check on the load.
print(train_data.shape)

(1306122, 3)


In [3]:
# Preview the first rows to check the column layout (qid, question_text, target).
train_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


## Data Preprocessing

In [4]:
## Convert every question to lowercase (a single vectorised statement, not a function).
## NOTE: this runs before the misspelling correction, whose lookup is case-sensitive,
## so from here on only lowercase keys can ever match the text.
train_data['question_text'] = train_data['question_text'].str.lower()

In [5]:
## Function to expand contractions
def expand_contractions(phrase):
    """Expand English contractions written with a straight apostrophe.

    The rules are applied one after another in the order listed, so the two
    irregular forms are rewritten before the generic suffix rules see them.
    Note that every ``'s`` becomes `` is`` (possessives included).

    Args:
        phrase: the text to expand.

    Returns:
        The text with each matched contraction replaced.
    """
    rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

# Expand contractions in every question; the column is overwritten with the result.
train_data['question_text'] = train_data['question_text'].apply(expand_contractions)

In [6]:
## Function to remove punctuations
# Characters that are deleted outright. The backslash in front of "×" is
# spelled "\\" so the literal no longer relies on the invalid escape "\×"
# (the resulting string is unchanged: a backslash followed by "×").
_REMOVED_PUNCTUATION = (
    '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²--'
)

# One translation table for the whole clean-up. "/" and "-" become a space and
# "&" is padded with spaces; these entries are written last so they override
# the delete entries for "/" and "-" that the removal list also contains.
_PUNCTUATION_TABLE = {ord(char): None for char in _REMOVED_PUNCTUATION}
_PUNCTUATION_TABLE.update({ord('/'): ' ', ord('-'): ' ', ord('&'): ' & '})


def clean_punctuations(text):
    """Normalise punctuation in ``text``.

    * ``/`` and ``-`` are replaced by a single space.
    * ``&`` is surrounded by spaces.
    * Every other character in the removal list is deleted (nothing is
      inserted in its place, so ``a,b`` becomes ``ab``).

    The work is done in a single ``str.translate`` pass instead of one
    ``str.replace`` call per character; each character is mapped
    independently, so the result matches the sequential version.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Clean punctuation in every question; the column is overwritten with the result.
train_data["question_text"] = train_data["question_text"].apply(clean_punctuations)

In [13]:
def get_mispell(mispell_dict):
    """Build the lookup table and the matching regex for misspelling correction.

    A key matches only when it starts at a word boundary and is followed by the
    end of the word, optionally after a plural ``s`` (the ``s`` itself is left
    in place). Unanchored matching rewrote the inside of correct words, e.g.
    ``youtube`` -> ``youtubebe`` and ``programmer`` -> ``programr``. Other
    inflections (``-ed``, ``-ing``, ...) need their own entries.

    Args:
        mispell_dict: mapping of misspelled word -> replacement text.

    Returns:
        ``(mispell_dict, pattern)`` where ``pattern`` is a compiled regex whose
        match text is always a key of ``mispell_dict``.
    """
    if not mispell_dict:
        # An empty alternation would match the empty string everywhere and make
        # the lookup fail; use a pattern that can never match instead.
        return mispell_dict, re.compile(r'(?!)')

    # Longest keys first so an exact entry (e.g. "organisations") is preferred
    # over a shorter key plus plural "s"; keys are escaped so they are always
    # treated as literal text.
    alternatives = sorted(mispell_dict, key=len, reverse=True)
    mispell_re = re.compile(
        r'\b(%s)(?=s?\b)' % '|'.join(re.escape(key) for key in alternatives)
    )
    return mispell_dict, mispell_re


# Keys are all lowercase because the text has already been lowercased when this
# table is applied; mixed-case keys could never match. Duplicate entries from
# the original literal have been collapsed (their values were identical).
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'youtu': 'youtube', 'narcisist': 'narcissist', 'langague': 'language',
                'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
                'wwii': 'world war 2', 'citicise': 'criticize',
                'behaviour': 'behavior', 'demonitization': 'demonetization', 'fiancee': 'fiance', 'remainers': 'remainder', 'qouta': 'quota', 'qoura': 'quora',
                'programme': 'program', 'organisations': 'organization', 'licence': 'license',
                'whatis': 'what is', 'favour': 'favor', 'learnt': 'learn', 'defence': 'defense', 'recognise': 'recognize',
                'recognised': 'recognize', 'practise': 'practice', 'neighbour': 'neighbor', 'programr': 'programmer', 'realise': 'realize', 'didnt': 'did not',
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'nearbuy': 'nearby',
                'demonetisation': 'demonetization', 'narcissit': 'narcissist', 'bigdata': 'big data', 'sallary': 'salary', 'statergy': 'strategy',
                'analyse': 'analyze', 'intenship': 'internship', 'simpliv': 'simple', 'trignometric': 'trigonometric', 'econimics': 'economics'}
mispellings, mispellings_re = get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Replace every known misspelling in ``text`` with its correction.

    Args:
        text: the (already lowercased) string to correct.

    Returns:
        The string with each whole-word match of a ``mispellings`` key replaced
        by the corresponding value.
    """
    def replace(match):
        # The matched text is always one of the dictionary keys.
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

# Correct known misspellings in every question. The function is handed to
# apply() directly, which calls it once per value.
train_data["question_text"] = train_data["question_text"].apply(replace_typical_misspell)

In [14]:
## Function to remove stopwords
STOPWORDS = stopwords.words('english')
# Set copy of the list above, used for membership tests: a lookup per token is
# O(1) instead of a scan of the whole list. STOPWORDS itself is left unchanged.
_STOPWORD_SET = frozenset(STOPWORDS)


def clean_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split on single spaces and re-joined with single spaces, so
    empty tokens produced by consecutive spaces are preserved as they were.

    Args:
        text: the string to filter.

    Returns:
        The string without any token that appears in ``STOPWORDS``.
    """
    return ' '.join(token for token in text.split(' ') if token not in _STOPWORD_SET)


# Remove stopwords from every question; the column is overwritten with the result.
train_data["question_text"] = train_data["question_text"].apply(clean_stopwords)

In [15]:
## Function to remove numerals
def clean_numbers(text):
    """Remove numerals, together with an attached ordinal suffix, from ``text``.

    Ordinals are removed as a unit first: one or more digits followed by
    ``st``, ``nd``, ``rd`` or ``th`` at the end of a word (``1st``, ``22nd``,
    ``5th``). Previously only ``th`` after at least two digits was handled, so
    ``5th`` left a stray ``th`` behind, and a word glued to digits lost its
    leading ``th`` (``12the`` -> ``e``). Any remaining digits are then deleted.
    Surrounding whitespace is not collapsed.

    Args:
        text: the string to clean.

    Returns:
        The string with ordinals and digits removed.
    """
    # The trailing \b keeps the suffix from matching the start of a longer
    # word that merely begins with those letters.
    text = re.sub(r'[0-9]+(?:st|nd|rd|th)\b', '', text)
    text = re.sub(r'[0-9]+', '', text)
    return text

# Strip numerals from every question; the column is overwritten with the result.
train_data["question_text"] = train_data["question_text"].apply(clean_numbers)

In [3]:
## Function to lemmatize words
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Lemmatize every space-separated token of ``text``.

    Each token is passed to ``lemmatizer.lemmatize`` with no extra arguments,
    and the results are joined back together with single spaces.

    Args:
        text: the string to lemmatize.

    Returns:
        The string rebuilt from the lemmatized tokens.
    """
    lemmas = []
    for word in text.split(' '):
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

#train_data['question_text'] = train_data['question_text'].apply(lemmatize)    

In [18]:
# Write the cleaned frame to disk; index=False leaves the row index out of the CSV.
train_data.to_csv('cleaned_quora_data.csv', index=False)

In [6]:
# Spot-check: called with the word alone, a past-tense verb comes back unchanged.
# NOTE(review): presumably because the lemmatizer treats words as nouns unless a
# part-of-speech argument is given — confirm against the NLTK documentation.
lemmatizer.lemmatize("approached")

'approached'