### Data Preprocessing

We have to preprocess the text data as a next step.
For this we have to do the following:
- Convert to lower case
- Removing html tags
- Removing Punctuations
- Performing stemming
- Removing Stopwords
- Expanding contractions etc.

In [54]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os
import seaborn as sns
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.io as pio
from plotly.offline import plot , iplot ,init_notebook_mode
from bs4 import BeautifulSoup
import string
from nltk.stem.porter import PorterStemmer

init_notebook_mode(connected=True)
%matplotlib inline

warnings.filterwarnings('ignore')

##### Lets reload the data

In [2]:
questions_data = pd.read_csv('./Data/question_feature_extracted.csv')
questions_data.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1_len,q2_len,q1_n_words,q2_n_words,words_common,total_unique_words,words_shared_bw_qs,freq_q1+q2,freq_q1-q2
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10,23,0.434783,2,0
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,4,20,0.2,5,3
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1,1,73,59,14,10,4,24,0.166667,2,0
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,1,50,65,11,9,0,19,0.0,2,0
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3,1,76,39,13,7,2,20,0.1,4,2


##### Remove HTML Tags

In [49]:
# Remove HTML Tags
def remove_html(text):
    soup = BeautifulSoup(text,'lxml')
    html_free_text = soup.get_text()
    return html_free_text

In [50]:
# Remove Punctuations
def punctuation_remover(text):
    punctuation_free_text = "".join([char for char in text if char \
                                    not in string.punctuation])
    return punctuation_free_text

In [55]:
# Stemming
stemmer = PorterStemmer()

def word_stemmer(text):
    stemmed_text = " ".join(stemmer.stem(word) for word in text)
    return stemmed_text


In [None]:
# Stop Word Removal


In [46]:
# lets create a preprocessing function
def text_prepocressing(text):
    import re
    text = text.lower() #Lower Case
    text = remove_html(text) #Remove HTML Tags
     # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r"([0-9]+)000000", r"\1m", text) # 1 million
    text = re.sub(r"([0-9]+)000", r"\1k", text) # 1 Thousand
    
    # remove comma between numbers, i.e. 15,000 -> 15000
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    text = punctuation_remover(text)    

    return text


In [47]:
print(text_prepocressing('HI<>sdhsdf like 10,00.00, i \'ll do i++t' ))

hi sdhsdf like 10 00 00 i ll do i t
