In [2]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
Text cleaning using NLP
'''

__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


# NLP Basics: Reading in text data & why do we need to clean the text?

### Read in semi-structured text data

In [1]:
# Read in the raw text
rawData = open("../../data/SMSSpamCollection.tsv").read()

# Print the raw data
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [2]:
parsedData = rawData.replace('\t', '\n').split('\n')

In [3]:
parsedData[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [4]:
labelList = parsedData[0::2]
textList=parsedData[1::2]

In [5]:
labelList[0:5]

['ham', 'spam', 'ham', 'ham', 'ham']

In [6]:
textList[0:5]

["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 "Nah I don't think he goes to usf, he lives around here though",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 'I HAVE A DATE ON SUNDAY WITH WILL!!']

In [7]:
print(len(labelList))
print(len(textList))

5571
5570


In [8]:
print(labelList[-5:])

['ham', 'ham', 'ham', 'ham', '']


In [9]:
import pandas as pd
fullCorpus =pd.DataFrame({
    'label':labelList[:-1],
    'body_list': textList
})
fullCorpus.head()

Unnamed: 0,body_list,label
0,I've been searching for the right words to tha...,ham
1,Free entry in 2 a wkly comp to win FA Cup fina...,spam
2,"Nah I don't think he goes to usf, he lives aro...",ham
3,Even my brother is not like to speak with me. ...,ham
4,I HAVE A DATE ON SUNDAY WITH WILL!!,ham


In [11]:
fullCorpus = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header=None)
fullCorpus.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [21]:
fullCorpus.columns=['label', 'body_text']


Explore the dataset

In [22]:
print('Input data has {} rows and {} columns'.format(len(fullCorpus), len(fullCorpus.columns)))

Input data has 5568 rows and 2 columns


In [23]:
print('out of {}  rows, {} are spam, {} are ham'.format(len(fullCorpus),
                                                       len(fullCorpus[fullCorpus['label']=='spam']),
                                                       len(fullCorpus[fullCorpus['label']=='ham'])))

out of 5568  rows, 746 are spam, 4822 are ham


In [24]:
print('no of null in label:{}'.format(fullCorpus['label'].isnull().sum()))
print('no of null in text:{}'.format(fullCorpus['body_text'].isnull().sum()))

no of null in label:0
no of null in text:0


Regex

In [25]:
import re

In [26]:
re_test='This is a made up string to test 2 different regex methods'
re_test_messy='This     is a made up       string to test 2     diff regex methods'
re_test_messy1 ='This-is-a-made/up.string*to>>>>test-----2*****diff-regex-method'

In [27]:
re.split('\s', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [28]:
re.split('\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 '',
 'diff',
 'regex',
 'methods']

In [29]:
re.split('\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'diff',
 'regex',
 'methods']

In [30]:
re.split('\W+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'diff',
 'regex',
 'method']

In [36]:
re.findall('\S+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'diff',
 'regex',
 'methods']

In [34]:
re.findall('\S+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test-----2*****diff-regex-method']

# Replace a special string in sent

In [37]:
pep8_test ='I try to follow PEP8 guidelines'
pep7_test ='I try to follow PEP7 guidelines'
peep8_test='I try to follow PEEP8 guidelines'

Replace PEP8 with PEP8 python & also typo mistake

In [41]:
re.findall('[a-z]+', pep8_test)

['try', 'to', 'follow', 'guidelines']

so findall is casesensitive

In [42]:
re.findall('[A-Z]+', pep8_test)

['I', 'PEP']

In [43]:
re.findall('[A-Z]+[0-9]+', pep8_test)

['PEP8']

In [44]:
re.findall('[A-Z]+[0-9]+', pep7_test)

['PEP7']

In [45]:
re.findall('[A-Z]+[0-9]+', peep8_test)

['PEEP8']

In [46]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', pep8_test)

'I try to follow PEP8 Python Styleguide guidelines'

In [47]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', pep7_test)

'I try to follow PEP8 Python Styleguide guidelines'

In [48]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', peep8_test)

'I try to follow PEP8 Python Styleguide guidelines'