# NLP Basics: Reading in text data & why do we need to clean the text?

### Read in semi-structured text data

In [2]:
# Read in the raw text
rawData = open("SMSSpamCollection.tsv").read()

# Print the raw data
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [3]:
# change from tab to new line so we can split this text easily.
parsedData = rawData.replace("\t","\n").split("\n")

In [4]:
parsedData[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [5]:
# create a new list to label the data.
# as we notice from our text, first index is ham then followed by second index is text then third index spam then followed by fourth index is text
# there is pattern here where [0] index is ham > [1] is text > [2] is spam and so on
labelList = parsedData[0::2]
textList = parsedData[1::2]

In [6]:
print(labelList[0:5])
print(textList[0:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [10]:
import pandas as pd

# store the labels and text into dataframe and we can store as dictionary
fullCorpus = pd.DataFrame({
    'label': labelList[:-1], # edit here instead taking all change to not taking the last one.
    'body_list': textList
})

In [11]:
print(len(labelList))
print(len(textList))

5571
5570


In [12]:
print(labelList[-5:]) # apparently there is one extra entry in the labelist

['ham', 'ham', 'ham', 'ham', '']


In [13]:
# Now we have the clean version of the text
fullCorpus

Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
...,...,...
5565,spam,This is the 2nd time we have tried 2 contact u...
5566,ham,Will ü b going to esplanade fr home?
5567,ham,"Pity, * was in mood for that. So...any other s..."
5568,ham,The guy did some bitching but I acted like i'd...


In [16]:
# we can read the data with read_csv since we know it has separator of \t in the text.
dataset = pd.read_csv("SMSSpamCollection.tsv", sep='\t', header=None)
dataset.columns = ["label","body_text"]
dataset

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
...,...,...
5563,spam,This is the 2nd time we have tried 2 contact u...
5564,ham,Will ü b going to esplanade fr home?
5565,ham,"Pity, * was in mood for that. So...any other s..."
5566,ham,The guy did some bitching but I acted like i'd...


### Explore the dataset

In [18]:
# What is the shape of the dataset?
print("Input data has {} rows and  {} columns".format(len(dataset),len(dataset.columns)))

Input data has 5568 rows and  2 columns


In [20]:
# How many spam/ham are there?
dataset['label'].value_counts()

ham     4822
spam     746
Name: label, dtype: int64

In [22]:
# Are they any missing values?
dataset.isnull().sum()
print("Number of null in label: {}".format(dataset['label'].isnull().sum()))
print("Number of null in text: {}".format(dataset['body_text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0
