# NLP Basics: Reading in text data & why do we need to clean the text?

In [14]:
# Import necessary modules
import pandas as pd

### Read in semi-structured text data

In [2]:
# Read in the raw text in an unstructured format.
# The context manager closes the file handle as soon as the read is done,
# and the explicit encoding avoids depending on the platform's locale
# default when decoding the file.
with open("data/SMSSpamCollection.tsv", encoding="utf-8") as rawFile:
    rawData = rawFile.read()

# Show the first 500 characters
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [7]:
# Treat tabs and newlines alike as separators: break the text into lines,
# break every line on tabs, and flatten the pieces into a single list
parsedData = [
    field
    for line in rawData.split('\n')
    for field in line.split('\t')
]

# Report the type of the parsed result
print("parsedData is of the type: ", type(parsedData))
print()

# Show the first 5 entries
print(parsedData[:5])

parsedData is of the type:  <class 'list'>

['ham', "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", 'spam', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'ham']


In [13]:
# Labels sit at the even positions of the parsed list and the message
# text at the odd positions, so slice each out with a step of two
labelList, textList = parsedData[::2], parsedData[1::2]

# Show the first 5 labels, followed by the first 5 texts
print(labelList[:5])
print(textList[:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [20]:
# Create a pandas dataframe with one column of labels and one of message text.
# labelList may carry one extra trailing empty element (left over from
# splitting text that ends with a newline). Trimming it to the length of
# textList drops that element when it is present and keeps every label when
# it is not, so both columns always have the same length.
fullCorpus = pd.DataFrame({
    'label': labelList[:len(textList)],
    'body_list': textList})

# Check the dataframe
fullCorpus.head()

Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Read in semi-structured text data using Pandas

In [25]:
# Let pandas parse the tab-separated file directly; the file has no header
# row, so the columns get the default integer names
dataset = pd.read_csv(
    'data/SMSSpamCollection.tsv',
    sep='\t',
    header=None,
)

# Preview the first five rows
dataset.head(n=5)

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
