# Reading in unstructured text data

## 1. READ IN TEXT DATA

### We use Python's built-in open() function to read in the text file; this manual approach stays flexible even for messier text files.

In [1]:
# Read the whole file as text. The context manager closes the file handle as soon as the
# read finishes instead of leaving it open for the lifetime of the kernel.
# UTF-8 is requested explicitly (the same default pandas' read_csv uses) so the decoded
# text does not depend on the platform's default encoding.
with open("SMSSpamCollection.tsv", encoding="utf-8") as raw_file:
    rawData = raw_file.read()
rawData[0:300]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receiv"

### We can see that the values are separated by a '\t', which means it is a tab-delimited text file. For tsv files, we can call the Pandas read_csv function and specify '\t' as the separator.

In [2]:
# Break every line into its tab-separated fields and flatten the result into one list,
# so labels and message texts alternate: label, text, label, text, ...
parsedData = [field for line in rawData.split('\n') for field in line.split('\t')]
parsedData[:4]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [3]:
# The flattened list alternates label/text, so the labels sit at the even
# positions and the message texts at the odd positions.
labels = parsedData[::2]
texts = parsedData[1::2]

# Preview the first four entries of each list
for preview in (labels[:4], texts[:4]):
    print(preview)

['ham', 'spam', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.']


In [4]:
import pandas as pd
# Show up to 100 characters per column when a DataFrame is displayed
pd.options.display.max_colwidth = 100

In [5]:
# Splitting on '\n' leaves one blank trailing element when the file ends with a newline,
# and that element ends up at the end of `labels`, making it one longer than `texts`.
# Trimming `labels` to the length of `texts` drops that blank entry when it is present,
# without discarding a real label when the file has no trailing newline.
dataFrame = pd.DataFrame({
    'label': labels[:len(texts)],
    'body': texts
})

dataFrame.head()

Unnamed: 0,label,body
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### To replicate the same process as above, we now use the Pandas approach. Note that the manual procedure used above is more flexible, even for complex text files.

In [6]:
# Import the tab-delimited file. It has no header row, so the column names are supplied here.
df = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep="\t",
    header=None,
    names=["label", "body"],
)
df.head()

Unnamed: 0,label,body
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


## 2. ANALYSING THE TEXT DATA

In [7]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
df.shape

(5568, 2)

In [8]:
# Count how many messages carry each label in a single pass over the column
label_counts = df['label'].value_counts()

# Number of ham entries (0 if the label never occurs)
print("Ham: {}".format(label_counts.get('ham', 0)))

# Number of spam entries (0 if the label never occurs)
print("Spam: {}".format(label_counts.get('spam', 0)))

Ham: 4822
Spam: 746


In [9]:
# Count the missing values in each column
df.isna().sum()

label    0
body     0
dtype: int64

### The data doesn't contain any null values, so no further cleaning in that respect is required.

## 3. TOKENIZATION AND REMOVING STOPWORDS

### Removing punctuation

In [10]:
import string
string.punctuation[:10]  # Preview the first 10 punctuation characters defined in the string module

'!"#$%&\'()*'

In [11]:
def remove_punct(text):
    """Strip punctuation from ``text``.

    Every character that appears in ``string.punctuation`` is dropped; all other
    characters are kept in their original order and joined back into one string.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

# Build the punctuation-free column from the raw message text
df['body_clean'] = df['body'].apply(remove_punct)
df.head()

Unnamed: 0,label,body,body_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


### Tokenization

In [12]:
import re

def tokenizer(string):
    """Split ``string`` into tokens on runs of non-word characters.

    Parameters
    ----------
    string : str
        The text to split. (The parameter name shadows the ``string`` module
        inside this function only; it is kept so existing callers are unaffected.)

    Returns
    -------
    list of str
        The pieces of ``string`` between the separators. An empty string appears
        at the start or end of the list when the text begins or ends with a
        non-word character.
    """
    # Raw string literal: in a normal literal "\W" is an invalid escape sequence,
    # which Python reports with a warning. The regex itself is unchanged.
    return re.split(r"\W+", string)

# Derive the tokens from the raw 'body' column instead of reading 'body_clean' back in.
# Reading and overwriting 'body_clean' made this cell fail when run a second time,
# because the column then holds lists and lists have no .lower().
# On a fresh run the result is the same: punctuation removed, lower-cased, then tokenized.
df['body_clean'] = df['body'].apply(lambda x: tokenizer(remove_punct(x).lower()))
df.head()

Unnamed: 0,label,body,body_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[i, have, a, date, on, sunday, with, will]"


### Removing Stopwords

In [13]:
import nltk

# English stopword list from NLTK. The 'stopwords' corpus must be available locally,
# e.g. after a one-time nltk.download('stopwords').
stopword_corpus = nltk.corpus.stopwords
stopword = stopword_corpus.words('english')
stopword[:5]

['i', 'me', 'my', 'myself', 'we']

In [14]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the ``stopword`` list.

    ``text`` is iterated token by token; tokens found in the module-level
    ``stopword`` list are skipped and the rest are returned, in order, as a list.
    """
    kept_tokens = []
    for word in text:
        if word in stopword:
            continue
        kept_tokens.append(word)
    return kept_tokens

# Filter the stopwords out of every token list in 'body_clean'
df['body_clean'] = df['body_clean'].apply(remove_stopwords)
df.head()

Unnamed: 0,label,body,body_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
