# Data Pre-Processing

In [16]:
import pandas as pd
import csv
import string
import re

## Constants

In [2]:
# Raw input file; writeFile parses its content as one tab-separated "label, message" record per line.
readFilename = "../data/SMSSpamCollection.txt"
# Intermediate CSV holding the (cleaned text, 0/1 label) rows produced by writeFile.
# NOTE(review): "transormed" looks like a typo for "transformed" -- confirm nothing
# else reads this path before renaming the file.
writeFilename = "../data/transormed.csv"
# Final CSV written once duplicate rows and rows with missing values have been dropped.
processedFilename = "../data/processedFile.csv"

## Transforming the raw file into CSV: lower-casing text and removing punctuation

In [3]:
def readFile(filename):
    """
    Load the complete text of a UTF-8 encoded file
    :param filename : String. Path of the file to read, e.g. "../data/SMSSpamCollection.txt"
    :return String. Everything the file contains
    """
    # The context manager closes the handle even though we return from inside it
    with open(filename, mode='r', encoding='utf-8') as source:
        return source.read()
    
# Load the whole raw dataset into memory as a single string.
content = readFile(readFilename)

In [10]:
def writeFile(filename, content):
    """
    Function to convert raw tab-separated "label, message" lines into a two-column CSV file

    Each line of ``content`` is expected to hold a label ('ham' or 'spam'), a tab
    and the message text. The message is lower-cased, every run of characters
    other than ASCII letters and digits is replaced by a single space, and
    leading/trailing spaces are removed. One row ``[text, label]`` is written per
    message, with 'ham' encoded as 0 and 'spam' as 1. Lines carrying any other
    label (including blank lines) are skipped.

    I/O errors raised while opening or writing the file propagate to the caller.

    :param filename : String. e.g. "../data/preProcessed.csv"
    :param content : String. Newline-separated records
    :return : None
    """
    labelCodes = {'ham': 0, 'spam': 1}
    pattern = re.compile('[^A-Za-z0-9]+')
    # newline='' is what the csv module requires of its file object; without it,
    # platforms that translate line endings get a blank row after every record.
    with open(filename, "w", newline='', encoding='utf-8') as fileHandler:
        spamWriter = csv.writer(fileHandler)
        for sentence in content.split("\n"):
            line = sentence.strip().split("\t")
            label = labelCodes.get(line[0])
            if label is None:
                # Blank line or unknown label: nothing to write
                continue
            # Join with a space so a tab inside the message still separates words
            text = " ".join(line[1:]).lower()
            # Strip for both classes so trailing punctuation does not leave a
            # trailing space that differs between ham and spam rows
            text = pattern.sub(" ", text).strip()
            spamWriter.writerow([text, label])
            
# Write the cleaned (text, label) rows for the loaded dataset to the intermediate CSV.
writeFile(writeFilename, content)                               
        

In [11]:
# Read the intermediate CSV back; header=None tells pandas the file has no header
# row, so the columns are labelled 0 (text) and 1 (label).
df = pd.read_csv(writeFilename, header=None)
print(df.shape[0])
df.head()

5574


Unnamed: 0,0,1
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor u c already then say,0
4,nah i don t think he goes to usf he lives arou...,0


In [12]:
# Summary statistics of the label column (column 1).
df[1].describe()

count    5574.000000
mean        0.134015
std         0.340699
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: 1, dtype: float64

## Pre-processing: removing duplicate messages and rows with missing values

In [27]:
# Keep the first occurrence of each distinct message text (column 0), then
# discard any row that still contains a missing value.
df = df.drop_duplicates(subset=0, keep='first').dropna()
len(df)

5139

In [28]:
# Preview the first rows after duplicates and missing values were removed.
df.head()

Unnamed: 0,0,1
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor u c already then say,0
4,nah i don t think he goes to usf he lives arou...,0


In [29]:
# Persist the cleaned frame without a header row and without the index column.
df.to_csv(processedFilename, header=False, index=False)

In [30]:
# Inspect the cleaned message texts (column 0) as an array.
df[0].values

array([ 'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
       'ok lar joking wif u oni',
       'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s',
       ..., 'pity was in mood for that so any other suggestions',
       'the guy did some bitching but i acted like i d be interested in buying something else next week and he gave it to us for free',
       'rofl its true to its name'], dtype=object)