# Data Pre-Processing

In [35]:
import pandas as pd
import numpy as np
import csv
import string
import re

## Constants

In [77]:
readFilename = "../data/SMSSpamCollection.txt"
writeFilename = "../data/transormed.csv"
processedFilename = "../data/processedFile.csv"
uniformProcessedFilename = "../data/uniformDataProcessedFile.csv"
seed = 1
np.random.seed(seed)

## Transforming file into CSV while removing punctuation

In [3]:
def readFile(filename):
    """
    Function to read file with a given filename
    :param filename : String. e.g. "../data/SMSSpamCollection.txt"
    :return String. Content of the file
    """
    with open(filename,'r', encoding='utf-8') as fileHandler:
        content = fileHandler.read()
        
    return content
    
content = readFile(readFilename)

In [10]:
def writeFile(filename, content):
    """
    Function to write the file with the given content 
    :param filename : String. e.g. "../data/preProcessed.csv"
    :param content : String
    :return : None
    """
    with open(filename, "w") as fileHandler:
        spamWriter = csv.writer(fileHandler)
        pattern = re.compile('[^A-Za-z0-9]+')
        for sentence in content.split("\n"):
            sentence = sentence.strip()
            line = sentence.split("\t")
            try:
                text = str("".join(line[1:])).strip().lower()
                text = re.sub(pattern, " ", text)
                if line[0]=='ham':
                    spamWriter.writerow([text.strip(), 0])
                if line[0]=='spam':
                    spamWriter.writerow([text, 1])
            except Exception as e:
                print ("Exception {0} occured".format(e))
            
writeFile(writeFilename, content)                               
        

In [11]:
df = pd.read_csv(writeFilename, header = None)
print (len(df))
df.head()

5574


Unnamed: 0,0,1
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor u c already then say,0
4,nah i don t think he goes to usf he lives arou...,0


In [12]:
df[1].describe()

count    5574.000000
mean        0.134015
std         0.340699
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: 1, dtype: float64

## Pre-processing

In [27]:
df = df.drop_duplicates(subset=0, keep='first')
df = df.dropna()
len(df)

5139

In [28]:
df.head()

Unnamed: 0,0,1
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor u c already then say,0
4,nah i don t think he goes to usf he lives arou...,0


In [29]:
df.to_csv(processedFilename,header=None,index=False)

In [30]:
df[0].values

array([ 'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
       'ok lar joking wif u oni',
       'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s',
       ..., 'pity was in mood for that so any other suggestions',
       'the guy did some bitching but i acted like i d be interested in buying something else next week and he gave it to us for free',
       'rofl its true to its name'], dtype=object)

## Uniformily distributing the data

In [107]:
spamDf = df[df[1]==1]
print ("Length of spam Df : {0}".format(len(spamDf)))
nonSpamDf = df[df[1]==0]
print ("Length of non spam Df : {0}".format(len(nonSpamDf)))

Length of spam Df : 639
Length of non spam Df : 4500


Extracting random 639 values from nonSpamDf

In [108]:
msk = np.random.rand(len(nonSpamDf)) < 0.140
print ("Length of nonSpamDf : {0}".format(len(nonSpamDf[msk])))
nonSpamDf = nonSpamDf[msk]

Length of nonSpamDf : 654


In [111]:
uniformDf = pd.concat([spamDf, nonSpamDf])
print ("No of records in new DataFrame : {0}".format(len(uniformDf)))
uniformDf.head()

No of records in new DataFrame : 1293


Unnamed: 0,0,1
2,free entry in 2 a wkly comp to win fa cup fina...,1
5,freemsg hey there darling it s been 3 week s n...,1
8,winner as a valued network customer you have b...,1
9,had your mobile 11 months or more u r entitled...,1
11,six chances to win cash from 100 to 20 000 pou...,1


In [112]:
uniformDf = uniformDf.sample(frac=1).reset_index(drop=True)
uniformDf.head()

Unnamed: 0,0,1
0,hi its lucy hubby at meetins all day fri i wil...,1
1,think ur smart win 200 this week in our weekly...,1
2,guess what somebody you know secretly fancies ...,1
3,sorry for the delay yes masters,0
4,you ve won tkts to the euro2004 cup final or 8...,1


In [115]:
uniformDf.to_csv(uniformProcessedFilename,header=None,index=False)

In [113]:
uniformDf.describe()

Unnamed: 0,1
count,1293.0
mean,0.4942
std,0.50016
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [114]:
from collections import Counter
Counter(uniformDf[1].values)

Counter({0: 654, 1: 639})

We have a uniform distribution of data