In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sqlite3    ## SQL Interface
import pickle     ## Used to save your data - Converts objects to byte stream and vice versa

import re
import nltk
from nltk.corpus import stopwords

In [2]:
# Read the whole Emails table into a DataFrame and release the connection
# immediately; the write-back cell at the end of the notebook opens its own.
conn = sqlite3.connect('Emails.sqlite')
try:
    final = pd.read_sql_query("""SELECT * FROM Emails""", conn)
finally:
    # Close even if the query fails so the database file handle is not leaked
    conn.close()

final.head()

Unnamed: 0,Emails,lable
0,"slur . . . mean , sir : armey 's slip slip , s...",1
1,across mail 12 re : punctuation . email punctu...,1
2,subject : begin begin begin groundfloor someth...,0
3,nice true democracy try reach agreement phonet...,1
4,read research literature slip tongue scan both...,1


In [3]:
# (rows, columns) of the frame loaded from the Emails table
final.shape

(962, 2)

In [4]:
# Print at most five emails that contain something shaped like an HTML tag
# (a '<', any run of characters, then the nearest '>').
shown = 0
for mail in final['Emails'].values:
    has_tag = re.search('<.*?>', mail) is not None
    if has_tag:
        print(mail, "\n\n")
        shown += 1
    if shown == 5:
        break

indiana university linguistics club publications : two classics reissued phonology wilbur , ronnie . phonology reduplication . since appearance work 1973 , continual theoretical significance . wilbur document case under - over-application rule reduplicative form problem present rule order . foreshadow current work optimality theory reject rule order develop notion akin reduplicative base - reduplicant identity . work play important role rule order debate 1970 , development reduplication theory within prosodic morphology during 1980 , currently provide insight emerge correspondence theory . copy * limit * . special reissue price : $ 6 . 50 humor tiersma , peter m . language-based humor marx brothers films tiersma 's popular essay excellent introduction linguistic analysis humor . using lexical semantics pragmatic , sound , yet lively , analysis specific example . great resource introductory linguistics course , read . price : $ 4 . 0 iulc publication , 720 e . atwater ave . , bloomingto

In [5]:
# NLTK's English stopword list, held as a set so the membership test in
# preprocessing() is a hash lookup; the bare name below displays the set.
stop = set(stopwords.words('english'))
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [6]:
# English Snowball stemmer; preprocessing() applies it to each retained word
sno = nltk.stem.SnowballStemmer('english')

In [7]:
def cleanpunc(sentence):
    '''Strip or blank out a fixed set of punctuation characters in a sentence.

    The characters ? | @ ! ^ % ' " # are deleted outright; afterwards each of
    . , ) ( / is replaced by a single space. Every other character, the
    backslash included, is returned untouched.'''
    # Pass 1: symbols that vanish without leaving a gap
    without_symbols = re.sub(r'[?|@|!|^|%|\'|"|#]', r'', sentence)
    # Pass 2: separators are turned into a space so neighbouring words stay apart
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_symbols)

In [11]:
def preprocessing(series):
    '''Clean every text cell of a pandas Series and return it in two shapes.

       Steps applied to each cell:
       1. Remove/blank punctuation and special characters via cleanpunc()
       2. Split on whitespace and keep only purely alphabetic tokens
          (str.isalpha) with length > 2
       3. Lower-case the token and drop it if it is in the stopword set `stop`
       4. Stem the remaining token with the Snowball stemmer `sno`

       No explicit HTML-tag stripping is performed; a token that contains any
       non-letter character (angle brackets included) is simply discarded by
       step 2.

       Return values:
       1. final_string - list with one cleaned, space-joined string per cell
       2. list_of_sent - list with one list of stemmed tokens per cell, in the
          same order (suitable as input to a W2V model)

       Note: earlier revisions also accumulated per-label word lists from the
       global `final` frame, but those lists were never returned or used, so
       that bookkeeping has been removed.'''

    final_string = []    ## Cleaned sentences, one string per input cell
    list_of_sent = []    ## Token lists, one list per input cell

    for sent in series.values:
        sent = cleanpunc(sent)    ## Clean the punctuations and special characters
        filtered_sent = []
        for cleaned_word in sent.split():
            ## Only consider alphabetic words with length at least 3
            if cleaned_word.isalpha() and len(cleaned_word) > 2:
                lowered = cleaned_word.lower()
                ## Skip stopwords; stem everything else
                if lowered not in stop:
                    filtered_sent.append(sno.stem(lowered))
        ## Accumulate across all cells; this list must not be reset inside
        ## the loop, otherwise only the last cell's tokens are returned
        list_of_sent.append(filtered_sent)
        ## Join back all the words belonging to the same sentence
        final_string.append(" ".join(filtered_sent))
    return final_string, list_of_sent

In [9]:
# Show the first five raw emails for comparison with their cleaned versions
for raw_mail in final['Emails'].head(5):
    print(raw_mail, "\n\n")

slur . . . mean , sir : armey 's slip slip , slip sort represent compete plan , one ask , why compete plan ? phonological similarity ( " similarity " aristotean catch-all ) [ / fraenk / v . / faeg / ] something bernie baar 's " unintentional " pun , clearly indicate something bite raw phonological similarity . non-phonological , compete plan notion bring sort interpretive issue concern why armey something painfully abusive " mind " produce " fag . " " mind , " ( point during on-line speech ) anything typically volitional intentional . tricky issue .
 


across mail 12 re : punctuation . email punctuation discussion group . subscription address ( far remember ) : punct-l @ milwaukee . tec . wus . us hope . caroline ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ caroline ann leathem ~ ~ msc . speech language process ~ ~ edinburgh university ~ ~ ~ ~ email : cleathem @ ling . ed . ac . uk ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

In [12]:
# Dry run of the cleaning pipeline on the first five emails only
sample_mails = final['Emails'].iloc[:5]
final_string, list_of_sent = preprocessing(sample_mails)
for cleaned_mail in final_string:
    print(cleaned_mail, "\n\n")

slur mean sir armey slip slip slip sort repres compet plan one ask compet plan phonolog similar similar aristotean fraenk faeg someth berni baar unintent pun clear indic someth bite raw phonolog similar compet plan notion bring sort interpret issu concern armey someth pain abus mind produc fag mind point speech anyth typic volit intent tricki issu 


across mail punctuat email punctuat discuss group subscript address far rememb milwauke tec wus hope carolin carolin ann leathem msc speech languag process edinburgh univers email cleathem ling 


subject begin begin begin groundfloor someth unpreced miss profit microsoft stellar rise profit satellit explos profit internet phenomenon watch real opportun real opportun pass fortun hand anoth chanc one mother opportun invit listen learn product decad exclus right fortun toll free hour 


nice true democraci tri reach agreement phonet symbol stemburg suggest war import matter leav general choos standard set symbol leav phonetician intern phone

In [13]:
# Run the cleaning pipeline over every email, replacing the five-row sample results
final_string, list_of_sent = preprocessing(final['Emails'])

In [14]:
# Sanity check: number of cleaned strings produced (one per input email)
len(final_string)

962

In [15]:
# Attach the cleaned text as a new column; the list is assigned positionally,
# so its length must equal the number of rows in the frame.
final['CleanedMails']=final_string
final.head()

Unnamed: 0,Emails,lable,CleanedMails
0,"slur . . . mean , sir : armey 's slip slip , s...",1,slur mean sir armey slip slip slip sort repres...
1,across mail 12 re : punctuation . email punctu...,1,across mail punctuat email punctuat discuss gr...
2,subject : begin begin begin groundfloor someth...,0,subject begin begin begin groundfloor someth u...
3,nice true democracy try reach agreement phonet...,1,nice true democraci tri reach agreement phonet...
4,read research literature slip tongue scan both...,1,read research literatur slip tongu scan normal...


In [16]:
# Persist the frame (including the CleanedMails column) back into the Emails
# table, replacing its previous contents.
# NOTE(review): to_sql also writes the DataFrame index as an extra column by
# default; pass index=False if downstream readers do not need it - confirm.
conn = sqlite3.connect('Emails.sqlite')
try:
    final.to_sql('Emails', conn, if_exists='replace')
finally:
    # Close even if the write fails so the database file handle is not leaked
    conn.close()