In [126]:
import pandas as pd
import numpy as np
import os
import sys
import re
import string
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [135]:
# 1.1 Cleanup
def cleaned_1(text):
    #text = re.sub("s+"," ", text) #Removing extra spaces
    #text = re.sub("[^-9A-Za-z ]", "" , text) #Punctuations can be removed by using regular expressions.
    text = "".join([i.lower() for i in text if i not in string.punctuation])
    text = re.sub('[-+]?[0-9]+', '', text) #Remove numbers
    text = re.sub('\[[^]]*\]', '', text) #Remove between square brackets
    return text

def remove_stopwords(text): # 1.2
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    # Stem here only # 1.3
    stemmed = [porter.stem(word) for word in words]

    # Lemmatize # 1.4
    lemmatized = []
    for word in stemmed:
        lemmatized.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmatized)

In [136]:
file_text_dict = {}
def create_df(root):
    file_names = os.listdir(root)
    # Create Dictionary for File Name and Text
    file_name_and_text = {}
    for file in file_names:
        with open(root + file, "r") as target_file:
            file_name_and_text[file] = remove_stopwords(cleaned_1(target_file.read()))
    file_data = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
                .reset_index().rename(index = str, columns = {'index': 'file_name', 0: 'text'}))
    return file_data           
                

In [137]:
root = "./op_spam_training_data/"
outer_tags = ['negative_polarity','positive_polarity']
inner_tags= ['deceptive_from_MTurk', 'truthful_from_TripAdvisor']

df_res_list = []
tag = 0 
for outer in outer_tags:
    for inner in inner_tags:
        if not os.path.exists(root + outer + "/" + inner):
            inner = 'truthful_from_Web'

        for i in range(1,5):
            loc = root + outer + "/" + inner + "/" + "fold" + str(i) + "/"
            df_res = create_df(loc)
            df_res["category"] = tag
            df_res_list.append(df_res)
        tag = tag + 1     

df = pd.concat(df_res_list)            

In [139]:
df.head(600)

Unnamed: 0,file_name,text,category
0,d_sofitel_12.txt,husband stay sofitel chicago water tower three...,0
1,d_sofitel_13.txt,stay sofitel one le pleasur experi chicago upo...,0
2,d_sofitel_11.txt,stay sofitel husband weekend never stay staff ...,0
3,d_sofitel_10.txt,stay sofitel chicago water tower hotel coupl w...,0
4,d_sofitel_14.txt,arriv sofitel chicago water tower hotel greet ...,0
...,...,...,...
35,t_sheraton_18.txt,gener speak noth bad place would clean issu ch...,1
36,t_homewood_6.txt,plan stay night famili trip book hotel expect ...,1
37,t_swissotel_18.txt,stay valentin weekend got th floor direct view...,1
38,t_homewood_7.txt,read good review book night stay check late bi...,1


In [4]:
from nltk.corpus import stopwords
stopword = stopwords.words('english')
for word in stopword:
    print(word)

i
me
my
myself
we
our
ours
ourselves
you
you're
you've
you'll
you'd
your
yours
yourself
yourselves
he
him
his
himself
she
she's
her
hers
herself
it
it's
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
that'll
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
don't
should
should've
now
d
ll
m
o
re
ve
y
ain
aren
aren't
couldn
couldn't
didn
didn't
doesn
doesn't
hadn
hadn't
hasn
hasn't
haven
haven't
isn
isn't
ma
mightn
mightn't
mustn
mustn't
needn
needn't
shan
shan't
shouldn
shouldn't
wasn
wasn't
weren
weren't
won
won't
wouldn
wouldn't
