In this notebook we will follow other strategies for feature engineering, trying to overcome or circumvent the computational cost of the strategies in the notebooks FeatureEngineering01, 02, 03.
First, we will try to generate a list of tokens with sentiments (filtered as in FeatureEngineering01 up to the step with the VADER sentiment filter, that is, excluding the calculation of our own sentiment score).

In [1]:
#load initial data
import pandas as pd
import numpy as np
# NOTE(review): read_pickle unpickles the file; only point it at a trusted 'full_data.db'.
df=pd.read_pickle('full_data.db')

#getting the object with the trained words 'NWORDS' that we saved in the previous step with 'dill'
import dill
# NOTE(review): unpickling (dill.load) can execute code stored in the file; only load trusted files.
with open('TrainedWordsForSpellCheck.pkl', 'rb') as f:
    NWORDS=dill.load(f)
    
import re, collections
# letters tried by edits1() when generating replacement and insertion candidates
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def words(text):
    """Return, in order, every maximal run of the letters a-z found in the lowercased *text*."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]

def train(features):
    """Count the items of *features* in a defaultdict whose missing keys start at 1.

    An item seen k times therefore ends up with the value k + 1, and looking up
    an unseen item yields 1.
    """
    counts = collections.defaultdict(lambda: 1)
    for feature in features:
        counts[feature] = counts[feature] + 1
    return counts
def edits1(word):
    """Return the set of strings obtained from *word* by a single edit.

    For every split position of *word* the following candidates are produced:
    deleting the character after the split, swapping the two characters after
    the split, replacing the character after the split with each letter of
    `alphabet`, and inserting each letter of `alphabet` at the split.
    """
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            candidates.add(head + tail[1:])                       # deletion
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])   # transposition
        for letter in alphabet:
            if tail:
                candidates.add(head + letter + tail[1:])          # replacement
            candidates.add(head + letter + tail)                  # insertion
    return candidates
def known_edits2(word):
    """Return the strings two edits1() steps away from *word* that are keys of NWORDS."""
    found = set()
    for oneAway in edits1(word):
        for twoAway in edits1(oneAway):
            if twoAway in NWORDS:
                found.add(twoAway)
    return found

def known(words):
    """Return the set of items of *words* that are keys of NWORDS."""
    hits = set()
    for candidate in words:
        if candidate in NWORDS:
            hits.add(candidate)
    return hits

def correct(word):
    """Return the candidate spelling of *word* with the largest NWORDS value.

    Candidates are taken from the first non-empty source, in this order: the
    word itself if it is in NWORDS, known words one edit away, known words two
    edits away, and finally the unchanged word.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    return max(candidates, key=NWORDS.get)

def correct_top(word, n):
    """Return (best, top) where *top* holds up to *n* candidates for *word*.

    Candidates come from the same cascade as correct() (the word itself, known
    words one edit away, known words two edits away, else the unchanged word)
    and are ordered by decreasing NWORDS value; *best* is the first of them.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    ranked = list(candidates)
    ranked.sort(key=NWORDS.get, reverse=True)
    best = ranked[0]
    return best, ranked[:n]

In [2]:
# building a tokenizer that picks out sequences of alphanumeric characters as tokens 
# and drops everything else 
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# test string for the steps below; it repeats some words several times (e.g. 'hard', 'time')
dd="The hard hard hard harder hardest badly worse worst oldest best better loudly time classical Greek and writing had little or no space between words, and could be written in boustrophedon (alternating directions). Over time, text direction (left to right) became standardized, and word dividers and terminal punctuation became common. The first way to divide sentences into groups was the original paragraphos, similar to an underscore at the beginning of the new group.[3] The Greek paragraphos evolved into the pilcrow (¶), which in English manuscripts in the Middle Ages can be seen inserted inline between sentences. "
wordList=tokenizer.tokenize(dd)
#spell check: build a new list with one correct() call per token instead of
#rewriting the list through list.index() while iterating over it
wordList=[correct(w) for w in wordList]
#print wordList

#removing stopwords
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
#load the stopword list once, as a set, instead of once per token
englishStopwords = set(stopwords.words('english'))
wordsNoStop = [w for w in wordList if w not in englishStopwords]

# lowercase + tokenize + spellcheck + stopwords
def tagList(textData):
    """Lowercase, tokenize, spell-correct, drop English stopwords and POS-tag *textData*.

    Parameters:
        textData -- the text to process (a string).

    Returns:
        Whatever pos_tag() returns for the remaining tokens (presumably a list
        of (word, tag) pairs -- confirm against the NLTK version in use).
    """
    tokens = tokenizer.tokenize(textData.lower())
    # one correct() call per token; no list.index() lookups while iterating
    corrected = [correct(w) for w in tokens]
    # load the stopword list once per call rather than once per token
    stopSet = set(stopwords.words('english'))
    wordsNoStop = [w for w in corrected if w not in stopSet]
    return pos_tag(wordsNoStop)
#tagged = tagList(dd)

def subjobj(taggedList):
    """Split a list of (word, tag) pairs into subjective and objective words.

    Parameters:
        taggedList -- iterable of sequences whose first item is the word and
                      whose second item is its POS tag.

    Returns:
        (subjContent, objContent): two lists of distinct words, each in order
        of first appearance.
        (a) subjContent: words tagged as verbs, adverbs or adjectives
            (content that expresses sentiment);
        (b) objContent: words tagged as nouns
            (content that does not express sentiment).
        A word that appears with tags from both groups is listed in both.
        Words with any other tag are dropped.
    """
    subjTags = frozenset(['VB','VBP','VBZ','VBD','VBG','VBN','RB','RBR','RBS','JJ','JJR','JJS'])
    objTags  = frozenset(['NN','NNS','NNP','NNPS'])
    subjContent, objContent = [], []
    # the sets give O(1) duplicate checks; the lists keep first-appearance order
    seenSubj, seenObj = set(), set()
    for item in taggedList:
        word, tag = item[0], item[1]
        if tag in subjTags and word not in seenSubj:
            seenSubj.add(word)
            subjContent.append(word)
        if tag in objTags and word not in seenObj:
            seenObj.add(word)
            objContent.append(word)
    return subjContent, objContent
#subjectiveList,objectiveList = subjobj(tagged)
#print subjectiveList[:10]

def lemmatizeDeduplicate(wordList):
    """Lemmatize the strings in *wordList* and return the distinct results.

    Each word is passed to WordNetLemmatizer.lemmatize() with its default
    arguments, the lemma is encoded with .encode('UTF8'), and duplicates that
    result from the lemmatization are removed by going through a set, so the
    order of the returned list is not defined.

    Parameters:
        wordList -- iterable of strings.

    Returns:
        A list of the distinct encoded lemmas.
    """
    from nltk.stem import WordNetLemmatizer
    lemmatize = WordNetLemmatizer().lemmatize
    return list(set(lemmatize(w).encode('UTF8') for w in wordList))
#print lemmatizeDeduplicate(subjectiveList)[:10]


def cleanList(textData):
    """Lowercase, tokenize and spell-correct *textData*, then drop English stopwords.

    Parameters:
        textData -- the text to process (a string).

    Returns:
        The list of corrected tokens that are not English stopwords, in their
        original order (duplicates are kept).
    """
    tokens = tokenizer.tokenize(textData.lower())
    # one correct() call per token; no list.index() lookups while iterating
    corrected = [correct(w) for w in tokens]
    # load the stopword list once per call rather than once per token
    stopSet = set(stopwords.words('english'))
    return [w for w in corrected if w not in stopSet]

from vader import *
def filterBySentiment(myList):
    """Keep only the words of *myList* that have a non-zero sentiment score.

    Every word is scored with sentiment() (star-imported from the `vader`
    module) and kept when the "compound" entry of the result is not 0.

    Parameters:
        myList -- iterable of words.

    Returns:
        A list of (word, compound score) tuples, in input order.
    """
    sentList = []
    for x in myList:
        # score each word once and reuse the value for the test and the result
        compound = sentiment(x)["compound"]
        if compound != 0:
            sentList.append( (x, compound) )
    return sentList


In [3]:
import numpy as np
def getReducedDataSet(df,colName,nRows):
    """Create a reduced, star-balanced data frame for training/validation/testing.

    Parameters:
        df      -- the full dataset. Columns are addressed by position:
                   positions 3, 4 and 5 must be the columns named
                   'delivery_time', 'quality' and 'service', and position 6 is
                   kept alongside the rating (presumably the 'commentary'
                   column -- confirm against the data).
        colName -- the name of the rating column
                   ('delivery_time', 'quality' or 'service').
        nRows   -- the number of rows for each star value (1..6) in the output.

    Returns:
        A data frame with the two selected columns and nRows rows per star,
        the rows for star 1 first and the rows for star 6 last.

    Raises:
        ValueError -- colName is not one of the three supported columns.
        IndexError -- some star value has fewer than nRows rows in df.

    The function does the following:
    (1) shuffles the rows of df;
    (2) keeps the relevant columns (colName + the column at position 6);
    (3) for each star 1..6 takes the first nRows shuffled rows with that rating;
    (4) concatenates the six pieces.
    Rows rated 0 are never selected because only the stars 1..6 are picked.
    No sort is needed: rows are selected per star with a mask, and their order
    inside each star is already random after the shuffle.

    Because the rows are shuffled on every call, the function can be called
    repeatedly to 'bootstrap' many training/validation/testing sets, and with
    different nRows to make training sets larger than validation/testing sets.
    """
    columnPositions = {"delivery_time": 3, "quality": 4, "service": 5}
    if colName not in columnPositions:
        raise ValueError("colName must be one of %s, got %r"
                         % (sorted(columnPositions), colName))
    colNumber = columnPositions[colName]

    shuffled = df.iloc[np.random.permutation(len(df))] #randomize rows in the dataframe
    TMP = shuffled.iloc[:,[colNumber,6]] #get relevant columns

    #getting 'nRows' for each star (1,2,3,4,5,6)
    frames = []
    for star in range(1, 7):
        starRows = TMP.loc[TMP[colName] == star]
        if len(starRows) < nRows:
            raise IndexError("only %d rows with %s == %d, %d requested"
                             % (len(starRows), colName, star, nRows))
        # max() keeps a negative nRows selecting nothing, as range(nRows) would
        frames.append(starRows.iloc[:max(nRows, 0),:])
    return pd.concat(frames)
#example of use:
nRows=5 # This parameter will affect the size of the Feature Vector (FV) or dictionary that will be used 
        # in the Naive Bayes classifier
# reduced data frame with nRows rows per star of the 'quality' rating
reducedDF = getReducedDataSet(df,'quality',nRows)



In [4]:
import time
# comments of the reduced data frame as a plain Python list
TMP  = reducedDF['commentary'].tolist()
start_time = time.time()
# encode every comment with UTF-8 (timed step)
TMP1 = [x.encode('UTF8') for x in TMP]
print("--- %s seconds ... encode to string...---" % (time.time() - start_time))

--- 0.0 seconds ... encode to string...---


In [5]:
start_time = time.time()
# apply cleanList() to every encoded comment (timed step)
# NOTE(review): the timing relies on map() being eager, as in Python 2
cleaned      = map(cleanList,TMP1)
print("--- %s seconds ... clean...---" % (time.time() - start_time))

--- 0.960999965668 seconds ... clean...---


In [6]:
start_time = time.time()
# apply lemmatizeDeduplicate() to the token list of every comment (timed step)
# NOTE(review): the timing relies on map() being eager, as in Python 2
lemmatized   = map(lemmatizeDeduplicate,cleaned) 
print("--- %s seconds ... lemmatize...---" % (time.time() - start_time))

--- 2.95800018311 seconds ... lemmatize...---


In [7]:
#OPTIONAL: saving/loading the lemmatized data before flattening the list of strings
#import dill
#with open('lemmatizedBeforeFlatten01.pkl', 'wb') as f1:
#    dill.dump(lemmatized, f1)
#import dill
#with open('lemmatizedBeforeFlatten01.pkl', 'rb') as f:
#    lemmatized=dill.load(f)    

In [8]:
import time
import itertools
start_time = time.time()
# flatten the per-comment lemma lists into one list; chain.from_iterable is
# linear in the total number of tokens and returns [] for an empty input,
# where reduce() without an initial value would raise TypeError
flatList     = list(itertools.chain.from_iterable(lemmatized))
print("--- %s seconds ... flat list...---" % (time.time() - start_time))

--- 0.0 seconds ... flat list...---


In [9]:
start_time = time.time()
# going through a set drops repeated tokens; the resulting order is not defined
deduplicated = list(set(flatList)) #eliminate duplicates
print("--- %s seconds ... deduplicate...---" % (time.time() - start_time))

--- 0.0 seconds ... deduplicate...---


In [10]:
start_time = time.time()
# POS-tag the distinct tokens (timed step)
tagged     = pos_tag(deduplicated)
print("--- %s seconds ... tag...---" % (time.time() - start_time))

--- 1.39999985695 seconds ... tag...---


In [11]:
start_time = time.time()
# split the tagged tokens into the subjective and objective word lists (timed step)
SubjectiveList, ObjectiveList = subjobj(tagged)
# the label names this step; it used to repeat the "tag" label of the tagging cell
print("--- %s seconds ... subjective/objective split...---" % (time.time() - start_time))

--- 0.0 seconds ... tag...---


In [12]:
start_time = time.time()
# keep only the subjective tokens with a non-zero sentiment score (timed step)
vaderFiltered = filterBySentiment(SubjectiveList)
print("--- %s seconds ... vader filter...---" % (time.time() - start_time))
# single-argument print(...) calls give the same output on Python 2 and 3
print("--------")
print(len(SubjectiveList))   # number of subjective tokens before filtering
print(len(vaderFiltered))    # number of tokens kept by the filter

--- 3.46700000763 seconds ... vader filter...---
--------
136
29


In [13]:
#saving the final list results (tagged words, subjective tokens, objective tokens, tokens with vader sentiment)
import dill
# (file name, object) pairs, written in the same order as before
outputs = [
    ('jeTokensWithVaderSentiments03.pkl', vaderFiltered),
    ('jeSubjectiveTokens03.pkl', SubjectiveList),
    ('jeObjectiveTokens03.pkl', ObjectiveList),
    ('jeTaggedTokens03.pkl', tagged),
]
for fileName, payload in outputs:
    with open(fileName, 'wb') as fh:
        dill.dump(payload, fh)
# single-argument print(...) gives the same output on Python 2 and 3
print("done!!")

done!!
