### CS102: Unstructured Data - Text

In [1]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('tagsets')
from collections import Counter

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chokyungjin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chokyungjin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/chokyungjin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/chokyungjin/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/chokyungjin/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
# For compatibility across multiple platforms
import os
IB = os.environ.get('INSTABASE_URI',None) is not None
open = ib.open if IB else open

### Dataset of wine descriptions

*Read wines into dataframe, show sample*

In [5]:
f = open('Wines200.csv','r')
wines = pd.read_csv(f)
print (len(wines), 'wines')
wines.head(5)

200 wines


Unnamed: 0,country,winery,variety,points,price,description
0,US,Heitz,Cabernet Sauvignon,96,235.0,This tremendous 100% varietal wine hails from ...
1,Spain,Bodega Carmen Rodr_guez,Tinta de Toro,96,110.0,"Ripe aromas of fig, blackberry and cassis are ..."
2,US,Macauley,Sauvignon Blanc,96,90.0,Mac Watson honors the memory of a wine once ma...
3,US,Ponzi,Pinot Noir,96,65.0,"This spent 20 months in 30% new French oak, an..."
4,France,Domaine de la Bgude,Provence red blend,95,66.0,"This is the top wine from La Bgude, named afte..."


*Show sample descriptions*

In [6]:
text = wines.loc[1].description
print (text, '\n')
text = wines.loc[3].description
print (text)

Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023. 

This spent 20 months in 30% new French oak, and incorporates fruit from Ponzi's Aurora, Abetina and Madrona vineyards, among others. Aromatic, dense and toasty, it deftly blends aromas and flavors of toast, cigar box, blackberry, black cherry, coffee and graphite. Tannins are polished to a fine sheen, and frame a finish loaded with dark chocolate and espresso. Drink now through 2032.


### Search: string-contains and regular expressions

*Find wines with description containing 'chocolate'*

In [7]:
wines[wines.description.str.contains('chocolate')][['country', 'variety', 'description']]

Unnamed: 0,country,variety,description
1,Spain,Tinta de Toro,"Ripe aromas of fig, blackberry and cassis are ..."
3,US,Pinot Noir,"This spent 20 months in 30% new French oak, an..."
6,Spain,Tinta de Toro,Slightly gritty black-fruit aromas include a s...
12,US,Pinot Noir,A standout even in this terrific lineup of 201...
14,US,Pinot Noir,"With its sophisticated mix of mineral, acid an..."
16,US,Cabernet Sauvignon,"This blockbuster, powerhouse of a wine suggest..."
22,Spain,Tinta de Toro,Tarry blackberry and cheesy oak aromas are app...
27,US,Pinot Noir,"Focused and dense, this intense wine captures ..."
44,France,Syrah,"Dark in color and in flavor profile, this medi..."
69,US,Cabernet Sauvignon,A juiciness of cherry and vanilla spark the op...


*Find wines with description containing 'chocolate' and 'fruit'*

In [None]:
wines[wines.description.str.contains('chocolate') & wines.description.str.contains('fruit')]\
[['country', 'variety','description']]

*Find wines with description where 'chocolate' precedes 'fruit', then reverse*

In [5]:
for i in range(len(wines)):
    text = wines.loc[i].description
    s = re.search('chocolate(.*)fruit', text)
    if s:
        print wines.loc[i].country, wines.loc[i].variety, '-', text, '\n'

Spain Tinta de Toro - Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023. 

US Pinot Noir - A standout even in this terrific lineup of 2015 releases from Patricia Green, the Weber opens with a burst of cola and tobacco scents and accents. It continues, subtle and detailed, with flavors of oranges, vanilla, tea and milk chocolate discreetly threaded through ripe blackberry fruit. 

France Syrah - Dark in color and in flavor profile, this medium-bodied Cornas boasts aromas and flavors reminiscent of chocolate fudge, espresso and plummy fruit, all framed by soft, dusty tannins. Drink now_2025. 



*Find wines to be drunk between now and 2020 or later*

In [6]:
# Find wines to be drunk through 2020 or later
for i in range(len(wines)):
    text = wines.loc[i].description
    s = re.search('Drink (.*) through 20(2|3).', text)
    if s:
        print wines.loc[i].variety, '-', text[s.start():s.end()]

Pinot Noir - Drink now through 2032
Tinta de Toro - Drink this exemplary Toro through 2023
Tinta de Toro - Drink now through 2024
Tinta de Toro - Drink this saturated black-colored Toro through 2023
Pinot Noir - Drink now through 2030
Pinot Noir - Drink now through 2028
Pinot Noir - Drink now through 2028
Chardonnay - Drink now through 2028
Tempranillo Blend - Drink this special Rioja from 2020 through 2035
Pinot Noir - Drink now through 2020
Cabernet Sauvignon - Drink now through 2022
Meritage - Drink now through 2020
Chardonnay - Drink now through 2021


### Language processing: tokenizing, removing punctuation, parts of speech

*Process one description; first separate into list of tokens*

In [7]:
text = wines.loc[1].description
tokens = nltk.wordpunct_tokenize(text)
print tokens

['Ripe', 'aromas', 'of', 'fig', ',', 'blackberry', 'and', 'cassis', 'are', 'softened', 'and', 'sweetened', 'by', 'a', 'slathering', 'of', 'oaky', 'chocolate', 'and', 'vanilla', '.', 'This', 'is', 'full', ',', 'layered', ',', 'intense', 'and', 'cushioned', 'on', 'the', 'palate', ',', 'with', 'rich', 'flavors', 'of', 'chocolaty', 'black', 'fruits', 'and', 'baking', 'spices', '.', 'A', 'toasty', ',', 'everlasting', 'finish', 'is', 'heady', 'but', 'ideally', 'balanced', '.', 'Drink', 'through', '2023', '.']


*Remove puncutation*

In [8]:
punct = list(string.punctuation)
print punct
tokens_nopunct = []
for word in tokens:
    if word not in punct:
        tokens_nopunct.append(word)
print tokens_nopunct

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
['Ripe', 'aromas', 'of', 'fig', 'blackberry', 'and', 'cassis', 'are', 'softened', 'and', 'sweetened', 'by', 'a', 'slathering', 'of', 'oaky', 'chocolate', 'and', 'vanilla', 'This', 'is', 'full', 'layered', 'intense', 'and', 'cushioned', 'on', 'the', 'palate', 'with', 'rich', 'flavors', 'of', 'chocolaty', 'black', 'fruits', 'and', 'baking', 'spices', 'A', 'toasty', 'everlasting', 'finish', 'is', 'heady', 'but', 'ideally', 'balanced', 'Drink', 'through', '2023']


In [9]:
# more compact code for same thing
punct = list(string.punctuation)
tokens_nopunct = [word for word in tokens if word not in punct]
print tokens_nopunct

['Ripe', 'aromas', 'of', 'fig', 'blackberry', 'and', 'cassis', 'are', 'softened', 'and', 'sweetened', 'by', 'a', 'slathering', 'of', 'oaky', 'chocolate', 'and', 'vanilla', 'This', 'is', 'full', 'layered', 'intense', 'and', 'cushioned', 'on', 'the', 'palate', 'with', 'rich', 'flavors', 'of', 'chocolaty', 'black', 'fruits', 'and', 'baking', 'spices', 'A', 'toasty', 'everlasting', 'finish', 'is', 'heady', 'but', 'ideally', 'balanced', 'Drink', 'through', '2023']


*Tag with parts of speech*

In [10]:
tagged = nltk.pos_tag(tokens_nopunct)
print tagged

[('Ripe', 'NNP'), ('aromas', 'NN'), ('of', 'IN'), ('fig', 'NN'), ('blackberry', 'NN'), ('and', 'CC'), ('cassis', 'NN'), ('are', 'VBP'), ('softened', 'VBN'), ('and', 'CC'), ('sweetened', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('slathering', 'NN'), ('of', 'IN'), ('oaky', 'JJ'), ('chocolate', 'NN'), ('and', 'CC'), ('vanilla', 'NN'), ('This', 'DT'), ('is', 'VBZ'), ('full', 'JJ'), ('layered', 'VBN'), ('intense', 'JJ'), ('and', 'CC'), ('cushioned', 'VBN'), ('on', 'IN'), ('the', 'DT'), ('palate', 'NN'), ('with', 'IN'), ('rich', 'JJ'), ('flavors', 'NNS'), ('of', 'IN'), ('chocolaty', 'NN'), ('black', 'JJ'), ('fruits', 'NNS'), ('and', 'CC'), ('baking', 'VBG'), ('spices', 'NNS'), ('A', 'NNP'), ('toasty', 'NN'), ('everlasting', 'VBG'), ('finish', 'NN'), ('is', 'VBZ'), ('heady', 'JJ'), ('but', 'CC'), ('ideally', 'RB'), ('balanced', 'VBD'), ('Drink', 'NNP'), ('through', 'IN'), ('2023', 'CD')]


*Demystify tags*

In [11]:
done = []
for word in tagged:
    if word[1] not in done:
        done.append(word[1])
        nltk.help.upenn_tagset(word[1])

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever

### Entire corpus as list of words

In [12]:
punct = list(string.punctuation)
allwords = []
for i in range(len(wines)):
    text = wines.loc[i].description
    tokens = nltk.wordpunct_tokenize(text)
    tokens = [word.lower() for word in tokens if word not in punct]
    allwords = allwords + tokens
# print allwords

*Most common words in corpus*

In [13]:
counts = Counter(allwords)
# print counts
counts.most_common(20)

[('and', 551),
 ('the', 336),
 ('of', 283),
 ('a', 270),
 ('this', 207),
 ('with', 160),
 ('is', 135),
 ('wine', 128),
 ('it', 122),
 ('in', 111),
 ('flavors', 104),
 ('on', 83),
 ('palate', 83),
 ('aromas', 79),
 ('to', 78),
 ('tannins', 71),
 ('fruit', 70),
 ('cherry', 66),
 ('from', 65),
 ('drink', 63)]

*Now without stopwords*

In [8]:
stop = stopwords.words('english')
# print stop
allwords_nostops = [word for word in allwords if word not in stop]
counts = Counter(allwords_nostops)
counts.most_common(20)

NameError: name 'allwords' is not defined

#### Bigrams and n-grams

*Bigrams and 4-grams on one description*

In [14]:
# Recreate list of tokens from one description
text = wines.loc[1].description
print text, '\n'
tokens = nltk.wordpunct_tokenize(text)
punct = list(string.punctuation)
tokens = [word for word in tokens if word not in punct]
# Find bigrams
bg = nltk.bigrams(tokens)
for b in bg: print b
# Change to ngrams(tokens, 4)

Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023. 

('Ripe', 'aromas')
('aromas', 'of')
('of', 'fig')
('fig', 'blackberry')
('blackberry', 'and')
('and', 'cassis')
('cassis', 'are')
('are', 'softened')
('softened', 'and')
('and', 'sweetened')
('sweetened', 'by')
('by', 'a')
('a', 'slathering')
('slathering', 'of')
('of', 'oaky')
('oaky', 'chocolate')
('chocolate', 'and')
('and', 'vanilla')
('vanilla', 'This')
('This', 'is')
('is', 'full')
('full', 'layered')
('layered', 'intense')
('intense', 'and')
('and', 'cushioned')
('cushioned', 'on')
('on', 'the')
('the', 'palate')
('palate', 'with')
('with', 'rich')
('rich', 'flavors')
('flavors', 'of')
('of', 'chocolaty')
('chocolaty', 'black')
('black', 'fruits')
('fruits', 'and'

*Most common word triples in corpus*

In [15]:
grams = nltk.ngrams(allwords, 3)
counts = Counter(grams)
counts.most_common(20)
# try longer n-grams
# change to allwords_nostops

[(('on', 'the', 'palate'), 23),
 (('this', 'is', 'a'), 17),
 (('on', 'the', 'nose'), 14),
 (('a', 'hint', 'of'), 12),
 (('drink', 'now', 'through'), 11),
 (('on', 'the', 'finish'), 10),
 (('a', 'whiff', 'of'), 8),
 (('and', 'a', 'hint'), 8),
 (('a', 'blend', 'of'), 7),
 (('this', 'is', 'an'), 7),
 (('this', 'medium', 'bodied'), 7),
 (('the', 'palate', 'with'), 7),
 (('and', 'a', 'whiff'), 7),
 (('fine', 'grained', 'tannins'), 7),
 (('this', 'wine', 'is'), 7),
 (('full', 'bodied', 'wine'), 6),
 (('medium', 'bodied', 'wine'), 6),
 (('lead', 'the', 'nose'), 6),
 (('drink', 'through', '2020'), 6),
 (('in', 'the', 'glass'), 6)]

*On entire corpus find all pairs of words that follow 'citrus'*

In [None]:
grams = nltk.ngrams(allwords, 3)
for g in grams:
    if g[0] == 'citrus': print g
# change to allwords_nostops
# change to pairs of words around citrus (g[1]), back to allwords

In [None]:
# same functionality without n-grams
for i in range(len(allwords)-2):
    if allwords[i] == 'citrus':
        print allwords[i], allwords[i+1], allwords[i+2]