# D7 Text Analysis: Step by Step

In [17]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [19]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt model used by word_tokenize from
# the 'punkt_tab' resource rather than the pickled 'punkt' package, so fetch
# it too; otherwise tokenization raises LookupError on a fresh environment.
nltk.download('punkt_tab')
# Kept as the last expression so the cell still displays the download status.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/wah016/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/wah016/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### 1. Grab a corpus of documents (recipes)

In [33]:
import json, urllib.request

# Corpus source: the openrecipes dump, parsed below as one JSON object per line.
RECIPES_URL = "https://openrecipes.s3.amazonaws.com/openrecipes.txt"

# Use a context manager so the HTTP response is closed even if read/decode fails.
with urllib.request.urlopen(RECIPES_URL) as response:
    data = response.read().decode("utf-8")

# Parse every non-blank line. Filtering blanks (instead of unconditionally
# dropping the last element) keeps the final recipe when the file has no
# trailing newline and avoids json.loads('') errors on stray empty lines.
data_json = [json.loads(raw_text) for raw_text in data.split('\n') if raw_text.strip()]

# One text document per recipe: name, description and ingredients joined by newlines.
documents = [f"{recipe['name']}\n{recipe['description']}\n{recipe['ingredients']}" for recipe in data_json]
# Parallel list of recipe names, in the same order as `documents`.
recipe_names = [recipe['name'] for recipe in data_json]

In [35]:
# Sanity check: confirm what kind of container holds the corpus.
type(documents)

list

In [21]:
# Peek at the first document: name, description and ingredients joined by newlines.
documents[0]

"Easter Leftover Sandwich\nGot leftover Easter eggs?    Got leftover Easter ham?    Got a hearty appetite?    Good! You've come to the right place!    I...\n12 whole Hard Boiled Eggs\n1/2 cup Mayonnaise\n3 Tablespoons Grainy Dijon Mustard\n Salt And Pepper, to taste\n Several Dashes Worcestershire Sauce\n Leftover Baked Ham, Sliced\n Kaiser Rolls Or Other Bread\n Extra Mayonnaise And Dijon, For Spreading\n Swiss Cheese Or Other Cheese Slices\n Thinly Sliced Red Onion\n Avocado Slices\n Sliced Tomatoes\n Lettuce, Spinach, Or Arugula"

In [22]:
# The recipe name at the same position as documents[0].
recipe_names[0]

'Easter Leftover Sandwich'

In [23]:
# Corpus size: one document per recipe.
print(len(documents))

1042


### 2. Clean up the documents to just be the words (remove punctuation, etc.)

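You don't really need to know regex, but the pattern below matches any run of characters that are *not* (`^`) letters (`A-Za-z`) or digits (`0-9`), and `re.sub` substitutes each run with a single space (`' '`). Because every document is lowercased first (`doc.lower()`), only lowercase letters and digits remain in the result.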

In [24]:
# Anything other than an ASCII letter or digit counts as a separator.
# NOTE: this also strips accented letters (e.g. the "è" in "Gougères").
non_alphanumeric = re.compile(r'[^A-Za-z0-9]+')

# Lowercase each document, then collapse every separator run into one space.
clean = [non_alphanumeric.sub(' ', doc.lower()) for doc in documents]

# Spot-check one cleaned document.
clean[2]

'herb roasted pork tenderloin with preserves this was yummy and easy and pretty and it took basically no time to make before i share the recipe i ll just say it 2 whole pork tenderloins salt and pepper to taste 8 tablespoons herbs de provence more if needed 1 cup preserves fig peach plum 1 cup water 1 tablespoon vinegar'

### 3. Make a `TfidfVectorizer`

In [25]:
# Configure the TF-IDF vectorizer:
#   - word-level features, tokenized with NLTK's word_tokenize
#   - sublinear_tf=True scales term frequency as 1 + log(tf)
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    sublinear_tf=True,
)

### 4. Fit the `TfidfVectorizer` to your corpus!

In [26]:
# Learn the vocabulary and IDF weights from the cleaned corpus and, in the same
# call, transform it into a document-term TF-IDF matrix.
fitted = tfidf.fit_transform(clean)

In [27]:
# Shape is (number of documents, number of vocabulary terms).
print(fitted.shape)
# Dense view of the matrix, for inspection only.
fitted.toarray()

(1042, 5392)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Looks like we have a big two-dimensional array, with one row for each document and one column for each word in the vocabulary. Raw arrays are hard to work with, so let's turn it into a DataFrame...

In [38]:
# Wrap the dense TF-IDF matrix in a DataFrame; rows and columns get default
# integer labels at this point.
df = pd.DataFrame(fitted.toarray())

In [39]:
# Display the frame (still labelled with integer positions).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5382,5383,5384,5385,5386,5387,5388,5389,5390,5391
0,0.0,0.0,0.0,0.0,0.026766,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.078998,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.070055,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.050195,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.025719,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,0.0,0.0,0.0,0.0,0.069045,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1038,0.0,0.0,0.0,0.0,0.069190,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1039,0.0,0.0,0.0,0.0,0.081907,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.110562,0.0,0.0,0.0,0.0,0.0,0.0
1040,0.0,0.0,0.0,0.0,0.048618,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


This is still hard to make sense of. Let's make it more interpretable by labelling each column with the word it corresponds to, and each row with the name of its recipe...

In [40]:
# Label each column with its vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()
df.columns = vocabulary_terms

# Label each row with its recipe name (same order as the fitted documents).
df.index = recipe_names

In [41]:
# Display the frame again, now labelled by recipe name (rows) and word (columns).
df

Unnamed: 0,0,000,03,0z,1,10,100,100g,101,105,...,zannie,zarubin,zero,zest,zested,zesty,zings,ziti,zucchini,zuppa
Easter Leftover Sandwich,0.0,0.0,0.0,0.0,0.026766,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Pasta with Pesto Cream Sauce,0.0,0.0,0.0,0.0,0.078998,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Herb Roasted Pork Tenderloin with Preserves,0.0,0.0,0.0,0.0,0.070055,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Chicken Florentine Pasta,0.0,0.0,0.0,0.0,0.050195,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Perfect Iced Coffee,0.0,0.0,0.0,0.0,0.025719,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Golden Potstickers,0.0,0.0,0.0,0.0,0.069045,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Gougères,0.0,0.0,0.0,0.0,0.069190,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Parmesan Cheese Spread,0.0,0.0,0.0,0.0,0.081907,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.110562,0.0,0.0,0.0,0.0,0.0,0.0
Mast-o-Khiar Yogurt Dip,0.0,0.0,0.0,0.0,0.048618,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


Now we can get the most relevant word for each recipe...

In [42]:
# axis=1 scans across the columns of each row: for every recipe, return the
# column label (word) holding its largest TF-IDF value.
df.idxmax(axis=1)

Easter Leftover Sandwich                            easter
Pasta with Pesto Cream Sauce                         basil
Herb Roasted Pork Tenderloin with Preserves      preserves
Chicken Florentine Pasta                         completed
Perfect Iced Coffee                                   iced
                                                  ...     
Golden Potstickers                             potstickers
Gougères                                              goug
Parmesan Cheese Spread                              spread
Mast-o-Khiar Yogurt Dip                              khiar
Fennel Mushrooms                                    fennel
Length: 1042, dtype: object

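...and that mostly feels right! :) (The odd-looking `goug` for Gougères is an artifact of the cleaning step: the regex treats the non-ASCII `è` as a separator, splitting the word.)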