In [34]:
#Author: Caleb Woy
!python --version
import sklearn as sk
import nltk
import pandas as pd
import math

Python 3.8.2


# Preprocessing
### Loading data into pandas

In [35]:
# Let long frames print in full (display limit raised to 2000 rows).
pd.set_option('display.max_rows', 2000)

# Read the word list, discard the spreadsheet's own 'Index' column, and cast
# every token in 'Word' to a string.
word_data = (
    pd.read_excel('TextMiningWords1.xlsx')
    .drop(columns=['Index'])
    .astype({'Word': str})
)
word_data.head()

Unnamed: 0,Document,Word
0,Potato,the
1,Potato,potato
2,Potato,is
3,Potato,a
4,Potato,root


### Cleaning

In [36]:
# Leave the SweetPotato document out of the analysis and report the remaining row count.
word_data_no_sp = word_data.loc[word_data['Document'].ne('SweetPotato')]
print(word_data_no_sp.shape[0])

def Clean(data):
    """Return a cleaned version of a token table.

    The filters are applied to the 'Word' column in this order:
      1. drop NLTK English stopwords;
      2. drop tokens that contain a digit;
      3. drop possessive tokens (any character followed by a trailing 's);
      4. drop a fixed list of country / place tokens;
      5. replace the value 'maize' with 'corn';
      6. lemmatize every remaining token with WordNetLemmatizer.

    Args:
        data: pandas DataFrame with a 'Word' column of strings.

    Returns:
        A DataFrame holding the surviving rows with lemmatized words.

    NOTE(review): lemmatize() is called without a part-of-speech argument;
    NLTK documents the default as noun, so this step mostly folds plurals
    and is unlikely to normalize verb tense - confirm if tense matters.
    """
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer

    # Stopwords
    english_stopwords = set(stopwords.words('english'))
    data = data[~data['Word'].isin(english_stopwords)]

    # Tokens containing a digit
    has_digit = data['Word'].str.contains(r'\d')
    data = data[~has_digit]

    # Possessive nouns
    is_possessive = data['Word'].str.contains(r".'s$")
    data = data[~is_possessive]

    # Countries and other place tokens
    place_tokens = {
        'united', 'states', 'mexico', 'us', 'americas', 'peru',
        'chile', 'bolivia', 'andes', 'europe', 'asia',
        'china', 'india', 'kingdom', 'world', "world's",
    }
    data = data[~data['Word'].isin(place_tokens)]

    # Only one document says 'maize'; fold it into 'corn'
    data = data.replace({'maize': 'corn'})

    # Plural -> singular (one lemmatizer instance shared by every row)
    lemmatizer = WordNetLemmatizer()
    return data.assign(Word=data['Word'].apply(lambda token: lemmatizer.lemmatize(token)))

1466


In [37]:
# Run the cleaning pipeline, then report how many rows survive.
word_data_no_sp = Clean(data=word_data_no_sp)
print(word_data_no_sp.shape[0])

804


### Explanation and observations

I removed the default list of stopwords from nltk so that we can get more accurate similarity measures.
I removed numbers because I don't think they're important given the context of our text.
I removed possessive nouns because they're usually countries or people. Not super important for our context.
I removed countries because they're not important for our context. I left in tokens like 'Spanish' or 'American' because they're specifically descriptive.
I replaced maize with corn because only corn.txt called it maize.
I used the wordnet lemmatizer to standardize tense and plurality. It replaces words with their root meaning.

In total this cleaning process reduced our data by 662 words. Preprocessing is important for emphasizing a context of focus. It'll make it easier to compare our documents in the way that we really care about.

# Calculate Term Frequency Weights

In [38]:
# Calculating weights with double normalization
counts = word_data_no_sp.groupby(['Document', 'Word']).size()
docs = word_data_no_sp['Document'].unique().tolist()
tfweights = {x:{} for x in docs}
for doc in docs:
    max_in_doc = max(counts[doc])
    for word, count in counts[doc].iteritems():
        tfweights[doc][word] = 0.5 + (0.5 * count / max_in_doc)

In [39]:
import collections

# Ten highest term-frequency weights for each document.
for doc in docs:
    print(f'Document: {doc}')
    top_words = collections.Counter(tfweights[doc])
    ranked = top_words.most_common(10)
    for term, weight in ranked:
        print(f'     Word: {term}, \t\tWeight: {weight}')

Document: Potato
     Word: potato, 		Weight: 1.0
     Word: food, 		Weight: 0.6428571428571428
     Word: specie, 		Weight: 0.6428571428571428
     Word: human, 		Weight: 0.6071428571428571
     Word: part, 		Weight: 0.6071428571428571
     Word: plant, 		Weight: 0.6071428571428571
     Word: production, 		Weight: 0.6071428571428571
     Word: solanum, 		Weight: 0.6071428571428571
     Word: southern, 		Weight: 0.6071428571428571
     Word: tuber, 		Weight: 0.6071428571428571
Document: PotatoChip
     Word: potato, 		Weight: 1.0
     Word: chip, 		Weight: 0.8888888888888888
     Word: recipe, 		Weight: 0.7222222222222222
     Word: book, 		Weight: 0.6666666666666666
     Word: fried, 		Weight: 0.6666666666666666
     Word: market, 		Weight: 0.6666666666666666
     Word: slice, 		Weight: 0.6666666666666666
     Word: snack, 		Weight: 0.6666666666666666
     Word: billion, 		Weight: 0.6111111111111112
     Word: british, 		Weight: 0.6111111111111112
Document: Tomato
     Word: tomato, 	

### Explanation and Observations

I calculated the raw term counts with pandas built in groupby function and size function.
I calculated the weights with double normalization because I noticed the documents vary in length.

A lot of the weights are the same after the top 4-5. This is due to a lot of the less frequent words only being mentioned once.

The tfweights will be important for calculating similarity later.

# Calculate Inverse Document Frequency Weights

In [40]:
# Inverse document frequency: idf(word) = log base 10 of (n / k), where n is the
# number of documents and k is how many per-document term tables contain the word.
n = len(docs)
wrds = word_data_no_sp['Word'].unique().tolist()
idfweights = {}
for wrd in wrds:
    k = sum(1 for doc in docs if wrd in tfweights[doc])
    idfweights[wrd] = math.log(n / k, 10)

In [41]:
# Print the 30 smallest IDF weights, smallest first.
top_words = collections.Counter(idfweights)
lowest_thirty = top_words.most_common()[-30:]
for key, value in lowest_thirty[::-1]:
    print(f'     Word: {key}, \t\tWeight: {value}')

     Word: food, 		Weight: 0.0
     Word: ingredient, 		Weight: 0.0791812460476248
     Word: known, 		Weight: 0.17609125905568124
     Word: may, 		Weight: 0.17609125905568124
     Word: corn, 		Weight: 0.17609125905568124
     Word: part, 		Weight: 0.17609125905568124
     Word: many, 		Weight: 0.17609125905568124
     Word: year, 		Weight: 0.17609125905568124
     Word: variety, 		Weight: 0.17609125905568124
     Word: also, 		Weight: 0.30102999566398114
     Word: usually, 		Weight: 0.30102999566398114
     Word: oil, 		Weight: 0.30102999566398114
     Word: made, 		Weight: 0.30102999566398114
     Word: widely, 		Weight: 0.30102999566398114
     Word: first, 		Weight: 0.30102999566398114
     Word: salt, 		Weight: 0.30102999566398114
     Word: flavor, 		Weight: 0.30102999566398114
     Word: including, 		Weight: 0.30102999566398114
     Word: dish, 		Weight: 0.30102999566398114
     Word: snack, 		Weight: 0.30102999566398114
     Word: baked, 		Weight: 0.30102999566398114
     Wo

### Explanation and Observations


I calculated the IDF weights utilizing the speed of dictionary indexing to check if the tfweight dict contained a specific word. If the dict contained a word then a counter was increased. I passed the result of that counter to the IDF function as k. 

A lot of the words listed above (the ones with the lowest IDF weights) don't have to do with food; however, most are still useful. The only ones which probably aren't are 'known', 'may', 'part', 'many', 'also', 'usually', 'including', and 'often'. These could be added to our collection of stopwords if necessary. 

Inverse document frequencies are important to compare with frequencies within documents. If a majority of occurrences across all documents are concentrated in one document, that gives us a clue about the significance of that word. 

# Calculate TF-IDF Weights

In [42]:
def tfxidf(tf, idf):
    """Combine term-frequency and inverse-document-frequency weights.

    Args:
        tf: dict mapping document -> {word: term-frequency weight}.
        idf: dict mapping word -> inverse-document-frequency weight.

    Returns:
        A new dict with the same document/word layout as ``tf`` where each
        weight is ``tf[doc][word] * idf[word]``. A word that has no entry in
        ``idf`` keeps its term-frequency weight unchanged.

    The result is built from scratch and ``tf`` is left untouched, so the
    term-frequency table stays valid after the call and calling this function
    again with the same arguments gives the same answer.
    """
    return {
        doc: {
            word: (weight * idf[word] if word in idf else weight)
            for word, weight in weights.items()
        }
        for doc, weights in tf.items()
    }

In [43]:
# Weight every per-document term frequency by the word's IDF.
tfidf = tfxidf(tf=tfweights, idf=idfweights)

In [44]:
# displaying top 10 words per document
for doc in tfidf:
    print(f'Document: {doc}')
    top_words = collections.Counter(tfidf[doc])
    for key, value in top_words.most_common(10):
        print(f'     Word: {key}, \t\tWeight: {value}')

Document: Potato
     Word: potato, 		Weight: 0.47712125471966244
     Word: tuber, 		Weight: 0.4724489734472121
     Word: eastern, 		Weight: 0.44465785736208197
     Word: enough, 		Weight: 0.44465785736208197
     Word: glycoalkaloids, 		Weight: 0.44465785736208197
     Word: health, 		Weight: 0.44465785736208197
     Word: region, 		Weight: 0.44465785736208197
     Word: still, 		Weight: 0.44465785736208197
     Word: wild, 		Weight: 0.44465785736208197
     Word: accumulate, 		Weight: 0.4168667412769519
Document: PotatoChip
     Word: recipe, 		Weight: 0.5619981252770758
     Word: book, 		Weight: 0.5187675002557623
     Word: market, 		Weight: 0.5187675002557623
     Word: slice, 		Weight: 0.5187675002557623
     Word: potato, 		Weight: 0.47712125471966244
     Word: british, 		Weight: 0.4755368752344489
     Word: call, 		Weight: 0.4755368752344489
     Word: dripping, 		Weight: 0.4755368752344489
     Word: large, 		Weight: 0.4755368752344489
     Word: peel, 		Weight: 0.475536

In [45]:
# displaying top 30 words overall by sum
word_sums = {word: 0 for word in idfweights}
for word in word_sums:
    for doc in tfidf:
        if word in tfidf[doc]:
            word_sums[word] += tfidf[doc][word]
top_words = collections.Counter(word_sums)
for key, value in top_words.most_common(30):
    print(f'     Word: {key}, \t\tWeight: {value}')

     Word: potato, 		Weight: 0.9542425094393249
     Word: chip, 		Weight: 0.7692988778079518
     Word: tomato, 		Weight: 0.7327219268909102
     Word: tortilla, 		Weight: 0.6912141254272033
     Word: corn, 		Weight: 0.6221246130373793
     Word: made, 		Weight: 0.592643155566166
     Word: used, 		Weight: 0.5664110895314849
     Word: plant, 		Weight: 0.5650763061463875
     Word: fried, 		Weight: 0.5634664021402724
     Word: specie, 		Weight: 0.562321478776745
     Word: recipe, 		Weight: 0.5619981252770758
     Word: human, 		Weight: 0.5568686644370917
     Word: total, 		Weight: 0.5492195776550781
     Word: solanum, 		Weight: 0.5452814339653285
     Word: similar, 		Weight: 0.5433880956529489
     Word: cut, 		Weight: 0.5433880956529489
     Word: well, 		Weight: 0.5433880956529489
     Word: billion, 		Weight: 0.5396771525606849
     Word: snack, 		Weight: 0.5383805691682739
     Word: southern, 		Weight: 0.5377838142483052
     Word: commonly, 		Weight: 0.5377080807158101
   

In [46]:
# displaying top 30 words overall by mult
word_maxs = {word: 0 for word in idfweights}
for word in word_maxs:
    for doc in tfidf:
        if word in tfidf[doc]:
            if tfidf[doc][word] > word_maxs[word]:
                word_maxs[word] = tfidf[doc][word]
top_words = collections.Counter(word_mults)
for key, value in top_words.most_common(30):
    print(f'     Word: {key}, \t\tWeight: {value}')

     Word: recipe, 		Weight: 0.5619981252770758
     Word: slice, 		Weight: 0.5187675002557623
     Word: market, 		Weight: 0.5187675002557623
     Word: book, 		Weight: 0.5187675002557623
     Word: aztec, 		Weight: 0.5002400895323422
     Word: british, 		Weight: 0.4755368752344489
     Word: thin, 		Weight: 0.4755368752344489
     Word: large, 		Weight: 0.4755368752344489
     Word: shaving, 		Weight: 0.4755368752344489
     Word: peel, 		Weight: 0.4755368752344489
     Word: round, 		Weight: 0.4755368752344489
     Word: dripping, 		Weight: 0.4755368752344489
     Word: call, 		Weight: 0.4755368752344489
     Word: tuber, 		Weight: 0.4724489734472121
     Word: word, 		Weight: 0.4724489734472121
     Word: triangle, 		Weight: 0.4539215627237921
     Word: alternatively, 		Weight: 0.4539215627237921
     Word: disc, 		Weight: 0.4539215627237921
     Word: pressed, 		Weight: 0.4539215627237921
     Word: water, 		Weight: 0.4539215627237921
     Word: although, 		Weight: 0.45392156272

In [47]:
# Hand-picked vocabulary used for the document similarity comparison.
vocab = {
    'corn', 'potato', 'chip', 'tomato', 'tortilla', 'made',
    'aztec', 'plant', 'fried', 'recipe', 'american', 'spanish',
    'glycoalkaloids', 'tuber', 'berry', 'grew', 'british', 'annual',
    'blue', 'angeles', 'cuisine', 'chili', 'popular', 'wild',
    'production', 'animal', 'peel', 'eastern', 'ethanol', 'masa',
}

### Explanation and observations

I calculated the tfidf weights by simulating matrix multiplication using my dictionaries. I listed the top 10 weights for each document using the Counter object. I displayed the top 30 overall terms by both the highest sum of tfidf weights and the highest max tfidf weight. I chose words for the vocabulary by picking the words that seemed the most relevant with regard to context from the top 10 terms per document. I then picked a couple more from the top overall words as ranked by sum.

The top overall terms by sum and by max tfidf are very different lists. The sum method likely better emphasizes top overall words spread across documents. The max method probably emphasizes the top overall terms that are unique to only a few documents. I chose to use some words from the sum method list because I thought it would add some bias to the vocabulary. 

Selected vocabularies are important for measuring similarity with a focus, as we'll see in the next stage.

# Create Document Similarity Model and Matrix

In [48]:
# Project each document's TF-IDF table onto the chosen vocabulary
# (a vocabulary word the document lacks gets weight 0).
vocab_weights = {
    doc: {word: tfidf[doc].get(word, 0) for word in vocab}
    for doc in tfidf
}

# Pairwise similarity = dot product of the two documents' vocabulary vectors.
# Self-pairs are skipped.
doc_sims = {}
for doc1 in tfidf:
    doc_sims[doc1] = {}
    for doc2 in tfidf:
        if doc2 == doc1:
            continue
        sim = 0
        for word in vocab:
            sim += vocab_weights[doc1][word] * vocab_weights[doc2][word]
        doc_sims[doc1][doc2] = sim
        print(f'{doc1}x{doc2} similarity: {sim}')
    print()

PotatoxPotatoChip similarity: 0.227644691705265
PotatoxTomato similarity: 0.22216517098618632
PotatoxCornChip similarity: 0.017037434898799473
PotatoxCorn similarity: 0.10128264961738599
PotatoxTortillaChip similarity: 0.01771893229475145

PotatoChipxPotato similarity: 0.227644691705265
PotatoChipxTomato similarity: 0.0
PotatoChipxCornChip similarity: 0.11308019239539016
PotatoChipxCorn similarity: 0.0
PotatoChipxTortillaChip similarity: 0.09397531970758452

TomatoxPotato similarity: 0.22216517098618632
TomatoxPotatoChip similarity: 0.0
TomatoxCornChip similarity: 0.0
TomatoxCorn similarity: 0.09016596299800922
TomatoxTortillaChip similarity: 0.0

CornChipxPotato similarity: 0.017037434898799473
CornChipxPotatoChip similarity: 0.11308019239539016
CornChipxTomato similarity: 0.0
CornChipxCorn similarity: 0.05881360972552516
CornChipxTortillaChip similarity: 0.2859703144439937

CornxPotato similarity: 0.10128264961738599
CornxPotatoChip similarity: 0.0
CornxTomato similarity: 0.090165962

### Hypothesized categories: Chips, Corn-based

**Corn-based**: Corn, CornChip, TortillaChip<br/>
**Non-corn-based**: Potato, Tomato, PotatoChip<br/>

### Analysis

PotatoxPotatoChip similarity: 0.227644691705265<br/>
PotatoxTomato similarity: 0.22216517098618632<br/>
PotatoxCornChip similarity: 0.017037434898799473<br/>
PotatoxCorn similarity: 0.10128264961738599<br/>
PotatoxTortillaChip similarity: 0.01771893229475145<br/>

The similarity measures between Potato and the other documents support our proposed categories. Potato is most similar to Tomato and PotatoChip and least similar to the corn-based documents.

PotatoChipxPotato similarity: 0.227644691705265<br/>
PotatoChipxTomato similarity: 0.0<br/>
PotatoChipxCornChip similarity: 0.11308019239539016<br/>
PotatoChipxCorn similarity: 0.0<br/>
PotatoChipxTortillaChip similarity: 0.09397531970758452<br/>

PotatoChip is most similar to Potato, CornChip, and TortillaChip. Perhaps there should be a 'Chip vs Crop' dimension to this too.

TomatoxPotato similarity: 0.22216517098618632<br/>
TomatoxPotatoChip similarity: 0.0<br/>
TomatoxCornChip similarity: 0.0<br/>
TomatoxCorn similarity: 0.09016596299800922<br/>
TomatoxTortillaChip similarity: 0.0<br/>

Tomato is most similar to Potato and Corn. Both crops. 0 similarity to the chips.

CornChipxPotato similarity: 0.017037434898799473<br/>
CornChipxPotatoChip similarity: 0.11308019239539016<br/>
CornChipxTomato similarity: 0.0<br/>
CornChipxCorn similarity: 0.05881360972552516<br/>
CornChipxTortillaChip similarity: 0.2859703144439937<br/>

CornChip is most similar to TortillaChip, PotatoChip and Corn. Again, seeing evidence of the chip dimension here.

CornxPotato similarity: 0.10128264961738599<br/>
CornxPotatoChip similarity: 0.0<br/>
CornxTomato similarity: 0.09016596299800922<br/>
CornxCornChip similarity: 0.05881360972552516<br/>
CornxTortillaChip similarity: 0.13932861325850993<br/>

Corn is most similar to TortillaChip. Corn is actually more similar to Potato and Tomato than to CornChip. The Chip vs Crop dimension has shown up more in our similarity measure than our proposed hypotheses.

TortillaChipxPotato similarity: 0.01771893229475145<br/>
TortillaChipxPotatoChip similarity: 0.09397531970758452<br/>
TortillaChipxTomato similarity: 0.0<br/>
TortillaChipxCornChip similarity: 0.2859703144439937<br/>
TortillaChipxCorn similarity: 0.13932861325850993<br/>

TortillaChip is most similar to CornChip and Corn. This is fitting with our proposed categories.

### Explanation and Observations

I got the similarity calculations by simulating all pairwise dot products between the documents using my dictionaries.

I think the term 'tortilla' is what made Corn and TortillaChip have higher similarity than CornChip and Corn. I would have expected the opposite. My proposed categories held up pretty well but I think a second dimension of Crop vs Chip would also be necessary with this vocabulary. 

The similarity matrix calculation is important for mining clusters/categories of documents. 

# Takeaways

This process seems similar to ANOVA-like processes that focus on between vs within group variation, except in the document mining process we're using likelihood of word observation rather than variance. I think selecting the vocabulary to be used in the similarity calculation is very important for teasing out the proper result. It likely causes a lot of variation; we could test this by comparing against a randomly selected vocabulary. If I were to redo the project I'd try using another similarity calculation vs just the dot product to see how it changes my result.