# MPES <span style='font-size:20px'>: Majority Parentheses Expression by Sister pages</span>

## Import Packages and initialization

In [None]:
import json
from collections import defaultdict, Counter
from nltk import word_tokenize
from math import log, sqrt
import numpy as np

## load page_category
`Format example`
```python
page_category = {
    'Bass (fish)' : ['Category:Fish common names'],
    'Star (automobile)' : ['Category:Defunct motor vehicle manufacturers of the United States', 'Category:Durant Motors'],
    ...
}

```

In [None]:
# JSON text is UTF-8 by specification; pin the encoding so page titles that
# contain non-ASCII characters load the same way on every platform instead of
# depending on the locale's default encoding.
with open('page-to-cats.json', encoding='utf-8') as f:
    page_category = json.load(f)

# Sanity check: categories of one known page.
print(page_category['Bass (fish)'])

# Step 1. Count the classes of categories
For each category, count how often each word (a possible class) appears in the titles of the pages under it
- you may use **`page_to_class()`**

example:
```python
category_class_count['Category:Fish common names']['fish'] = 16
```

In [None]:
# Extract the possible classes named in a page title.
def page_to_class(page):
    """Return the candidate classes found in the parentheses of a page title.

    Args:
        page: A page title such as ``'Bass (fish)'``.

    Returns:
        A list holding the text that follows the first ``'('`` with its last
        character (the closing parenthesis) removed, e.g. ``['fish']``.
        The list is empty when the title has no ``'('`` or when it contains
        the substring ``'disambiguation'``.
    """
    class_list = []
    if '(' in page and 'disambiguation' not in page:
        # Text after the first '(' minus the final character; this assumes the
        # title ends with the closing parenthesis of a single qualifier.
        class_list.append(page.split('(')[1][:-1])
    return class_list


# Sanity check. This call used to be indented under the function, after the
# `return`, where it could never execute.
print(page_to_class('Bass (fish)'))

In [None]:
# category -> class -> number of page titles under that category that carry
# that class in their parentheses.
category_class_count = defaultdict(lambda: defaultdict(lambda: 0))

for page, category_list in page_category.items():
    # `title_class` rather than `page_class`: a later cell defines a dict
    # named `page_class`, and a module-level loop variable would shadow it.
    for title_class in page_to_class(page):
        for category in category_list:
            # One vote per (page, category) pair for the class in the title.
            category_class_count[category][title_class] += 1


category_class_count['Category:Fish common names']['fish']

## load sentences

In [None]:
# JSON text is UTF-8 by specification; pin the encoding so the sentences load
# the same way on every platform instead of depending on the locale default.
with open('sentences.json', encoding='utf-8') as f:
    sentences = json.load(f)


# Peek at the first few sentences of one page (indexed as word -> page -> list).
sentences['bass']['Bass (fish)'][:5]

# 2. Decide class of wikipedia pages

## For each page in **sentences.json**, sum up the class counts of all the categories it belongs to.

example:
```python
page_class_count['Bass (fish)'] = 
    {
        ...,
        'mackerel': 2,
        'bass': 2,
        'fish': 16,
        'hake': 1,
        'sea': 1,
        ...
    }
```

In [None]:
# page -> class -> summed class count over every category the page belongs to.
page_class_count = defaultdict(lambda: defaultdict(lambda: 0))
# A page listed under more than one word must contribute its counts only once.
counted_pages = set()

for page_list in sentences.values():
    for page in page_list:
        if page in counted_pages:
            continue
        counted_pages.add(page)
        for category in page_category[page]:
            # .get() keeps the lookup from inserting empty entries into the
            # category_class_count defaultdict for categories never counted.
            for c, count in category_class_count.get(category, {}).items():
                page_class_count[page][c] += count


page_class_count['Bass (fish)']['fish']

## Then, for each page, choose the class with the maximum count.

example:
```python
page_class['Bass (fish)'] = 'fish'
```

In [None]:
# page -> the single class with the highest summed count for that page.
page_class = defaultdict(dict)

for page, class_counts in page_class_count.items():
    # Skip pages that accumulated no counts; max() would raise on an empty dict.
    if class_counts:
        # On ties, max() keeps the class that was inserted first.
        page_class[page] = max(class_counts, key=class_counts.get)


page_class['Bass (fish)']

# 3. Combine sentences which have the same class 

- construct __class_sents__ using __page_class__ & __sentences__ dictionaries

`Format`
```python
class_sents = {
    word: {
        Class: [ sent_1, sent_2, ... ]
    }
}


```
`Examples`

```python
1.
list(class_sents.keys())
['star',
 'mole',
 'galley',
 'cone',
 'bass',
 'bow',
 'taste',
 'interest',
 'issue',
 'duty',
 'sentence',
 'slug']

2. 
list(class_sents['bass'].keys())
['fish', 'music', 'sound']

3.
len(class_sents['bass']['fish']) = 668

```

In [None]:
# word -> class -> list of every sentence from the pages assigned to that class.
class_sents = defaultdict(lambda: defaultdict(list))

for word, v in sentences.items():
    for page, sents in v.items():
        # `page_class` is a defaultdict(dict): indexing an unknown page would
        # return an (unhashable) empty dict and also insert it. Test membership
        # first so pages without a decided class are simply skipped.
        if page in page_class:
            class_sents[word][page_class[page]].extend(sents)


print(list(class_sents['bass'].keys()))
print(len(class_sents['bass']['fish']))

# 4. Classification

## Basic Method - Yarowsky92

$ weight_{ijk} = log(\frac{Pr(w_i|Class_j)}{Pr(w_i|word_k)})$
<br><br>
where<br> 
$Pr(w_i|Class_j) = \frac{Counts\;of\;w_i\;in\;Class_j}{Counts\;of\;all\;words\;in\;Class_j}$<br><br>
$Pr(w_i|word_k) = \frac{Counts\;of\;w_i\;in\;word_k}{Counts\;of\;all\;words\;in\;word_k}$
<br><br>


`Instruction`

Step 1. For each $word_k$, tokenize every sentence and count all words in $word_k$ as N<br>
Step 2. For each $class_j$, count all words in $class_j$ as n<br>
Step 3. For each word $w_i$, count occurrences in $class_j$ and $word_k$, respectively.<br>
Step 4. Calculate $weight_{ijk}$


`Example`
```python
weight['bass']['fish']['freshwater'] = 3.0273651127101293
max(weight['bass']['fish'].items(), key = lambda x: x[1]) = ('bream', 3.1972901141524415)
```

In [None]:
# word -> class -> token -> log( Pr(token | class) / Pr(token | word) )
weight = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

for word, v in class_sents.items():
    # Lower-cased token counts per class of this word.
    total_words = {}
    for wiki_class, sents in v.items():
        total_words[wiki_class] = Counter(
            token.lower() for sent in sents for token in word_tokenize(sent)
        )

    # Token counts over all classes of this word, i.e. the counts "in word_k".
    word_counts = Counter()
    for counts_in_class in total_words.values():
        word_counts.update(counts_in_class)

    N = sum(word_counts.values())  # number of tokens in word_k
    for wiki_class, sents in v.items():
        n = sum(total_words[wiki_class].values())  # number of tokens in class_j
        for tks in total_words[wiki_class]:
            # Both probabilities are strictly positive here: `tks` occurs at
            # least once in this class and therefore at least once in the word.
            pr_class = total_words[wiki_class][tks] / n
            pr_word = word_counts[tks] / N
            weight[word][wiki_class][tks] = log(pr_class / pr_word)


print(weight['bass']['fish']['freshwater'])
print(max(weight['bass']['fish'].items(), key = lambda x: x[1]))

## Classify sentence with max weight 

$ score = \sum\limits_{w\in context}weight_w $

`Example`
```python
most_similar('bass', 'Bass, catfish, and bluegill also inhabit the creek.') 
    = ("fish", 14.814981643783032)

most_similar('issue', 'He\'s contributed to several publications, including LA Review of Books, Purple, Issue, and Hesperios Journal.') 
    = ("magazine", 5.123843862532843)

most_similar('sentence', 'If it finds the accused guilty, it passes sentence on the accused according to law.') 
    = ("law", 5.77941354006321)
```

In [None]:
def most_similar(word, query):
    """Pick the class of `word` whose summed token weights best fit `query`.

    Args:
        word: The ambiguous word; selects which set of classes in `weight`
            is scored.
        query: The sentence to classify.

    Returns:
        A ``(class, score)`` tuple for the highest-scoring class, where the
        score is the sum of `weight[word][class][token]` over the lower-cased
        tokens of `query`. Returns ``('', -inf)`` when `word` has no classes.
    """
    # `np.NINF` was removed in NumPy 2.0; `-np.inf` works on every version.
    max_class, max_value = ('', -np.inf)
    q_tokens = word_tokenize(query)
    # .get() is used throughout so that scoring never inserts new entries
    # into the nested `weight` defaultdicts.
    for wiki_class, token_weights in weight.get(word, {}).items():
        # Weights are keyed by lower-cased tokens, so lower-case the query too.
        score = sum(token_weights.get(token.lower(), 0.0) for token in q_tokens)
        if score > max_value:
            max_class, max_value = wiki_class, score
    return (max_class, max_value)

In [None]:
# Each case is [ambiguous word, expected class, sentence to classify].
test_data = [
    ['bass', 'fish', 'Bass, catfish, and bluegill also inhabit the creek.'],
    ['issue', 'magazine', 'He\'s contributed to several publications, including LA Review of Books, Purple, Issue, and Hesperios Journal.'],
    ['sentence', 'law', 'If it finds the accused guilty, it passes sentence on the accused according to law.'],
]

# Pass the test to get 100 points
for (word, wiki_class, query) in test_data:
    # most_similar returns a (class, score) pair that fills both placeholders.
    prediction = most_similar(word, query)
    print('Predict class: %s (%s)' % prediction)
    print('Correct class: %s\n' % wiki_class)

# Bonus - Classification 2

## Advanced Method - tf-idf weights


`term frequency`<br><br>$f_{i,j}$
<br>

`log normalization term frequency weight`<br><br>
$ 1 + log_2 f_{i,j} $,
<br>

where $f_{i,j}$ is the number of times term i occurs in document j, <br>
(here, one document is one class)
<br><br>
`inverse document frequency`<br><br>
$ log_2( \frac{N}{n_i}) $,

where $N$ is the number of documents, <br>
and $n_i$ is the number of documents in which term i occurs
<br><br>
`Examples`
```Python
term_doc_tf['star']['film']['role'] = 2.584962500721156

term_idf['star']['role'] = 1.5849625007211563
```

<br><br>
1. calculate log normalization term frequency weight (term_doc_tf)

In [None]:
# word -> class -> token -> log-normalized term frequency, 1 + log2(f).
# Each class of a word is treated as one document.
term_doc_tf = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

for word, v in class_sents.items():
    for classes, sents in v.items():
        # Lower-case each sentence first, then tokenize and count.
        tokens = Counter(tok for sent in sents for tok in word_tokenize(sent.lower()))
        # Counter keys are unique, so every token is written exactly once.
        for tok, freq in tokens.items():
            term_doc_tf[word][classes][tok] = 1 + log(freq, 2)


print(term_doc_tf['star']['film']['role'])

2. calculate inverse document frequency (term_idf)

In [None]:
# word -> token -> inverse document frequency, log2(N / n_i), where the
# documents are the classes of that word.
term_idf = defaultdict(lambda:defaultdict(lambda: 0.0))

for word, v in class_sents.items():
    # One token Counter per class ("document") of this word.
    sents_tokens = [
        Counter(tok for s in sents for tok in word_tokenize(s.lower()))
        for sents in v.values()
    ]
    N = len(sents_tokens)
    # Iterating a Counter yields each distinct token once, so this counts the
    # number of documents that contain each token.
    doc_freq = Counter(tok for doc in sents_tokens for tok in doc)
    for tok, n_i in doc_freq.items():
        term_idf[word][tok] = log(N / n_i, 2)


print(term_idf['star']['role'])

## Term frequency of the query and similarity

`Term frequency`<br><br>
$ 1 + log_2 f_{i, q} $

<br>

`Similarity`<br><br>
$ sim(d_j, q) = \frac{\Sigma^t_{i=1} w_{i,j} \times w_{i,q}}{ \sqrt{\Sigma^t_{i=1} w_{i,j}^2} \times \sqrt{\Sigma^t_{i=1} w_{i,q}^2}} $

where $w_{i,j} = (1 + log_2 f_{i,j}) \times log_2( \frac{N}{n_i})$

### Find most similar class

In [None]:
def query_term_freq(query):
    """Compute the log-normalized term frequency of every token in a query.

    Args:
        query: The sentence to weigh; it is lower-cased before tokenizing.

    Returns:
        A defaultdict mapping each distinct token to ``1 + log2(f)``, where
        ``f`` is how often the token occurs in the query. Tokens that are not
        in the query read as ``0.0``.
    """
    query_tf = defaultdict(lambda: 0.0)
    # Count every token in one pass instead of calling list.count() per token,
    # which rescans the whole token list each time.
    for tok, freq in Counter(word_tokenize(query.lower())).items():
        query_tf[tok] = (1 + log(freq, 2))
    return query_tf

In [None]:
def find_most_similar(word, query):
    """Pick the class of `word` whose tf-idf vector is most similar to `query`.

    Similarity is the cosine between the class vector, with weights
    ``term_doc_tf * term_idf``, and the query vector.

    NOTE(review): the query weight is taken to be the plain log-normalized
    term frequency from `query_term_freq`, with no idf factor. The notebook
    text only implies this; confirm against the expected results.

    Args:
        word: The ambiguous word; selects which classes are compared.
        query: The sentence to classify.

    Returns:
        A ``(class, similarity)`` tuple for the most similar class, or
        ``('', -inf)`` when `word` has no classes.
    """
    query_tf = query_term_freq(query)
    # `np.NINF` was removed in NumPy 2.0; `-np.inf` works on every version.
    max_class, max_value = ('', -np.inf)
    # .get() avoids inserting entries for unknown words into the defaultdicts.
    idf = term_idf.get(word, {})
    query_norm = sqrt(sum(w_q * w_q for w_q in query_tf.values()))
    for candid, terms in term_doc_tf.get(word, {}).items():
        # w_ij = (1 + log2 f_ij) * log2(N / n_i) for every term of the class.
        doc_weights = {tok: tf * idf.get(tok, 0.0) for tok, tf in terms.items()}
        doc_norm = sqrt(sum(w_d * w_d for w_d in doc_weights.values()))
        # Only terms present in the query contribute to the dot product.
        dot = sum(doc_weights.get(tok, 0.0) * w_q for tok, w_q in query_tf.items())
        denominator = doc_norm * query_norm
        # An all-zero vector (e.g. a word with a single class, where every idf
        # is 0) has no direction; treat its similarity as 0 instead of dividing by 0.
        similarity = dot / denominator if denominator else 0.0
        if similarity > max_value:
            max_class, max_value = candid, similarity

    return (max_class, max_value)

In [None]:
# please pass the test
for (word, classes, query) in test_data:
    # find_most_similar returns a (class, similarity) pair for the two placeholders.
    prediction = find_most_similar(word, query)
    print('Predict class: %s (%s)' % prediction)
    print('Correct class: %s\n' % classes)