# esa-summarize
esa-summarize was created to gain a practical understanding of text summarization.

This file covers the following:
- setup esa-summarize with dependencies
- preparation to retrieve esa posts
- retrieving posts
- parse text files of posts
- TF-IDF calculation

## Setup
Before opening this file, complete the following setup:

```
$ git clone https://github.com/ichiro-arai/esa-summarize.git
$ cd esa-summarize
$ brew install pyenv
$ echo 'export PYENV_ROOT="${HOME}/.pyenv"' >> ~/.bash_profile
$ echo 'export PATH="$PATH:${PYENV_ROOT}/bin"' >> ~/.bash_profile
$ echo 'eval "$(pyenv init -)"' >> ~/.bash_profile
$ source ~/.bash_profile
$ pyenv install anaconda3-4.1.1
$ pyenv local anaconda3-4.1.1
$ brew install mecab
$ brew install mecab-ipadic
$ pip install mecab-python3
$ jupyter notebook
```
Then, open this ipynb file.

In [None]:
# TODO modify dictionary, append Wikipedia titles

## Preparation

In [None]:
import os
# Personal access token, generated at https://[team].esa.io/user/tokens.
# Read from the environment; a missing variable raises KeyError here.
token = os.environ['ESA_ACCESS_TOKEN']

# Headers sent with every API request: the token as a Bearer credential.
headers = dict(Authorization='Bearer {0}'.format(token))

# Base URL that request paths are appended to (see https://docs.esa.io/posts/102).
prefix = 'https://api.esa.io'

# Team name used when building request paths.
team = os.environ['ESA_TEAM']

In [None]:
import json

def json_load(filename):
    """Read the JSON document stored in ``filename`` and return it decoded.

    The file is opened explicitly as UTF-8 so that non-ASCII text (the posts
    handled in this notebook contain Japanese) is decoded the same way
    regardless of the platform's locale encoding.

    Raises whatever ``open`` or ``json.load`` raise, e.g. when the file is
    missing or does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
import requests
import os.path

def request_dump(request, filename, timeout=30):
    """Fetch ``request`` from the API and save the response body to a file.

    request  -- path (and query string) appended to the module-level
                ``prefix`` to form the full URL
    filename -- path handed to ``dump``, which writes the response text
    timeout  -- seconds passed to ``requests.get``; without a timeout the
                call can block indefinitely if the server never answers

    The module-level ``headers`` are sent with the request. Any exception
    raised by ``requests.get`` or by ``dump`` propagates to the caller.
    """
    r = requests.get(prefix + request, headers=headers, timeout=timeout)
    dump(r, filename)

def request_dump_if_not_exist(request, filename):
    """Download ``request`` into ``filename`` unless that path already exists.

    An existing path is left untouched and no request is made, so a
    previously saved response acts as a cache. Returns None either way.
    """
    if not os.path.exists(filename):
        request_dump(request, filename)

def dump(res, filename):
    """Write the text of a successful HTTP response to ``filename``.

    res      -- response object exposing ``status_code`` and ``text``
    filename -- destination path; created or overwritten

    Raises RuntimeError when the status code is not 200. The check is an
    explicit ``raise`` rather than ``assert`` so it still runs when Python
    is started with ``-O``. Nothing is written in that case.

    The file is written as UTF-8 so non-ASCII response text is stored the
    same way regardless of the platform's locale encoding.
    """
    if res.status_code != 200:
        raise RuntimeError(
            'unexpected HTTP status {0} for {1}'.format(res.status_code, filename))
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(res.text)

## Retrieve Posts

In [None]:
# Request the team's post list, asking the API to include stargazers and
# comments, and save the raw response as posts.json. The request is skipped
# when posts.json already exists.
# NOTE(review): only one request is made here; if this endpoint paginates its
# results, posts.json holds just the first page — confirm against the API docs.
url = '/v1/teams/{0}/posts?include=stargazers,comments'.format(team)
request_dump_if_not_exist(url, 'posts.json')

In [None]:
### Is this needed? Fetch every post individually by its post_number (currently disabled)
# import time
# wait = (15 * 60) / 75  # api call limit of request is 75 times per 15 minutes
#
# def wait_retry(request_func, post_numbers):
#     post_numbers = post_numbers.copy()
#     while (len(post_numbers) > 0):
#         try:
#             time.sleep(wait)
#             post_num = post_numbers.pop(0)
#             request_func(post_num)
#         except Exception:
#             print('failed: %d' % post_num)
#             post_numbers.append(post_num)
#
# def request_func(post_num):
#     url = '/v1/teams/{0}/posts/{1}?include=comments'.format(team, post_num)
#     filename = 'post-{0}.json'.format(post_num)
#     request_dump_if_not_exist(url, filename)
#
# posts_json = json_load('posts.json')
# post_numbers = [post['number'] for post in posts_json['posts']]
# wait_retry(request_func, post_numbers)

## Parse text files

In [None]:
# Load the saved API response and keep only the value under its 'posts' key.
posts = json_load('posts.json')['posts']

In [None]:
def filename(post):
    """Return the name of the file holding the parsed nouns of ``post``.

    ``post`` must support ``post['number']``; that value is embedded in the
    returned name.
    """
    number = post['number']
    return 'parsed-post-{0}.csv'.format(number)

In [None]:
import MeCab

def to_itr(res):
    """Yield ``res`` and then each node reached by following ``.next``.

    Iteration stops at the first falsy node, so passing a falsy ``res``
    yields nothing.
    """
    node = res
    while True:
        if not node:
            return
        yield node
        node = node.next

mecab = MeCab.Tagger('-Ochasen')
# Workaround kept from the original: parse an empty string once before
# calling parseToNode.
# NOTE(review): presumably this keeps node surfaces from being invalidated by
# garbage collection in mecab-python3 — confirm before removing.
mecab.parse('')

for post in posts:
    # Tokenize first, so a parsing failure does not leave a truncated file.
    parsed = mecab.parseToNode(post['body_md'])
    # Keep the surface form of every token whose first feature field is
    # '名詞' (noun).
    nouns = [node.surface for node in to_itr(parsed)
             if node.feature.split(',')[0] == '名詞']
    # One noun per line. The file is written explicitly as UTF-8 because the
    # TF-IDF step reads it back through TfidfVectorizer(input='filename'),
    # which decodes files as UTF-8 by default.
    with open(filename(post), 'w', encoding='utf-8') as f:
        f.writelines(noun + '\n' for noun in nouns)

## TF-IDF for each user post

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Parsed-noun file for each post, in the same order as `posts`.
filenames = list(map(filename, posts))

In [None]:
# Build the TF-IDF model over the parsed-post files. With input='filename'
# the vectorizer is given paths and reads each file itself.
# NOTE(review): the default token pattern of TfidfVectorizer appears to keep
# only tokens of two or more word characters, so single-character nouns are
# probably dropped — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(input='filename')
# One matrix row per entry of `filenames`, in the same order.
tfidf_matrix = tfidf_vectorizer.fit_transform(filenames)

In [None]:
# Vocabulary terms, indexed by column of the TF-IDF matrix.
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one so the notebook runs on both old and new installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # tolist() turns the returned array into a plain list of str, matching
    # what get_feature_names() used to return.
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
# Dense array of scores: tfidfs[i][j] is the score of terms[j] in document i.
tfidfs = tfidf_matrix.toarray()

In [None]:
def find_index(posts, pred):
    """Lazily yield the positions in ``posts`` whose entry satisfies ``pred``.

    ``pred`` is called with a single ``(index, post)`` tuple and its result
    is tested for truthiness. The returned object is a one-shot iterator of
    the matching indexes, in ascending order.
    """
    return (
        position
        for position, item in enumerate(posts)
        if pred((position, item))
    )

def find_user_posts(posts, user):
    """Return an iterator over the indexes of posts created by ``user``.

    A post matches when ``post['created_by']['screen_name']`` equals
    ``user``. The search itself is delegated to ``find_index``.
    """
    def written_by_user(indexed_post):
        # find_index hands over (index, post) pairs; only the post matters.
        _, post = indexed_post
        return post['created_by']['screen_name'] == user

    return find_index(posts, written_by_user)

In [None]:
# Distinct author screen names across all posts.
all_users = {post['created_by']['screen_name'] for post in posts}

n = 50  # maximum number of terms listed per post
# Iterate in sorted order so the report is reproducible; the iteration order
# of a set of strings can differ from one run to the next.
for user in sorted(all_users):
    print('-' * 10)
    print(user)
    for i in find_user_posts(posts, user):
        tfidf_array = tfidfs[i]
        # Indexes of the n highest scores, best first.
        top_n_idx = tfidf_array.argsort()[-n:][::-1]
        # Drop zero scores: those terms do not occur in this post and would
        # otherwise pad the list whenever a post has fewer than n terms.
        words = [terms[idx] for idx in top_n_idx if tfidf_array[idx] > 0]
        print('{0}. {1}: '.format(posts[i]['number'], posts[i]['name']))
        print(words)
        print()

## Text Summarization by each user

In [None]:
# TODO