In [None]:
# import module
import re
import csv
import time
import pandas as pd
import matplotlib.pyplot as plt
import MeCab
import random
import numpy as np

# Load the tweet archive (tweets.csv downloaded from twitter.com).
tw_df = pd.read_csv(
    './mytweetdata/tweets.csv',
    encoding='utf-8',
)

# Load the PN (polarity) table, downloaded from
# http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html
# The file is Shift-JIS encoded, colon-separated and has no header row,
# so the column names are supplied here.
pn_df = pd.read_csv(
    './pn_ja.dic.txt',
    sep=':',
    encoding='shift-jis',
    names=['Word', 'Reading', 'POS', 'PN'],
)

In [None]:
# Flatten every tweet body onto a single line by turning newlines into spaces.
text_list = [body.replace('\n', ' ') for body in tw_df['text']]

In [None]:
# Shared MeCab tagger, created with an empty option string; get_diclist() uses it.
m = MeCab.Tagger("")

def get_diclist(text, tagger=None):
    """Morphologically analyse ``text`` and return one dict per morpheme.

    Parameters
    ----------
    text : str
        Text to analyse.
    tagger : object, optional
        Any object exposing ``parse(str) -> str`` whose output consists of
        ``surface<TAB>feature,feature,...`` lines followed by an ``EOS``
        line. Defaults to the module-level tagger ``m``.

    Returns
    -------
    list of dict
        One dict per morpheme with the keys ``'Surface'``, ``'POS1'``,
        ``'POS2'`` and ``'BaseForm'`` (1st, 2nd and 7th feature fields).
        A missing feature field is reported as ``'*'``.
    """
    if tagger is None:
        tagger = m
    parsed = tagger.parse(text)  # whole analysis as one newline-separated string

    diclist = []
    for line in parsed.split('\n'):
        # Skip the end-of-sentence marker and the empty trailing line.
        if not line or line == 'EOS':
            continue
        # Split on the FIRST tab only: the surface form itself may contain a
        # comma, so it must not be split together with the feature list.
        surface, tab, feature_str = line.partition('\t')
        if not tab:
            continue  # not a "surface<TAB>features" line
        features = feature_str.split(',')
        # Pad short feature lists so the base-form lookup cannot go out of range.
        features += ['*'] * (7 - len(features))
        diclist.append({
            'Surface': surface,
            'POS1': features[0],
            'POS2': features[1],
            'BaseForm': features[6],
        })
    return diclist

In [None]:
# Turn the PN table into a plain word -> score lookup.
# Scores are stored as read from the frame; add_pnvalue() coerces them with float().
word_list, pn_list = list(pn_df['Word']), list(pn_df['PN'])
pn_dict = {entry: score for entry, score in zip(word_list, pn_list)}


def add_pnvalue(diclist_old, pn_table=None):
    """Return copies of the morpheme dicts with a ``'PN'`` entry added.

    Parameters
    ----------
    diclist_old : list of dict
        Morpheme dicts; each must contain a ``'BaseForm'`` key.
    pn_table : mapping, optional
        Base form -> polarity score. Defaults to the module-level ``pn_dict``.

    Returns
    -------
    list of dict
        New dicts (the input dicts are left unmodified). ``'PN'`` is the
        score as a ``float``, or the string ``'notfound'`` when the base
        form is not in the table.
    """
    if pn_table is None:
        pn_table = pn_dict

    diclist_new = []
    for word in diclist_old:
        base = word['BaseForm']
        if base in pn_table:
            pn = float(pn_table[base])
        else:
            pn = 'notfound'  # sentinel that get_pnmean() filters out
        # Copy instead of writing into the caller's dict, so the input list
        # is not altered as a side effect.
        annotated = dict(word)
        annotated['PN'] = pn
        diclist_new.append(annotated)
    return diclist_new

In [None]:
def get_pnmean(diclist):
    """Return the mean PN score of one tweet's morphemes.

    Entries whose ``'PN'`` is the string ``'notfound'`` are ignored.
    When no entry carries a score, ``0`` is returned.
    """
    scores = [entry['PN'] for entry in diclist if entry['PN'] != 'notfound']
    if not scores:
        return 0
    return np.mean(scores)

In [None]:
# One mean PN score per tweet, in the same order as tw_df['text']:
# analyse -> attach PN values -> average.
pnmeans_list = [
    get_pnmean(add_pnvalue(get_diclist(tw)))
    for tw in tw_df['text']
]

In [None]:
# Build a frame holding tweet ID, single-line body text and mean PN score.
aura_df = pd.DataFrame(
    {
        'tweet_id': tw_df['tweet_id'],
        'text': text_list,
        'PN': pnmeans_list,
    },
    columns=['tweet_id', 'text', 'PN'],
)

# Sort in ascending order of PN values (most negative tweets first).
aura_df = aura_df.sort_values(by='PN', ascending=True)

# Write the result as CSV without the index column; non-numeric fields are quoted.
# `index` is documented as a bool, so pass False explicitly rather than None.
# Original author's note: specify Shift-JIS instead of UTF-8 if the file is
# meant to be opened in Excel.
aura_df.to_csv(
    'aura.csv',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

In [None]:
# Reload the exported aura.csv from the working directory.
show_aura = pd.read_csv(
    './aura.csv',
    encoding='utf-8',
)

In [None]:
# Display the frame reloaded from aura.csv as the cell output.
show_aura