In [1]:
import os
import pandas as pd 
from ReviewSentence import ReviewSentence

In [2]:
dfs = []
versions = ['five', 'three']
for version in versions:
    source_dir = os.path.join('raw', version)
    for filename in os.listdir(source_dir):
        if filename == "Readme.txt":
            continue

        rows = []
        filepath = os.path.join(source_dir, filename)
        print("Processing %s.." % filepath)

        with open(filepath, encoding="utf-8") as lines:
            for ln in lines:
                line = ln.strip().replace("\t", " ")
                r = ReviewSentence.parse(line)
                if r is not None:
                    if r.sentence_type == 'review':
                        rows.append(r.to_row())
        df = pd.DataFrame.from_records(rows, columns=("content", "raw_targets"))
        df['filename'] = filename.replace('.txt', '')
        dfs.append(df)
concat_df = pd.concat(dfs, ignore_index=True)

before_ = len(concat_df)
concat_df = concat_df.drop_duplicates(subset=['content'])
print('# of duplicates:', before_ - len(concat_df))

Processing raw/five/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt..
Processing raw/five/Apex AD2600 Progressive-scan DVD player.txt..
Processing raw/five/Nikon coolpix 4300.txt..
Processing raw/five/Nokia 6610.txt..
Processing raw/five/Canon G3.txt..
Processing raw/three/Speaker.xml..
Processing raw/three/Router.xml..
Processing raw/three/Router.txt..
Processing raw/three/Speaker.txt..
Processing raw/three/Computer.xml..
Processing raw/three/Computer.txt..
# of duplicates: 49


In [3]:
def domain_by_filename(filename):
    if filename == 'Creative Labs Nomad Jukebox Zen Xtra 40GB': return 'MP3 player'
    if filename == 'Apex AD2600 Progressive-scan DVD player': return 'DVD player'
    if filename == 'Nikon coolpix 4300': return 'Digital camera2'
    if filename == 'Nokia 6610': return 'Cell phone'
    if filename == 'Canon G3': return 'Digital camera1'
    if filename == 'Speaker': return 'Speaker'
    if filename == 'Router': return 'Wireless router'
    if filename == 'Computer': return 'Computer'
concat_df['domain'] = concat_df.apply(lambda x: domain_by_filename(x['filename']), axis=1)

In [4]:
concat_df.head()

Unnamed: 0,content,raw_targets,filename,domain
0,"this is an edited review , now that i have had...",[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player
1,"while , there are flaws with the machine , the...",[affordability],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player
2,it is the most bang-for-the-buck out there .,[bang-for-the-buck],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player
3,"like it 's predecessor , the quickly revised n...","[size, weight, navigational system, sound]",Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player
4,the xtra improves upon the zen nx with a large...,[screen],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player


In [5]:
filepath = os.path.join('parsed', '%s_%d.json' % ('-'.join(versions), len(concat_df)))
concat_df.to_json(filepath)
print('Created: %s' % filepath)

Created: parsed/five-three_5995.json


In [6]:
filepath = os.path.join('parsed', '%s_%d.csv' % ('-'.join(versions), len(concat_df)))
concat_df.to_csv(filepath)
print('Created: %s' % filepath)

Created: parsed/five-three_5995.csv
