In [1]:
%matplotlib inline
from difflib import SequenceMatcher
from functools import partial, wraps

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:
# All four source files share the "▒" field separator and are parsed with
# pandas' Python engine.

# Review data
review_df = pd.read_csv("../data/reviews.csv", sep="▒", engine="python")
# Item data
item_df = pd.read_csv("../data/item.csv", sep="▒", engine="python")
# Category data
cat_df = pd.read_csv("../data/category.csv", sep="▒", engine="python")
# Brand data
brand_df = pd.read_csv("../data/brand.csv", sep="▒", engine="python")

## 전처리

In [3]:
def normalize_int(num):
    """Convert ``num`` to ``int``, falling back to 0 when it cannot be converted.

    Parameters
    ----------
    num : any
        Value to convert (e.g. a number, a numeric string, ``None`` or NaN).

    Returns
    -------
    int
        ``int(num)``, or 0 when the conversion fails.
    """
    try:
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types, ValueError: non-numeric text or NaN,
        # OverflowError: +/-inf.  Only conversion failures are mapped to 0, so that
        # KeyboardInterrupt and other unrelated exceptions are no longer swallowed.
        return 0
    
def normalize_price(price):
    """Convert a raw price cell into an integer amount.

    Accepted inputs:

    * ``int`` values are returned unchanged; numpy integers are converted to ``int``.
    * finite floats are truncated to ``int``; NaN / infinite floats yield ``None``.
    * strings may contain thousands separators (","), surrounding whitespace
      and a trailing '원' suffix, e.g. ``"12,000원"`` -> ``12000``.

    Parameters
    ----------
    price : int, float or str
        Raw price value.

    Returns
    -------
    int or None
        The parsed price, or ``None`` when the value is missing or cannot be
        parsed (a short diagnostic is printed in the unparseable case).
    """
    if isinstance(price, int):
        return price
    if isinstance(price, np.integer):
        return int(price)
    if isinstance(price, (float, np.floating)):
        # NaN/inf cannot be converted to int; treat them as "no price".
        return int(price) if np.isfinite(price) else None
    if not isinstance(price, str):
        # Previously this case printed a message and then crashed on price[-1].
        print("price : ", price, "unsupported type", type(price).__name__)
        return None

    cleaned = price.replace(",", "").strip()
    # endswith() is safe on the empty string, unlike indexing with [-1].
    if cleaned.endswith('원'):
        cleaned = cleaned[:-1]
    try:
        return int(cleaned)
    except ValueError as e:
        print(e)
        return None

### category dataframe 전처리

In [4]:
# Category ids: coerce every cell to int; cells that cannot be converted become 0
# (see normalize_int).
cat_df[['cat1_id','cat2_id','cat3_id','cat4_id']] =\
cat_df[['cat1_id','cat2_id','cat3_id','cat4_id']].applymap(normalize_int)

# Category titles: replace missing values with the empty string.
# fillna is the vectorized equivalent of mapping a per-cell null check over the frame.
cat_df[['cat1_title','cat2_title','cat3_title','cat4_title']] =\
cat_df[['cat1_title','cat2_title','cat3_title','cat4_title']].fillna("")

### item dataframe 전처리

In [5]:
# Prices: convert every cell of the two price columns with normalize_price.
item_df[['min_price','max_price']] =\
item_df[['min_price','max_price']].applymap(normalize_price)

# reg_date: drop trailing '.' characters, then parse as year.month.
# The dtype guard makes this cell safe to re-run after the column has already
# been converted; missing or empty values become NaT instead of raising.
if not pd.api.types.is_datetime64_any_dtype(item_df['reg_date']):
    item_df['reg_date'] = pd.to_datetime(
        item_df['reg_date'].str.rstrip('.'), format="%Y.%m"
    )

In [6]:
def memoize(func):
    """Decorator that caches the results of ``func`` per distinct call arguments.

    The cache key is built from the ``repr`` of the positional arguments and of
    the keyword arguments sorted by name, so unhashable arguments (such as
    lists) are supported and the order in which keyword arguments are written
    does not matter.  Arguments must therefore have a ``repr`` that reflects
    their value.

    Parameters
    ----------
    func : callable
        Function whose results should be cached.

    Returns
    -------
    callable
        Wrapper with the same call signature (and name/docstring) as ``func``.
    """
    cache = {}

    @wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def memoizer(*args, **kwargs):
        # Sorting by keyword name makes f(a=1, b=2) and f(b=2, a=1) share one entry.
        key = repr(args) + repr(sorted(kwargs.items(), key=lambda item: item[0]))
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return memoizer

In [7]:
def split_category(index, categories):
    """Return one level of a '>'-separated category path.

    Parameters
    ----------
    index : int
        Zero-based position of the level to extract.
    categories : str
        Category path such as ``"A>B>C"``.

    Returns
    -------
    str
        The name at position ``index``, or ``""`` when the path has no such
        level or ``categories`` is not a string (e.g. a missing value / NaN).
    """
    if not isinstance(categories, str):
        # Missing cells are not strings and have no .split(); treat them as
        # "no category" rather than raising AttributeError.
        return ""
    try:
        return categories.split('>')[index]
    except IndexError:
        return ""

In [12]:
@memoize
def find_similar(match_list, input_str):
    """Return the entry of ``match_list`` that is most similar to ``input_str``.

    Similarity is ``difflib.SequenceMatcher(None, input_str, candidate).ratio()``.
    When several candidates share the best ratio, the one that appears first in
    ``match_list`` is returned.

    Parameters
    ----------
    match_list : iterable of str
        Candidate strings.
    input_str : str
        String to match against the candidates.

    Returns
    -------
    str
        The best-matching candidate.

    Raises
    ------
    IndexError
        If ``match_list`` is empty.
    """
    candidates = list(match_list)
    if not candidates:
        raise IndexError("find_similar() needs at least one candidate in match_list")
    # max() scans the candidates once; no intermediate Series has to be grown
    # and sorted just to pick the top entry.
    return max(
        candidates,
        key=lambda target_str: SequenceMatcher(None, input_str, target_str).ratio(),
    )

In [9]:
# Split raw_category on '>' and store each of the four levels in its own
# cat{n}_title column (level n uses position n-1 of the split path).
for i in range(1,5):
    item_df['cat{}_title'.format(i)] = item_df['raw_category'].map(
        partial(split_category, i - 1)
    )

In [13]:
# Item titles that do not appear in cat_df at the same level are replaced by
# the most similar title from cat_df (see find_similar).
for i in range(1,5):
    cat_title = "cat{}_title".format(i)
    cat_list = list(cat_df[cat_title].unique())
    not_in_category = ~item_df[cat_title].isin(cat_list)
    if not_in_category.any():
        item_df.loc[not_in_category, cat_title] = (
            item_df.loc[not_in_category, cat_title]
            .apply(partial(find_similar, cat_list))
        )

In [14]:
# Sanity check: at every level, each item title must also exist in cat_df.
for i in range(1,5):
    unique_set = set(item_df['cat{}_title'.format(i)]).difference(
        cat_df['cat{}_title'.format(i)]
    )
    assert not unique_set, (
        "[category level:{}] not Exist in category dataFrame---{}".format(i, unique_set)
    )

### review dataframe 전처리

In [None]:
# Parse review dates using the '%Y.%m.%d.' layout (the format ends with a dot).
review_df['review_date'] = pd.to_datetime(review_df.review_date, format='%Y.%m.%d.')
# Cast the grade and the item id columns to int.
review_df['review_grade'] = review_df['review_grade'].astype(int)
review_df['item_id'] = review_df['item_id'].astype(int)

### Data Merging 전처리

In [None]:
# Inner-join each review with its item row.
# NOTE(review): right_index=True matches review_df.item_id against the *index* of
# item_df, not against a column — confirm that item_df's index really holds item ids.
merged_df = review_df.merge(item_df, how='inner', left_on='item_id', right_index=True)

In [None]:
# Persist the merged frame with the same "▒" separator as the inputs, without the index.
merged_df.to_csv(path_or_buf="../data/merge.csv", sep="▒", index=False)

In [None]:
from konlpy.tag import Twitter
import nltk

In [None]:
# Instantiate the konlpy Twitter tagger used by all the analysis cells below.
twitter = Twitter()

In [None]:
# Use the text ('review_atc') of the first review as the example sentence.
target_sentence = review_df['review_atc'].iloc[0]

In [None]:
# Parse the sentence into morphemes
print(twitter.morphs(target_sentence))

In [None]:
# Noun extractor: print the nouns found in the example sentence
print(twitter.nouns(target_sentence))

In [None]:
# Phrase extractor: print the phrases found in the example sentence
print(twitter.phrases(target_sentence))

In [None]:
# POS tagger: first with normalization only (norm=True), then with
# normalization plus stemming (norm=True, stem=True), separated by a rule line.
print(twitter.pos(target_sentence,norm=True))
print('\n--------------------------------------------------------------\n')
print(twitter.pos(target_sentence,norm=True,stem=True))

norm - 정규화(normalization)
    
    한국어를 처리하는 예시입니닼ㅋㅋ -> 한국어를 처리하는 예시입니다ㅋㅋ

stem - 어근화(stemming)

    한국어를 처리하는 예시입니다 ㅋㅋ -> 한국어Noun, 를Josa, 처리Noun, 하다Verb, 예시Noun, 이다Adjective, ㅋㅋKoreanParticle

### 구문 분석

reference : http://konlpy.org/ko/v0.4.3/examples/chunking/

In [None]:
# POS-tag the example sentence (normalized and stemmed); this is the input to the chunk parser below.
words = twitter.pos(target_sentence,norm=True,stem=True)

In [None]:
# Define a chunk grammar (chunking rules), then chunk the POS-tagged words
grammar = """
NP: {<N.*>*<Suffix>?}   # Noun phrase
VP: {<V.*>*}            # Verb phrase
AP: {<A.*>*}            # Adjective phrase
"""
parser = nltk.RegexpParser(grammar)
chunks = parser.parse(words)

print("# Print whole tree")
# Tree.pprint() prints by itself and returns None, so print(tree.pprint())
# emitted a stray "None"; pformat() returns the formatted text instead.
print(chunks.pformat())

print("\n# Print noun phrases only")
for subtree in chunks.subtrees():
    if subtree.label()=='NP':
        # Each child of the subtree is indexed with [0] to get its word.
        print(' '.join(e[0] for e in subtree))
        print(subtree.pformat())

# Display the chunk tree
# NOTE(review): draw() presumably opens a separate GUI window — confirm it is
# usable in the environment where this notebook runs.
chunks.draw()