In [6]:
import nltk
import numpy as np
import pandas as pd

from utility.processing import processer

# Notebook display settings: show up to 100 characters per cell and up to 100 rows per frame.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 100)

# preprocessing

### data load

In [7]:
# Load the raw train/test CSVs; fillna('') replaces every missing cell with an empty string.
train = pd.read_csv('./data/train.csv').fillna('')
test = pd.read_csv('./data/test.csv').fillna('')
# Short alias for the helper that every preprocessing step below is routed through.
# NOTE(review): the alias is taken from `processer` directly. The TypeError recorded under the
# lowercasing cell ("missing 1 required positional argument: 'func'") suggests the helper expects
# one more leading argument than the calls supply (possibly an instance) — confirm against
# utility.processing.
df_apply_func = processer.df_apply_func

In [8]:
# Preview the first rows with the last column left out.
train.iloc[:,:-1].head()

Unnamed: 0,id,query,product_title,product_description,median_relevance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2


In [9]:
# Preview the first rows with every column, including relevance_variance.
train.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471


### 대/소문자 통합 (예: LED/led, PS2/ps2, Levis/levis 등)

In [10]:
# Route processer.string_lower through df_apply_func for both splits. This is the only step
# that passes first_run=True.
# NOTE(review): the output recorded for this cell is
# "TypeError: df_apply_func() missing 1 required positional argument: 'func'", and every later
# cell shows no execution count. Presumably the helper's signature and this call are out of
# sync — confirm in utility.processing before relying on anything below.
train = df_apply_func(train, processer.string_lower, first_run=True)
test = df_apply_func(test, processer.string_lower, first_run=True)

TypeError: df_apply_func() missing 1 required positional argument: 'func'

In [None]:
# Spot-check the first rows after the lowercasing step.
train.head()

### 여러가지 pattern 제거

In [None]:
# Apply processer.remove_pattern to both splits via df_apply_func.
# Which patterns are stripped is defined in utility.processing and is not visible here.
train = df_apply_func(train, processer.remove_pattern)
test = df_apply_func(test, processer.remove_pattern)

In [None]:
# Spot-check the first rows after pattern removal.
train.head()

### 단어 양끝 특수기호(구두점) 제거

In [None]:
# Apply processer.punct to both splits via df_apply_func.
# Per the section heading, this strips special characters/punctuation at both ends of each word.
train = df_apply_func(train, processer.punct)
test = df_apply_func(test, processer.punct)

In [None]:
# Spot-check the first rows after punctuation stripping.
train.head()

### Tokenization

In [None]:
# Apply processer.tokenizer to both splits via df_apply_func.
# NOTE(review): the later steps presumably operate on the token output of this step — confirm.
train = df_apply_func(train, processer.tokenizer)
test = df_apply_func(test, processer.tokenizer)

In [None]:
# Spot-check the first rows after tokenization.
train.head()

### 어간, 표제어 추출 전 불용어 제거

In [None]:
# First stop-word pass (processer.remove_sw), run before stemming and lemmatization.
# The same helper is applied again after lemmatization further down.
train = df_apply_func(train, processer.remove_sw)
test = df_apply_func(test, processer.remove_sw)

### 어간추출

In [None]:
# Stemming step: apply processer.P_stemmer to both splits via df_apply_func.
# NOTE(review): the name suggests a Porter stemmer — confirm in utility.processing.
train = df_apply_func(train, processer.P_stemmer)
test = df_apply_func(test, processer.P_stemmer)

In [None]:
# Spot-check the first rows after stemming.
train.head()

### 표제어 추출(Lemmatization)

In [None]:
# Lemmatization step: apply processer.lemmatizer to both splits via df_apply_func.
# It runs on the output of the stemming step above.
train = df_apply_func(train, processer.lemmatizer)
test = df_apply_func(test, processer.lemmatizer)

In [None]:
# Spot-check the first rows after lemmatization.
train.head()

### 어간, 표제어 추출 후 불용어 제거

In [None]:
# Second stop-word pass (processer.remove_sw), run after stemming and lemmatization.
# NOTE(review): presumably this catches tokens that only became stop words once normalized — confirm.
train = df_apply_func(train, processer.remove_sw)
test = df_apply_func(test, processer.remove_sw)

In [None]:
# Spot-check the first rows after the second stop-word pass.
train.head()

### 전처리된 데이터 저장

In [None]:
# train.to_csv('./data/preprocessed_train.csv', index=False)
# test.to_csv('./data/preprocessed_test.csv', index=False)

In [None]:
from scipy.stats import truncnorm
def get_truncated_normal_sample(mean=2, sd=0, random_state=None, max_tries=1000):
    """Draw three rounded ratings whose median equals ``mean``.

    Values are drawn from a normal distribution centred on ``mean`` with
    scale ``sd``, truncated to the interval [1, 4] and rounded to the nearest
    integer. The draw is repeated until the median of the three rounded
    values equals ``mean``.

    Parameters
    ----------
    mean : number, default 2
        Centre of the distribution and the median the sample must reproduce.
    sd : number, default 0
        Scale of the distribution. ``0`` means no spread: three copies of
        ``mean`` are returned without sampling.
    random_state : None, int or numpy random state, optional
        ``None`` (default) draws from NumPy's global random state, exactly as
        before. An int seeds a private ``RandomState`` so the result is
        reproducible. Any other object is handed to SciPy unchanged.
    max_tries : int, default 1000
        Upper bound on the number of redraws.

    Returns
    -------
    numpy.ndarray
        Array of length 3 whose median equals ``mean``.

    Raises
    ------
    RuntimeError
        If no draw with the requested median is found within ``max_tries``
        attempts, e.g. when ``mean`` is not an integer and ``sd`` is non-zero,
        so the median of rounded values can never match.
    """
    if sd == 0:
        # Degenerate case: there is nothing to sample.
        return np.array([mean] * 3)

    # Turn an int seed into one generator up front; passing the bare int to
    # every rvs() call would replay the identical draw on each retry.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    # truncnorm takes its bounds in standard-deviation units relative to loc.
    # The distribution is built once and reused across retries.
    lower, upper = (1 - mean) / sd, (4 - mean) / sd
    distribution = truncnorm(lower, upper, loc=mean, scale=sd)

    # Bounded loop instead of unbounded recursion.
    for _ in range(max_tries):
        sample = np.round(distribution.rvs(size=3, random_state=rng))
        if np.median(sample) == mean:
            return sample

    raise RuntimeError(
        "could not draw a sample with median %r (sd=%r) in %d tries"
        % (mean, sd, max_tries)
    )

In [None]:
# Keep only the columns used from here on: the preprocessed text plus the rating statistics
# for train, and the id plus the preprocessed text for test.
# .copy() detaches each selection from the full frame, so later column assignments act on an
# independent DataFrame and do not raise SettingWithCopyWarning.
train = train[['query_preprocessed', 'product_title_preprocessed', 'product_description_preprocessed', 'median_relevance', 'relevance_variance']].copy()
test = test[['id', 'query_preprocessed', 'product_title_preprocessed', 'product_description_preprocessed']].copy()

In [None]:
# Seed NumPy's global random state so the sampled ratings, and therefore the CSV written at
# the end of the notebook, are identical on every Restart & Run All.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

# For every row, draw three ratings whose median equals the row's median_relevance.
# NOTE(review): relevance_variance is passed as the `sd` argument. The 0.471 seen in the preview
# equals the population standard deviation of three ratings such as [3, 4, 4], so the column
# looks like a standard deviation despite its name — confirm against the dataset description.
# assign() returns a new frame rather than writing into a column subset in place.
train = train.assign(
    sampling=train.apply(
        lambda row: get_truncated_normal_sample(row['median_relevance'], row['relevance_variance']),
        axis=1,
    )
)
train = train[['query_preprocessed', 'product_title_preprocessed', 'sampling']]

In [None]:
# Expand every source row into three rows, one per sampled rating: each query/title is
# repeated three times in place, and the per-row `sampling` arrays are flattened into the
# matching label column. The factor 3 mirrors the three draws per row made by
# get_truncated_normal_sample.
train = pd.DataFrame(
    {
        'query_preprocessed': train['query_preprocessed'].repeat(3),
        'product_title_preprocessed': train['product_title_preprocessed'].repeat(3),
        'median_relevance': np.concatenate(train['sampling'].to_numpy()),
    }
)

In [None]:
# Write the final train and test frames to CSV; index=False leaves the row index out of the files.
train.to_csv('./data/preprocessed_train1.csv', index=False)
test.to_csv('./data/preprocessed_test1.csv', index=False)