In [1]:
import pandas as pd
from utility.processing import processer
import nltk

# Display settings: let pandas show up to 100 characters per column value
# and up to 100 rows when rendering a frame.
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 100

# preprocessing

### data load

In [2]:
# Load the train and test CSVs (train first, then test); missing values are
# replaced with empty strings.
train, test = [
    pd.read_csv(csv_path).fillna('')
    for csv_path in ('./data/train.csv', './data/test.csv')
]

# Project helper from utility.processing; its methods are the individual
# preprocessing steps applied in the cells that follow.
preprocessing = processer()

In [3]:
# Preview the first five rows of the raw training data.
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471


### 대/소문자 통합 (예: LED/led, PS2/ps2, Levis/levis 등)

In [4]:
# Lowercasing step, applied to train and then test. `first_run=True` is passed
# only here, on the first step -- presumably this is what creates the
# *_preprocessed columns seen in the next preview (TODO confirm in
# utility.processing).
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.string_lower, first_run=True)
    for frame in (train, test)
]

In [5]:
# Preview after lowercasing: the frame now also has query_preprocessed,
# product_title_preprocessed and product_description_preprocessed columns.
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decorations,accent pillow with heart design - red/black,"red satin accent pillow embroidered with a heart in black thread. 8"" x 8""."
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christmas lights,set of 10 battery operated multi led train christmas lights - clear wire,set of 10 battery operated train christmas lights item #x124210 features: color: multi-color bul...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewsonic pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,"concept housewares wr-44526 solid-wood ceiling/wall-mount wine rack, charcoal grey, 6 bottle","like a silent and sturdy tree, the southern enterprises bird and branch coat rack is an eye-catc..."
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen lighting christmas led light bulb (pack of 25),"wtgr1011\nfeatures\nnickel base, 60,000 average hours, acrylic resin bulb material\nchristmas li..."


### 여러 가지 패턴 제거

In [6]:
# Run the processer's `remove_pattern` step over train and then test.
# Which patterns are stripped is defined in utility.processing.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.remove_pattern)
    for frame in (train, test)
]

In [7]:
# Preview after pattern removal (in the saved output these first five rows
# look the same as before this step).
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decorations,accent pillow with heart design - red/black,"red satin accent pillow embroidered with a heart in black thread. 8"" x 8""."
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christmas lights,set of 10 battery operated multi led train christmas lights - clear wire,set of 10 battery operated train christmas lights item #x124210 features: color: multi-color bul...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewsonic pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,"concept housewares wr-44526 solid-wood ceiling/wall-mount wine rack, charcoal grey, 6 bottle","like a silent and sturdy tree, the southern enterprises bird and branch coat rack is an eye-catc..."
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen lighting christmas led light bulb (pack of 25),"wtgr1011\nfeatures\nnickel base, 60,000 average hours, acrylic resin bulb material\nchristmas li..."


### 단어 양끝 특수기호(구두점) 제거

In [8]:
# Run the processer's `punct` (punctuation removal) step over train and
# then test.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.punct)
    for frame in (train, test)
]

In [9]:
# Preview after punctuation removal: characters such as '-', '/', ',' and '"'
# no longer appear in the *_preprocessed columns of the saved output.
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decorations,accent pillow with heart design red black,red satin accent pillow embroidered with a heart in black thread 8 x 8
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christmas lights,set of 10 battery operated multi led train christmas lights clear wire,set of 10 battery operated train christmas lights item x124210 features color multi color bul...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewsonic pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,concept housewares wr 44526 solid wood ceiling wall mount wine rack charcoal grey 6 bottle,like a silent and sturdy tree the southern enterprises bird and branch coat rack is an eye catc...
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen lighting christmas led light bulb pack of 25,wtgr1011\nfeatures\nnickel base 60 000 average hours acrylic resin bulb material\nchristmas li...


### Tokenization

In [10]:
# Run the processer's `tokenizer` step over train and then test.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.tokenizer)
    for frame in (train, test)
]

In [11]:
# Preview after tokenization: in the saved output the line breaks inside
# product_description_preprocessed have been replaced by spaces.
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decorations,accent pillow with heart design red black,red satin accent pillow embroidered with a heart in black thread 8 x 8
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christmas lights,set of 10 battery operated multi led train christmas lights clear wire,set of 10 battery operated train christmas lights item x124210 features color multi color bulbs ...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewsonic pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,concept housewares wr 44526 solid wood ceiling wall mount wine rack charcoal grey 6 bottle,like a silent and sturdy tree the southern enterprises bird and branch coat rack is an eye catch...
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen lighting christmas led light bulb pack of 25,wtgr1011 features nickel base 60 000 average hours acrylic resin bulb material christmas light b...


#### 어간, 표제어 추출 전 불용어 제거

In [12]:
# First stop-word pass (`remove_sw`), run before stemming/lemmatization,
# over train and then test.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.remove_sw)
    for frame in (train, test)
]

### 어간 추출(Stemming)

In [13]:
# Stemming step (`P_stemmer` -- presumably a Porter stemmer; confirm in
# utility.processing), run over train and then test.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.P_stemmer)
    for frame in (train, test)
]

In [14]:
# Preview after stop-word removal and stemming: words like "with"/"of" are
# gone and tokens are reduced to stems (e.g. "decorations" -> "decor").
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decor,accent pillow heart design red black,red satin accent pillow embroid heart black thread 8 x 8
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christma light,set 10 batteri oper multi led train christma light clear wire,set 10 batteri oper train christma light item x124210 featur color multi color bulb match train ...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewson pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,concept housewar wr 44526 solid wood ceil wall mount wine rack charcoal grey 6 bottl,like silent sturdi tree southern enterpris bird branch coat rack eye catch addit home décor tree...
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen light christma led light bulb pack 25,wtgr1011 featur nickel base 60 000 averag hour acryl resin bulb materi christma light bulb stead...


### 표제어 추출(Lemmatization)

In [15]:
# Lemmatization step, run over train and then test.
# NOTE(review): this runs on tokens that were already stemmed by the previous
# step, and the saved preview after this cell is identical to the one after
# stemming -- confirm whether lemmatization should run before (or instead of)
# stemming.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.lemmatizer)
    for frame in (train, test)
]

In [16]:
# Preview after lemmatization (in the saved output these first five rows are
# unchanged from the stemming preview).
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decor,accent pillow heart design red black,red satin accent pillow embroid heart black thread 8 x 8
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christma light,set 10 batteri oper multi led train christma light clear wire,set 10 batteri oper train christma light item x124210 featur color multi color bulb match train ...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewson pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,concept housewar wr 44526 solid wood ceil wall mount wine rack charcoal grey 6 bottl,like silent sturdi tree southern enterpris bird branch coat rack eye catch addit home décor tree...
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen light christma led light bulb pack 25,wtgr1011 featur nickel base 60 000 averag hour acryl resin bulb materi christma light bulb stead...


### 어간, 표제어 추출 후 불용어 제거

In [17]:
# Second stop-word pass (`remove_sw`), run after stemming/lemmatization,
# over train and then test.
train, test = [
    preprocessing.df_apply_func(frame, preprocessing.remove_sw)
    for frame in (train, test)
]

In [18]:
# Final preview of the preprocessed training data (in the saved output these
# first five rows are unchanged by the second stop-word pass).
train.head(n=5)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,query_preprocessed,product_title_preprocessed,product_description_preprocessed
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,"Red satin accent pillow embroidered with a heart in black thread. 8"" x 8"".",1,0.0,bridal shower decor,accent pillow heart design red black,red satin accent pillow embroid heart black thread 8 x 8
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire,Set of 10 Battery Operated Train Christmas Lights Item #X124210 Features: Color: multi-color bul...,4,0.0,led christma light,set 10 batteri oper multi led train christma light clear wire,set 10 batteri oper train christma light item x124210 featur color multi color bulb match train ...
2,4,projector,ViewSonic Pro8200 DLP Multimedia Projector,,4,0.471,projector,viewson pro8200 dlp multimedia projector,
3,5,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle","Like a silent and sturdy tree, the Southern Enterprises Bird and Branch Coat Rack is an eye-catc...",4,0.0,wine rack,concept housewar wr 44526 solid wood ceil wall mount wine rack charcoal grey 6 bottl,like silent sturdi tree southern enterpris bird branch coat rack eye catch addit home décor tree...
4,7,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25),"WTGR1011\nFeatures\nNickel base, 60,000 average hours, acrylic resin bulb material\nChristmas li...",2,0.471,light bulb,wintergreen light christma led light bulb pack 25,wtgr1011 featur nickel base 60 000 averag hour acryl resin bulb materi christma light bulb stead...


### 전처리된 데이터 저장

In [19]:
# Persist the preprocessed frames as CSV (without the index column).
# Writing is opt-in so that re-running the notebook does not overwrite
# existing files: set SAVE_PREPROCESSED to True to export.
SAVE_PREPROCESSED = False

if SAVE_PREPROCESSED:
    train.to_csv('./data/preprocessed_train.csv', index=False)
    test.to_csv('./data/preprocessed_test.csv', index=False)