In [None]:
from google.colab import drive

# Mount Google Drive (Colab runtime only) so the CSV files under /content/drive can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Folder on the mounted Drive that holds the competition CSV files.
base_path = '/content/drive/MyDrive/LG-Aimers/phase2'
df_train = pd.read_csv(f"{base_path}/train.csv") # training data
df_test = pd.read_csv(f"{base_path}/submission.csv") # test data (the rows of the submission file)

# 기본 전처리 (범주형 특성에 대해서만 적용됨)
- 공백, 특수문자 모두 제거
- 대문자로 변환

In [None]:
# Categorical columns that will be label-encoded; the text normalisation
# in this cell is applied to exactly these columns.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# ASCII punctuation characters that are stripped from every categorical value.
special_characters = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

# Deletion table built once from special_characters, so each call is a single
# pass over the text instead of one str.replace pass per character.
# NOTE: rebuild this table if special_characters is modified afterwards.
_SPECIAL_CHARACTER_TABLE = str.maketrans('', '', ''.join(special_characters))


def remove_special_characters(text):
    """Return ``text`` with every character listed in ``special_characters`` removed.

    Args:
        text: The string to clean. Must be a ``str``; callers are expected to
            filter out missing values before calling.

    Returns:
        A new string containing the characters of ``text`` in their original
        order, minus any character found in ``special_characters``.
    """
    return text.translate(_SPECIAL_CHARACTER_TABLE)


# Normalise every categorical column of both frames: remove spaces and special
# characters, then upper-case. Missing values are passed through unchanged.
def _normalize_category(value):
    """Return the cleaned, upper-cased form of ``value``; NaN/None is returned as-is."""
    if pd.isna(value):
        return value
    without_spaces = value.replace(' ', '')
    return remove_special_characters(without_spaces).upper()


for frame in (df_train, df_test):
    for col in label_columns:
        frame[col] = frame[col].apply(_normalize_category)

# inquiry_type 분석

1.6% 결측됨

## 상위 카테고리 찾기
- QUOTATIONORPURCHASECONSULTATION: 견적문의, 구매 상담
- CONSULTATION
- REQUEST
- INQUIRY
- OTHERS

## 겹치는 값 찾기
- OTHER, OTHERS, OTHER_, NOTSPECIFIED, ETC.
- QUOTATION_OR_PURCHASE_CONSULATATION, QUOTATIONORPURCHASECONSULTATION, QUOTATION_, PURCHASEORQUOTATION
- USAGEORTECHNICALCONSULLTATION, USAGE_OR_TECHNICAL_CONSULTATION
- TECHNICALCONSULTATION, TECHNICAL_CONSULTATION

## 전처리 방법
- '_' 제거
- OTHER, OTHERS 등 '기타'를 의미하는 값은 ETC로 통합
- value_counts() 기준 빈도가 2 이하인 값은 ETC로 통합
- imputing: -1로 채우기, IterativeImputer, KNNImputer

아래 코드는 위의 전처리 과정 적용하는 코드. 전처리 적용해서 22개로 줄임

In [None]:
df_train_copy = df_train.copy()

# Merge the alternate spellings of "other" into the single ETC bucket.
inquiry_type = df_train_copy['inquiry_type'].replace({'OTHER': 'ETC', 'OTHERS': 'ETC'})

# Values that occur at most twice (counted after the merge above) also become ETC.
type_counts = inquiry_type.value_counts()
low_freq_values = type_counts.index[type_counts <= 2]
df_train_copy['inquiry_type'] = inquiry_type.mask(inquiry_type.isin(low_freq_values), 'ETC')

# Report how many distinct categories remain and their frequencies.
remaining_counts = df_train_copy['inquiry_type'].value_counts()
print(f"개수: {len(remaining_counts)}")
print(remaining_counts)

개수: 22
QUOTATIONORPURCHASECONSULTATION    42138
SALESINQUIRY                        9981
ETC                                 1442
PRODUCTINFORMATION                  1237
USAGEORTECHNICALCONSULTATION        1190
TRAININGS                            434
TECHNICALCONSULTATION                433
SERVICES                             415
REQUESTFORPARTNERSHIP                297
REQUESTFORQUOTATIONORPURCHASE        230
REQUESTADEMO                         184
TECHNICALSUPPORT                     110
SALES                                100
REQUESTFORDISTRIBUTORSHIP             75
REQUESTFORTECHNICALCONSULTING         37
CUSTOMERSUGGESTIONS                   12
PURCHASEORQUOTATION                   10
IDB                                   10
TECHNICAL                              8
EVENTINQUIRY                           5
OEMODMREQUEST                          5
PURCHASE                               5
Name: inquiry_type, dtype: int64


### 값 하나하나 보기

In [None]:
# The 30 most frequent inquiry_type values (positional slice of the sorted counts).
df_train['inquiry_type'].value_counts().iloc[:30]

QUOTATIONORPURCHASECONSULTATION                    42138
SALESINQUIRY                                        9981
PRODUCTINFORMATION                                  1237
USAGEORTECHNICALCONSULTATION                        1190
OTHER                                               1051
TRAININGS                                            434
TECHNICALCONSULTATION                                433
SERVICES                                             415
REQUESTFORPARTNERSHIP                                297
REQUESTFORQUOTATIONORPURCHASE                        230
ETC                                                  221
REQUESTADEMO                                         184
OTHERS                                               129
TECHNICALSUPPORT                                     110
SALES                                                100
REQUESTFORDISTRIBUTORSHIP                             75
REQUESTFORTECHNICALCONSULTING                         37
CUSTOMERSUGGESTIONS            

In [None]:
# Every inquiry_type value ranked after the 30 most frequent ones.
df_train['inquiry_type'].value_counts().iloc[30:]

STANDALONE                                                                                        1
LEDSIGNAGE                                                                                        1
FORSCHOOL                                                                                         1
NOTSPECIFIED                                                                                      1
PRECISODEUMMONITORMÉDICOPARARADIOGRAFIACONVENCIONALETOMOGRTAFIA                                   1
INTÉGRATEURHISTORIQUEDUGEORGEV                                                                    1
VRF                                                                                               1
SOLICITOAPOYOPARAREALIZARCOTIZACIONDELOSDISPOSITIVOSQUEOFRECENENLASOLUCIÓN ONEQUICK               1
PANTALLASINTERACTIVASPARACLINICAS                                                                 1
HOTELTVPRODUCTS                                                                                   1


# expected_timeline(고객이 요청한 처리 일정) 분석

In [None]:
# Summary of the expected_timeline column: non-null count and dtype.
df_train['expected_timeline'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 59299 entries, 0 to 59298
Series name: expected_timeline
Non-Null Count  Dtype 
--------------  ----- 
28436 non-null  object
dtypes: object(1)
memory usage: 463.4+ KB


444개의 unique한 값

In [None]:
# Frequency of each distinct expected_timeline value, most frequent first.
df_train['expected_timeline'].value_counts()

LESSTHAN3MONTHS                                                                                        17326
3MONTHS~6MONTHS                                                                                         5035
MORETHANAYEAR                                                                                           3027
9MONTHS~1YEAR                                                                                           1107
6MONTHS~9MONTHS                                                                                         1102
                                                                                                       ...  
DISCUSSEDWITHCLIENTDETAILSMAILEDTOCLIENTNOBUDGETSFORNOWWONTBUYHENCECLOSINGINTHESYSTEM                      1
OURPARTNERVISNETWORKSISWORKINGONTHISINQUIRYANDCLIENTISCROSSCHECKINGTHEPRICESHENCECLOSINGINTHESYSTEM        1
LOOKINGFORACTIVELEDINARANGEOF5LAKHSNOTAPROSPETIVEBUYER                                                     1
NOTANSWERINGCALL|AS

## 겹치는 값 찾기
- BEINGFOLLOWEDUP
  - BEINGFOLLOWEDUP.
  - FORWARDEDTOBDOBEINGFOLLOWEDUP
  - FUUNDERPROGRESS
  - CALLBACKLATER
  - ALREADYINTOUCHWITHHIMSINCELONGHEHASNOTFINALIZEDYETLEADFORWARDEDTORD
  - CUSTOMERBUDGETIS125+TAXFOR75INCHIDBIHAVEGIVENOURNOREGRETOFFERANDALSOEXPLAINEDWHYLGISBETTERANDPREMIUMHESAIDHEWILLCOMEBACKINADAYORTWO
  - FOLLOWINGUP
  - 이외에도 BEINGFOLLOWEDUP 또는 유사 텍스트가 포함된 값 다수 존재
- CLIENTISNOTHAVINGANYREQUIREMENT
  - THECLIENTISNOTHAVINGANYREQUIREMENTHENCECLOSIGINSYSTEMALTHOUGHTHEDETAILSOFIDBAREMAILEDTOCLIENT
  - THECLIENTISNOTHAVINGANYREQUIREMENTHEWASONLYBROWSINGTHROUGHTHEPRODUTHENCECLOSIGINSYSTEMALTHOUGHTHEDETAILSOFIDBAREMAILEDTOCLIENT
  - NOTINTERESTED
  - HECLIENTISNOTHAVINGANYREQUIREMENTHENCECLOSIGINSYSTEMALTHOUGHTHEDETAILSOFIDBAREMAILEDTOCLIENT
  - NOTINTERESTEDINCOMMERCIALTVS
  - 이외에도 NOTINTERESTED, NOTHAVINGREQUIREMENT 또는 유사 텍스트가 포함된 값 다수 존재
- COULDNTCONNECT
  - NOTANSWERINGCALL
  - DIDN'TRESPOND
  - NORESPONSE
  - TRIEDCOUPLEOFTIMESBUTHEISNOTRESPONDINGWEWILLTRYAGAIN
  - DROPNOWNOTINTRESTED
  - NOTREACHABLE
  - CUSTOMERPHONEISGETSSWITCHEDOFF
  - TRIEDTOREACHCUSTOMERBUTNORESPONSELEADFORWARDEDTOPARTNERFORFURTHERFOLLOWUP
  - CALLNOTCONNECTING
  - PHONESWTICHEDOFF
  - TRIEDTOREACHHIMBUTNORESPONSEIWILLCALLHIMAGAIN18THNOVICALLEDHIMAGAINANDHEWILLSENDTHEREQUIREMENTONMAIL
  - MOBILEISSWITCHEDOFFTRIED3TIMES
  - 이외에도 비슷한 뉘앙스의 값 다수 존재
- DETAILSSHARED
  - DETAILSSEND
  - DETAILSSHAREDWITHRDFORFURTHERACTION

## 상위 카테고리 찾기
- 기간이 올바르게 정해진 값
  - DAYS, MONTH, YEAR가 들어간 값
  - DETAILS가 들어간 값
- Follow up 되고 있는 값
  - FOLLOWEDUP, FOLLOWINGUP, FUUNDERPROGRESS, INTOUCH, CALLBACK
- 모종의 사유로 drop
  - DROP이 포함된 것
- 고객과 연락이 안되는 값
  - SWITCHEDOFF, NORESPON, NOTANSWER, NOTCONNECT, NOTREACH
  - NTRESPON, NTANSWER, NTCONNECT, NTREACH
- 기타

## feature 값 정리
- 45DAYS
- ONEMONTH
- OCTOBER2022
- LESSTHAN3MONTHS
  - LESSTHAN3MONTHSCUSTOMERNOTANSWEREDTOCALLBACK
- 3MONTHS~6MONTHS
  - LESSTHAN6MONTHS와 통합
  - LESSTHAN5MONTHS
  - MORETHEN3MONTHS
- 6MONTHS~9MONTHS
- 9MONTHS~1YEAR
- MORETHANAYEAR
- BEINGFOLLOWEDUP
- NOREQUIREMENT
- FIXED (일정은 이미 정해졌으나 구체적인 기간이 명시되지 않은 값)
  - DETAILSSHARED
  - ALREADYTOUCHWITHCUSTOMERS
- DROPPED
  - DROPSTILLNOPLANFORPURCHASE
  - DROPWRONGNUMBER
- ETC.
  - DUPLICATELEAD
  - QUOTESEND
  - RNR
  - COULDN'TCONNECT
  - BUDGETISSUE
  - NOTANSWERINGCALL
  - NOTANSWERINGCALLLEADSHAREDWITHRD
  - PRICESHARED
  - CONTACTDETAILSPROVIDEDAREWRONGSENTMAILFORCORRECTCONTACTDETAILS
  - REQUIREMENTOFIDBBUDGETISVERYLOWSHAREDDETAILSONWHATSAPP1UNITREQUIREDRDISFOLLOWINGUP
  - ALREADYWORKINGWITHENDUSERONTHISREQUIREMENT
  - CUSTOMEHASNOTRECEIVECALLASON31STMAYSPOKEWITHCUSTOMERHEWILLCOMEFORDEMOINNEXTWEEK14062022

## 전처리 방법
- 기간이 올바르게 정해진 값
  - DAYS, MONTH, YEAR가 들어간 값
  - 월, 일 이름이 들어간 값
  - DETAILS가 들어간 값
- Follow up 되고 있는 값
  - FOLLOWEDUP, FOLLOWINGUP, FUUNDERPROGRESS, INTOUCH, CALLBACK
- 모종의 사유로 drop
  - DROP이 포함된 것
- 고객과 연락이 안되는 값
  - SWITCHEDOFF, NORESPON, NOTANSWER, NOTCONNECT, NOTREACH
  - NTRESPON, NTANSWER, NTCONNECT, NTREACH
- 기한을 요구하지 않은 값
  - NOTHAVINGANYREQUIRE, NOTINTEREST
- 기타

In [None]:
# Fresh copy of the training frame for this experiment.
df_train_copy = df_train.copy()
def filter_value(text):
    """Keep ``text`` if it contains any recognised keyword, otherwise return ``'ETC'``.

    Matching is plain substring containment (``keyword in text``), so short
    keywords such as ``'MAR'`` or ``'MAY'`` also match inside longer words.
    The keywords are upper-case with no spaces, so ``text`` is expected to be
    normalised the same way before it is passed in.

    Args:
        text: An ``expected_timeline`` string (not NaN/None).

    Returns:
        ``text`` unchanged when at least one keyword occurs in it, else ``'ETC'``.
    """
    allowed_words = [
        # Values that state a concrete period or date
        'DAY', 'MONTH', 'YEAR', 'DETAILSSHARED', 'DETAILSSEND',
        'JAN', 'FEB', 'MAR', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER',

        # Lead is being followed up
        'FOLLOWEDUP', 'FOLLOWINGUP', 'FUUNDERPROGRESS', 'INTOUCH', 'CALLBACK',

        # Lead was dropped / closed.
        # The trailing comma matters: without it Python concatenates 'CLOSING'
        # with the next literal into 'CLOSINGSWITCHEDOFF' and neither matches.
        'DROP', 'CLOSING',

        # Customer could not be reached
        'SWITCHEDOFF', 'NORESPON', 'NOTANSWER', 'NOTCONNECT', 'NOTREACH', 'NOTRESPON',
        'NTRESPON', 'NTANSWER', 'NTCON', 'NTREACH',

        # Customer has no requirement / is not interested
        'NOTHAVINGANYREQUIRE', 'NOTINTEREST', 'NOREQ', 'NOINTEREST',
        'NOTREQ',

        # Discussed with the client
        'DISCUSSEDWITHCLI',
    ]
    # For now a matching value is returned unmodified; only non-matching values collapse to ETC.
    if any(word in text for word in allowed_words):
        return text
    return 'ETC'
# Run every non-missing timeline through filter_value (NaN passes through), then tally.
timeline_filtered = df_train_copy['expected_timeline'].apply(
    lambda value: filter_value(value) if pd.notna(value) else value
)
timeline_filtered.value_counts()

LESSTHAN3MONTHS                                                                                    17326
3MONTHS6MONTHS                                                                                      5035
MORETHANAYEAR                                                                                       3027
9MONTHS1YEAR                                                                                        1108
6MONTHS9MONTHS                                                                                      1102
                                                                                                   ...  
NOTANSWERINGCALLWILLTRYTOREACHHIMAGAIN                                                                 1
NOREQIREMENTS                                                                                          1
ASKEDTOCALLON4THMAY                                                                                    1
DISCUSSEDWITHCLIENTTHEYARENOTEVALUATUATINGIDBASOFNOWTHE

In [None]:
# Fresh copy of the training frame for this experiment.
df_train_copy = df_train.copy()
def change_value(text):
    """Map a raw ``expected_timeline`` string to a coarse category label.

    Keywords are matched by substring containment, and the groups are tried in
    a fixed priority order: period/date, follow-up, dropped, not connected,
    no requirement, discussed. The first group with a hit decides the result.

    Args:
        text: An ``expected_timeline`` string (not NaN/None).

    Returns:
        ``text`` itself when it contains a period/date keyword; otherwise the
        label of the first matching group, or ``'ETC'`` when nothing matches.
    """
    # Values that state a concrete period or date are kept verbatim.
    # TODO: these should eventually be replaced by a label such as 'FIXED'.
    period_keywords = (
        'DAY', 'MONTH', 'YEAR',
        'JAN', 'FEB', 'MAR', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER',
    )
    if any(keyword in text for keyword in period_keywords):
        return text

    # (label, keywords) pairs, in the order they must be checked.
    labelled_keywords = (
        # Lead is being followed up
        ('FOLLOWEDUP', ('FOLLOWEDUP', 'FOLLOWINGUP', 'FUUNDERPROGRESS', 'INTOUCH', 'CALLBACK')),
        # Lead was dropped / closed
        ('DROPPED', ('DROP', 'CLOSING')),
        # Customer could not be reached
        ('NOTCONNECTED', (
            'SWITCHEDOFF', 'NORESPON', 'NOTANSWER', 'NOTCONNECT', 'NOTREACH', 'NOTRESPON',
            'NTRESPON', 'NTANSWER', 'NTCON', 'NTREACH',
        )),
        # Customer has no requirement / is not interested
        ('NOREQ', ('NOTHAVINGANYREQUIRE', 'NOTINTEREST', 'NOREQ', 'NOINTEREST', 'NOTREQ')),
        # Discussed with the client
        ('DISCUSSEDWITHCLI', ('DISCUSSEDWITHCLI', 'DISCUSSEDWITHTHE')),
    )
    for label, keywords in labelled_keywords:
        if any(keyword in text for keyword in keywords):
            return label

    return 'ETC'
# Run every non-missing timeline through change_value (NaN passes through), then tally.
timeline_changed = df_train_copy['expected_timeline'].apply(
    lambda value: change_value(value) if pd.notna(value) else value
)
timeline_changed.value_counts()

LESSTHAN3MONTHS                                          17326
3MONTHS6MONTHS                                            5035
MORETHANAYEAR                                             3027
9MONTHS1YEAR                                              1108
6MONTHS9MONTHS                                            1102
                                                         ...  
ASKEDTOCALLON4THMAY                                          1
LESSTHAN3MONTHSOUTDOORLEDREQUIMENT                           1
NEEDSHOTELTVAFTER4MONTHSWILLCALLUS                           1
DONTHAVEANYIMMEDIATEREQUIREMENTHEMAYPURCHASEAFTERJUNE        1
UPTODECEMBER                                                 1
Name: expected_timeline, Length: 76, dtype: int64

### 값 하나하나 보기

In [None]:
# The 50 most frequent expected_timeline values.
df_train['expected_timeline'].value_counts().iloc[:50]

LESSTHAN3MONTHS                                                                                                                   17326
3MONTHS~6MONTHS                                                                                                                    5035
MORETHANAYEAR                                                                                                                      3027
9MONTHS~1YEAR                                                                                                                      1107
6MONTHS~9MONTHS                                                                                                                    1102
LESSTHAN6MONTHS                                                                                                                     108
ETC                                                                                                                                  95
BEINGFOLLOWEDUP                                 

In [None]:
# expected_timeline values ranked 51-100 by frequency.
df_train['expected_timeline'].value_counts().iloc[50:100]

DROPSTILLNOPLANFORPURCHASE                                                                                                                                                                                        1
LESSTHAN3MONTHSCUSTOMERNOTANSWEREDTOCALLBACK                                                                                                                                                                      1
NOTINTERESTEDINCOMMERCIALTVS                                                                                                                                                                                      1
DROPWRONGNUMBER                                                                                                                                                                                                   1
CUSTOMERPHONEISGETSSWITCHEDOFF                                                                                                                          

In [None]:
# expected_timeline values ranked 101-150 by frequency.
df_train['expected_timeline'].value_counts().iloc[100:150]

PURCHASEVUBRAND                                                                                                                                                                                                   1
CUSTOMERPURCHASEBENQFORPRICEDIFFERANCE                                                                                                                                                                            1
SPOKEWITHCUSTOMERHEISLOOKINGFORDOTLEDPRODUCTIN42INCHIHAVESUGGESTEDHIMOUR43INCHSIGNAGEBUTHEWANTFULLYOUTDOORLEDIHAVESHAREDTHEDETAILSOFOUR43INCHDISPLAY                                                              1
CUSTOMERHASNOTANSWERINGCALLCUSTOMERWANT86"INTERACTIVEDISPLAYHEWILLPURCHASEWITHINNEXT6MONTHWEAREFORWARDINGTOLOCALRDTOTAKETHISFURTHER                                                                               1
CLIENTHAVESEENTHEDEMOHENEEDEDFORBIGGERROOMANDTHECAMERAQUALITYANDSPEAKERTRACKINGWERETHEMAINREQUIREDFEATUREASOFNOWHISBUDGETISAROUND2LACSANDNEEDBIGGERSIZES

# response_corporate
- 나라 또는 도시별 있는 회사명이라 따로 전처리하지 않아도 될 듯함

# product_subcategory
## 특징
제품코드, 이름 모두 가지고 있음
- TR3BFSERIES: Interactive digital board

대부분이 모델 제품번호로 구성됨. product_modelname과 결측치가 똑같이 있는 것으로 보아 product_modelname으로 대체 가능할 수도 있을 듯

In [None]:
# Summary of the product_subcategory column: non-null count and dtype.
df_train['product_subcategory'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 59299 entries, 0 to 59298
Series name: product_subcategory
Non-Null Count  Dtype 
--------------  ----- 
9235 non-null   object
dtypes: object(1)
memory usage: 463.4+ KB


In [None]:
# Frequency of each distinct product_subcategory value, most frequent first.
df_train['product_subcategory'].value_counts()

ALLMEDICALDISPLAYS            446
INTERACTIVEDIGITALBOARD       417
TR3BFSERIES                   374
ONEQUICKFLEX                  301
TRANSPARENTOLEDSIGNAGE        288
                             ... 
QHDMONITORS                     1
其他                              1
NOTAVAILABLEASASPLITSYSTEM      1
REQUIREDTHEMULTISPLIT           1
EVR                             1
Name: product_subcategory, Length: 318, dtype: int64

In [None]:
# The 50 most frequent product_subcategory values.
df_train['product_subcategory'].value_counts().iloc[:50]

ALLMEDICALDISPLAYS                        446
INTERACTIVEDIGITALBOARD                   417
TR3BFSERIES                               374
ONEQUICKFLEX                              301
TRANSPARENTOLEDSIGNAGE                    288
DIAGNOSTICMONITORS                        284
TRANSPARENTLEDFILM                        280
ONEQUICKWORKS                             270
49500NITSFHDSLIMBEZELVIDEOWALL            268
TR3DJSERIES                               246
55500NITSFHD044MMEVENBEZELVIDEOWALL       209
55700NITSFHD044MMEVENBEZELVIDEOWALL       180
DIGITALXRAYDETECTORS                      167
UH5FHSERIES                               156
US660HSERIES                              151
4KUHDHOSPITALITYTVWITHPROCENTRICDIRECT    127
VL5FSERIES                                124
OUTDOORDISPLAY                            120
LGMAGNIT                                  114
WINDOWFACINGDISPLAY                       108
CURVABLEOLEDSIGNAGE                       102
CLINICALREVIEWMONITORS            

In [None]:
# product_subcategory values ranked 51-100 by frequency.
df_train['product_subcategory'].value_counts().iloc[50:100]

US662HSERIES                      48
COMPACTSERIES                     48
LV35ASERIES                       47
NEWHIGHHAZEUHDSTANDARDSIGNAGE     46
PREMIUMSERIES                     45
INDOORVERSATILESERIES             42
PROCENTRICSMART                   42
UHDLARGESCREENSIGNAGEDISPLAY      40
55700NITSFHDSLIMBEZELVIDEOWALL    38
TC3DSERIES                        38
OPENFRAME                         37
OLEDPROMONITOR                    37
TOUCHOPENFRAME                    37
TR3BGSERIES                       35
UHDIRTYPETOUCHCREATEBOARD         35
ESSENTIALSERIES                   34
SM3GSERIES                        33
TA3ESERIES                        32
044MMEVENBEZELVIDEOWALL           32
UT640S                            32
ULTRALIGHTSERIES                  30
DIAGNOSTICMONITORS​               30
PROCENTRICSMARTSETTOPBOX          30
ALLMONITORS                       29
SUPERSIGNCONTROLCONTROL           28
SM5KESERIES                       28
UL3GSERIES                        28
U

In [None]:
# product_subcategory values ranked 101-150 by frequency.
df_train['product_subcategory'].value_counts().iloc[100:150]

LT661HSERIES                                   16
LT330HSERIES                                   16
TOUCHOVERLAYKIT                                16
UT672MSERIES                                   16
UHD4KMONITORS                                  15
WEBOSBOX                                       15
09MMEVENBEZELVIDEOWALL                         15
LGLEDCURVESERIES                               15
PROCENTRICDIRECT                               15
UT660HSERIES                                   15
ONEQUICKWORKSFORZOOMROOMS                      14
NANOCELLHOSPITALTV                             14
FLOORSTANDFORONEQUICKFLEX                      14
ALLMEDICALDISPLAY                              13
LT572MSERIES                                   13
TOTALCARESOLUTION                              13
ESSENTIALCOMMERCIALTVWITH4KACTIVEHDR           13
WALLPAPEROLEDSIGNAGE                           13
SERIES                                         12
ESSENTIALCOMMERCIALTVWITHNANOCELLDISPLAY       12


In [None]:
# product_subcategory values ranked 151-200 by frequency.
df_train['product_subcategory'].value_counts().iloc[150:200]

ALLPROJECTORS                         7
OLEDMONITORS                          7
SUPERSIGNSIMPLEEDITOR                 7
ERGONOMICMONITORS                     7
SMARTTOUCHSCREENTV                    7
UL3JSERIES                            7
ULTRAWIDE™MONITORS                    6
US665HSERIES                          6
LGCONNECTEDCARE                       6
WS960HSERIES                          6
LT662MSERIES                          6
SVH7PFSERIES                          6
UT670HSERIES                          6
DXD                                   6
ALLLAPTOPS                            6
LU766ASERIES                          6
UM3DGHSERIES                          5
UT781HSERIES                          5
OLED透明觸控顯示屏                           5
US772MSERIES                          5
HIGHBRIGHTNESSOPENFRAMEDISPLAY        5
SUPERSIGNMEDIAEDITOR                  5
VIDEOWALLOLED                         5
UT665HSERIES                          5
US770HSERIES                          5


In [None]:
# product_subcategory values ranked 201-250 by frequency.
df_train['product_subcategory'].value_counts().iloc[200:250]

OPENFRAMEDISPLAY                        3
UT662MSERIES                            3
透明LED顯示貼                                3
TODOMEDICALDISPLAY                      3
US765HSERIES                            3
UM5JSERIES                              3
LGLEDALLINONELEDPREMIUMSERIES           3
SPLIT                                   2
WINDOWFACING                            2
TVMONITORS                              2
US760HSERIES                            2
WHITEBALANCE                            2
65TR3DJ                                 2
SM5JSERIES                              2
FLEXIBLEOPENFRAMEOLED                   2
LGTHERMALSENSINGTERMINAL                2
AWHP                                    2
SUPERSIGNWB                             2
UT567HSERIES                            2
SH7DD系列                                 2
OLEDPRO                                 2
CURVABLEOLED                            2
PROCENTRICVALUE                         2
ISC2                              