In [2]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# 데이터 불러오기

In [3]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices - confirm that is intended.
warnings.filterwarnings("ignore")

# Location of the review dump, relative to the working directory.
FILEPATH = './modcloth_final_data.json'
# lines=True: the file is parsed as one JSON record per line.
data = pd.read_json(FILEPATH, lines=True)
data.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,


## 컬럼 정보

item_id: unique product id

waist: waist measurement of customer

size: the standardized size of the **product**

quality: rating for the **product**

cup size: cup size measurement of customer

hips: hip measurement of customer

bra size: bra size of customer

category: the category of the **product**

bust: bust measurement of customer

height: height of the customer

length: feedback on the length of the **product**

fit: fit feedback

user_id: a unique id for the customer

shoe size: shoe size of the customer

shoe width: shoe width of the customer

review_text: review of customer

review_summary: review summary

In [4]:
df = data.copy()  # work on a copy so the loaded frame `data` stays untouched
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         82790 non-null  int64  
 1   waist           2882 non-null   float64
 2   size            82790 non-null  int64  
 3   quality         82722 non-null  float64
 4   cup size        76535 non-null  object 
 5   hips            56064 non-null  float64
 6   bra size        76772 non-null  float64
 7   category        82790 non-null  object 
 8   bust            11854 non-null  object 
 9   height          81683 non-null  object 
 10  user_name       82790 non-null  object 
 11  length          82755 non-null  object 
 12  fit             82790 non-null  object 
 13  user_id         82790 non-null  int64  
 14  shoe size       27915 non-null  float64
 15  shoe width      18607 non-null  object 
 16  review_summary  76065 non-null  object 
 17  review_text     76065 non-null 

## 신발 리뷰 제외

In [5]:
# Rows that carry a shoe size are treated as shoe reviews and removed.
# NOTE(review): the recorded output of this cell still shows non-null
# 'shoe width' values after the filter - confirm whether those rows
# should be removed as well.
shoe_df = df.loc[df['shoe size'].notna(), 'shoe size']
df.drop(index=shoe_df.index, inplace=True)  # remove the matching rows
df.reset_index(drop=True, inplace=True)  # renumber the remaining rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54875 entries, 0 to 54874
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         54875 non-null  int64  
 1   waist           1785 non-null   float64
 2   size            54875 non-null  int64  
 3   quality         54836 non-null  float64
 4   cup size        49705 non-null  object 
 5   hips            34202 non-null  float64
 6   bra size        49821 non-null  float64
 7   category        54875 non-null  object 
 8   bust            1488 non-null   object 
 9   height          54133 non-null  object 
 10  user_name       54875 non-null  object 
 11  length          54851 non-null  object 
 12  fit             54875 non-null  object 
 13  user_id         54875 non-null  int64  
 14  shoe size       0 non-null      float64
 15  shoe width      691 non-null    object 
 16  review_summary  50227 non-null  object 
 17  review_text     50227 non-null 

# 중복값 제거

In [6]:
# Remove rows that repeat an earlier row in every column
# (DataFrame.duplicated keeps the first occurrence unmarked by default).
duplicates = df.loc[df.duplicated()]
print(f"중복데이터 개수: {len(duplicates)}개")
df.drop(index=duplicates.index, inplace=True)  # remove them
df.reset_index(drop=True, inplace=True)  # renumber the remaining rows
df.info()

중복데이터 개수: 252개
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54623 entries, 0 to 54622
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         54623 non-null  int64  
 1   waist           1784 non-null   float64
 2   size            54623 non-null  int64  
 3   quality         54584 non-null  float64
 4   cup size        49477 non-null  object 
 5   hips            34040 non-null  float64
 6   bra size        49592 non-null  float64
 7   category        54623 non-null  object 
 8   bust            1474 non-null   object 
 9   height          53881 non-null  object 
 10  user_name       54623 non-null  object 
 11  length          54599 non-null  object 
 12  fit             54623 non-null  object 
 13  user_id         54623 non-null  int64  
 14  shoe size       0 non-null      float64
 15  shoe width      689 non-null    object 
 16  review_summary  49986 non-null  object 
 17  review_text     

# 컬럼 선택

In [7]:
# Rename columns
df.rename(columns={'size': 'item_size', 'cup size': 'cup_size'}, inplace=True)

# Keep only the columns used for training / the service, and only those
# without an excessive share of missing values.
df.drop(
    columns=[
        # too many missing values
        'shoe size', 'shoe width', 'bust', 'waist',
        # judged unnecessary
        'bra size', 'quality',
        # free-text fields
        'user_name', 'review_summary', 'review_text',
    ],
    inplace=True,
)
df.head()

Unnamed: 0,item_id,item_size,cup_size,hips,category,height,length,fit,user_id
0,123373,7,d,38.0,new,5ft 6in,just right,small,991571
1,123373,13,b,30.0,new,5ft 2in,just right,small,587883
2,123373,21,dd/e,,new,,just right,fit,875643
3,123373,18,b,,new,5ft 2in,slightly long,small,944840
4,123373,11,c,41.0,new,5ft 4in,just right,small,162012


# 결측치 제거

In [8]:
# The data was judged large enough, so every row with a missing value is
# dropped and the index is renumbered afterwards.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32743 entries, 0 to 32742
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   item_id    32743 non-null  int64  
 1   item_size  32743 non-null  int64  
 2   cup_size   32743 non-null  object 
 3   hips       32743 non-null  float64
 4   category   32743 non-null  object 
 5   height     32743 non-null  object 
 6   length     32743 non-null  object 
 7   fit        32743 non-null  object 
 8   user_id    32743 non-null  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 2.2+ MB


# 이상치 처리 및 데이터 타입 변경

hips => int

height => float (cm로 단위 변경, 인치 환산값에 0.5 단위가 생기므로 float64로 저장됨)

cup_size => int => category (카디널리티 12) — bra size 컬럼은 컬럼 선택 단계에서 제외했으므로 변환하지 않음

length => int => category (카디널리티 5)

fit => int (카디널리티 3, 현재 코드에서는 category로 변환하지 않고 int64로 유지) ## target ##

user_id, item_id => str (고유값)

category => wedding 제거

## hips 컬럼

In [9]:
# Cast hips from float to int (astype(int) discards any fractional part).
df['hips'] = df['hips'].astype(int)
df['hips'].head()

0    38
1    30
2    41
3    49
4    60
Name: hips, dtype: int64

## item_size 컬럼

0-30 => us_size, 31-59 => eu_size 로 보고 한국 사이즈 코드(0-5)로 변경 (아래 size_corr 구현 기준)

In [10]:
df.item_size.unique()  # raw size values; size_corr reads 0-30 as US sizes and 31-59 as EU sizes

array([ 7, 13, 11, 24, 33,  3, 27,  9, 18, 30, 21,  5, 15, 12, 26,  8, 20,
        4, 32, 38,  1,  0,  6, 10, 17, 14,  2])

In [11]:
# KR = {0:33, 1:44, 2:55, 3:66, 4:77, 5:88, 6:big_size}
def size_corr(row):
    """Collapse a raw item size into a size code between 0 and 5.

    Values 0-30 are read on the US scale and values 31-59 on the EU scale;
    both scales are folded into the codes listed in the KR legend above.
    A value that does not equal one of the integers 0-59 is returned
    unchanged.

    NOTE(review): the KR legend lists a code 6 (big_size) that this function
    never returns.
    NOTE(review): unmapped values are passed through as-is, so raw sizes and
    codes could end up mixed in one column - confirm this is acceptable.

    Args:
        row: a single raw size value.

    Returns:
        The size code (0-5), or ``row`` itself when no band matches.
    """
    # (band of raw sizes, code) pairs; US bands first, then EU bands.
    bands = (
        (range(0, 1), 0),
        (range(1, 5), 1),
        (range(5, 9), 2),
        (range(9, 13), 3),
        (range(13, 17), 4),
        (range(17, 31), 5),
        (range(31, 36), 0),
        (range(36, 38), 1),
        (range(38, 40), 2),
        (range(40, 42), 3),
        (range(42, 44), 4),
        (range(44, 60), 5),
    )
    for raw_band, code in bands:
        if row in raw_band:
            return code
    return row

# Replace every raw size with its size code.
# Re-running this cell would re-map the already-coded values.
df['item_size'] = df['item_size'].apply(size_corr)
df.head()

Unnamed: 0,item_id,item_size,cup_size,hips,category,height,length,fit,user_id
0,123373,2,d,38,new,5ft 6in,just right,small,991571
1,123373,4,b,30,new,5ft 2in,just right,small,587883
2,123373,3,c,41,new,5ft 4in,just right,small,162012
3,123373,5,d,49,new,5ft 6in,just right,small,205796
4,123373,0,ddd/f,60,new,5ft 4in,just right,small,422613


## height 컬럼

1ft ≈ 30cm (정확히는 30.48cm, 계산 편의상 근사값 사용)

1in ≈ 2.5cm (정확히는 2.54cm, 계산 편의상 근사값 사용)

In [12]:
# List the distinct raw height strings before parsing them.
df.height.unique()

array(['5ft 6in', '5ft 2in', '5ft 4in', '5ft 3in', '5ft 5in', '5ft',
       '5ft 7in', '5ft 9in', '5ft 8in', '5ft 1in', '5ft 11in', '4ft 11in',
       '4ft 10in', '5ft 10in', '6ft', '3ft 4in', '4ft 9in', '6ft 1in',
       '6ft 2in', '3ft', '4ft 7in', '7ft 11in', '3ft 11in', '4ft 8in',
       '6ft 5in', '3ft 3in', '6ft 3in', '6ft 4in', '6ft 8in', '6ft 6in',
       '7ft 7in', '4ft 2in'], dtype=object)

In [13]:
# Strip the unit suffixes and convert a height string to centimetres
def height_processor(row):
    """Convert a height string such as '5ft 6in' or '5ft' to centimetres.

    The conversion uses the rounded factors 1 ft = 30 cm and 1 in = 2.5 cm.

    Args:
        row: height text made of a feet token, optionally followed by a
            whitespace-separated inches token.

    Returns:
        An int when only feet are given, a float when inches are present.

    Raises:
        ValueError: if the text does not consist of one or two tokens, or a
            token is not an integer once its unit suffix is removed.
    """
    tokens = row.split()
    # Branch on the token count instead of catching every exception, so that
    # unrelated errors are no longer swallowed by a bare ``except``.
    if len(tokens) == 2:
        ft, inch = tokens
        return (int(ft.replace('ft', '')) * 30) + (int(inch.replace('in', '')) * 2.5)
    if len(tokens) == 1:
        return int(tokens[0].replace('ft', '')) * 30
    raise ValueError(f"unrecognised height value: {row!r}")

# Convert every height string to centimetres.
df['height'] = df['height'].apply(height_processor)
# Fixed seed so the preview shows the same rows on every run.
df.sample(10, random_state=42)

Unnamed: 0,item_id,item_size,cup_size,hips,category,height,length,fit,user_id
10324,391519,3,d,34,dresses,177.5,slightly short,fit,746249
11664,400541,2,c,39,dresses,155.0,just right,fit,9817
16909,454030,1,b,35,tops,155.0,slightly long,large,689227
15660,418434,2,c,37,new,157.5,just right,fit,505160
1828,146231,5,c,35,new,152.5,slightly long,fit,413004
11506,400486,3,b,39,dresses,160.0,just right,fit,421127
9963,391519,3,dddd/g,40,dresses,165.0,just right,fit,359861
9800,384957,5,dd/e,49,sale,170.0,just right,small,291487
27653,715662,5,dd/e,44,bottoms,157.5,just right,fit,860161
28434,726492,1,d,34,bottoms,150.0,just right,fit,229376


## length 컬럼

In [14]:
# Inspect the distinct values of the length column
df.length.unique()

array(['just right', 'very short', 'slightly long', 'slightly short',
       'very long'], dtype=object)

In [15]:
# Function definition
def length_processor(row):
    """Return the 0-4 rank of a length-feedback label.

    The labels are ranked from 'very short' (0) to 'very long' (4).

    Raises:
        ValueError: if ``row`` is not one of the five labels.
    """
    ranked_labels = ('very short', 'slightly short', 'just right', 'slightly long', 'very long')
    return ranked_labels.index(row)
    
# Encode the labels as 0-4, then store the column with the category dtype.
df['length'] = df['length'].apply(length_processor).astype('category')
df.head()
    

Unnamed: 0,item_id,item_size,cup_size,hips,category,height,length,fit,user_id
0,123373,2,d,38,new,165.0,2,small,991571
1,123373,4,b,30,new,155.0,2,small,587883
2,123373,3,c,41,new,160.0,2,small,162012
3,123373,5,d,49,new,165.0,2,small,205796
4,123373,0,ddd/f,60,new,160.0,2,small,422613


## cup_size 컬럼

In [16]:
# List the distinct raw cup-size labels before encoding them.
df.cup_size.unique()

array(['d', 'b', 'c', 'ddd/f', 'dd/e', 'dddd/g', 'i', 'a', 'h', 'aa', 'k',
       'j'], dtype=object)

In [17]:
def cup_process(row):
    """Return the 0-11 rank of a cup-size label.

    The labels are ranked 'aa', 'a', 'b', ... 'k'. The combined labels
    'dd/e', 'ddd/f' and 'dddd/g' are ranked as 'e', 'f' and 'g'.

    Raises:
        ValueError: if the label is not a known cup size.
    """
    ranked_cups = ('aa', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k')
    # Combined labels reduce to the letter after the slash.
    combined = {'dd/e': 'e', 'ddd/f': 'f', 'dddd/g': 'g'}
    return ranked_cups.index(combined.get(row, row))

# Encode the cup labels as ranks, then store the column with the category dtype.
df['cup_size'] = df['cup_size'].apply(cup_process).astype('category')
df.head()

Unnamed: 0,item_id,item_size,cup_size,hips,category,height,length,fit,user_id
0,123373,2,4,38,new,165.0,2,small,991571
1,123373,4,2,30,new,155.0,2,small,587883
2,123373,3,3,41,new,160.0,2,small,162012
3,123373,5,4,49,new,165.0,2,small,205796
4,123373,0,6,60,new,160.0,2,small,422613


## fit 컬럼

In [18]:
# List the distinct fit labels before encoding them.
df.fit.unique()

array(['small', 'fit', 'large'], dtype=object)

In [19]:
def labeling(row):
    """Encode a fit label as an integer: 'small' -> 0, 'fit' -> 1, 'large' -> 2.

    Raises:
        ValueError: if ``row`` is not one of the three labels.
    """
    fit_labels = ('small', 'fit', 'large')
    return fit_labels.index(row)

# Encode the fit labels as integers.
df['fit'] = df['fit'].apply(labeling)
df.head()

Unnamed: 0,item_id,item_size,cup_size,hips,category,height,length,fit,user_id
0,123373,2,4,38,new,165.0,2,0,991571
1,123373,4,2,30,new,155.0,2,0,587883
2,123373,3,3,41,new,160.0,2,0,162012
3,123373,5,4,49,new,165.0,2,0,205796
4,123373,0,6,60,new,160.0,2,0,422613


## user_id, item_id 컬럼

In [20]:
# Store both id columns as strings.
df = df.astype({'user_id': str, 'item_id': str})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32743 entries, 0 to 32742
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   item_id    32743 non-null  object  
 1   item_size  32743 non-null  int64   
 2   cup_size   32743 non-null  category
 3   hips       32743 non-null  int64   
 4   category   32743 non-null  object  
 5   height     32743 non-null  float64 
 6   length     32743 non-null  category
 7   fit        32743 non-null  int64   
 8   user_id    32743 non-null  object  
dtypes: category(2), float64(1), int64(3), object(3)
memory usage: 1.8+ MB


## category 컬럼 (wedding 값 제거)

In [21]:
# Remove every row whose category is 'wedding'.
wedding = df[df['category'] == 'wedding']
df.drop(index=wedding.index, inplace=True)  # remove the rows
df.reset_index(drop=True, inplace=True)  # renumber the remaining rows

# csv파일로 저장

In [22]:
# The export is currently commented out; uncomment the next line to write
# the processed frame to 'processed_df.csv' without the index column.
# df.to_csv('processed_df.csv', index=False)