In [1]:
import pandas as pd

In [2]:
csData = pd.read_csv("../exampleCode/dataset/customerdata.csv")

In [3]:
# Keep only the customers whose EMI value is greater than 2.
emiAboveTwo = csData["EMI"] > 2
targetData = csData.loc[emiAboveTwo]
targetData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE
0,A13566,4273.9,3,6,1.679181,Big-Screen-lover
27,F17131,3179.708359,3,6,1.1,Early-bird
33,G16437,3787.328898,3,6,1.0,Early-bird
36,G18402,3402.199381,3,6,1.0,Early-bird
40,H16326,3988.935304,3,6,0.9,Early-bird
45,I12899,3018.895637,3,6,0.9,Early-bird
51,J10809,3676.135627,3,6,0.8,Early-bird
52,J15083,2811.265932,3,6,0.9,Early-bird
53,J16445,3041.848186,3,6,0.8,Early-bird
60,L13892,3263.515145,3,6,1.1,Early-bird


In [4]:
# Renumber the filtered rows from 0 and discard the old index labels.
targetData = targetData.reset_index(drop=True)

## 1. numpy의 where 사용.

## 2. 딕셔너리를 직접 만들고, 그것을 .map 함수 사용하기.

## 3. sklearn의 LabelEncoder 사용하기.

## 4. 직접 함수를 만들고, 그것을 .apply 함수 사용하기. (데이터 타입을 그대로 사용 가능. 좋다!)

## 5. .loc 사용하기. (속도 느림)

# 1. numpy의 where 사용.

In [5]:
import numpy as np

### P.51 이상 데이터 정제하기

In [6]:
# np.where works like Excel's IF: (condition, value if true, value if false).
ageAtMostOne = csData.PRODUCTAGE <= 1
csData["PRODUCTAGE_NEW"] = np.where(ageAtMostOne, 1, csData.PRODUCTAGE)
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,1.679181
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,2.682023
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3.208202
3,A16605,3713.211107,2,6,0.900000,Early-bird,1.000000
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,2.453656
...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,2.463670
146,Z13534,3662.437527,3,6,0.900000,Early-bird,1.000000
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,2.371301
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,1.704942


### 검증 로직!!

In [19]:
# Always verify that a transformation did what it was supposed to do.
# The first mask (PRODUCTAGE <= 1) restricts the rows to the ones that should
# have been changed; the second mask (PRODUCTAGE_NEW != 1) then looks, inside
# that range, for rows that were NOT set to 1.
# Every row with PRODUCTAGE <= 1 is expected to have PRODUCTAGE_NEW == 1, so
# checking with "== 1" should return all of those rows, while checking with
# "!= 1" (this cell) should return no rows at all.
shouldBeOne = csData.PRODUCTAGE <= 1
notSetToOne = csData.PRODUCTAGE_NEW != 1
csData[shouldBeOne & notSetToOne]

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW


In [7]:
# To see how the rows were changed (which is why the previous check finds nothing):
csData.loc[(csData.PRODUCTAGE <= 1) & (csData.PRODUCTAGE_NEW == 1)]

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
3,A16605,3713.211107,2,6,0.9,Early-bird,1.0
6,B18816,3213.911446,2,6,0.9,Early-bird,1.0
13,D11795,2991.014703,2,6,0.8,Early-bird,1.0
23,E19404,3764.940414,2,6,0.9,Early-bird,1.0
24,F10222,3191.065822,2,6,0.9,Early-bird,1.0
30,G11453,2744.815881,2,6,0.9,Early-bird,1.0
33,G16437,3787.328898,3,6,1.0,Early-bird,1.0
36,G18402,3402.199381,3,6,1.0,Early-bird,1.0
37,G19717,2873.102202,2,6,0.8,Early-bird,1.0
38,H10188,4020.245162,2,6,0.9,Early-bird,1.0


### Quiz
csData["PRODUCTAGE_NEW"] = \ <br>
    np.where(csData.PRODUCTAGE <= 1, 1, csData.PRODUCTAGE)<br><br>

위 코드를 실행한 뒤, PRODUCTAGE가 1 초과인 경우에 대한 검증 로직을 작성하세요.

### Key Point : 1 초과인 경우 값이 변경되지 않았다는 것을 검증해야한다.

In [20]:
# (csData.PRODUCTAGE > 1) 인 경우는 그대로 쓰기로 했으니까
# 1보다 큰 경우는 csData.PRODUCTAGE_NEW에 그대로 들어간다.
# 따라서, 앞의 조건 (csData.PRODUCTAGE > 1) 이 범위를 제한하고,
# 그 범위 안에서 'csData.PRODUCTAGE' 가 'csData.PRODUCTAGE_NEW'와 일치하면 데이터가 전부 나오고
# 불일치하면 데이터가 전혀 나오지 않아야 한다.

In [8]:
# Rows with PRODUCTAGE > 1 must be carried over unchanged, so no row may differ.
keptAsIs = csData.PRODUCTAGE > 1
csData[keptAsIs & (csData.PRODUCTAGE_NEW != csData.PRODUCTAGE)]

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW


만약 반대 케이스(<= 1의 반대인 > 1인 경우)를 고려하지 않고 아래처럼 로직을 짜면, 위의 검증 쿼리에서 데이터가 조회된다(= 검증 실패).<br>
csData["PRODUCTAGE_NEW"] = np.where(csData.PRODUCTAGE <= 1, 1, 0)<br>
csData

In [9]:
# Every row in this range matches, so all of them are returned.
csData.loc[(csData.PRODUCTAGE > 1) & (csData.PRODUCTAGE_NEW == csData.PRODUCTAGE)]

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,1.679181
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,2.682023
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3.208202
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,2.453656
5,B16849,3755.263391,2,4,2.786665,Sleeping-dog,2.786665
...,...,...,...,...,...,...,...
144,Z12772,3029.600000,2,4,2.093035,Big-Screen-lover,2.093035
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,2.463670
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,2.371301
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,1.704942


In [13]:
# Same idea as nesting IF formulas in Excel:
#   PRODUCTAGE <= 1      -> 1
#   1 < PRODUCTAGE <= 3  -> 3
#   PRODUCTAGE > 3       -> keep the original value
cappedAtThree = np.where(csData.PRODUCTAGE <= 3, 3, csData.PRODUCTAGE)
csData["PRODUCTAGE_NEW"] = np.where(csData.PRODUCTAGE <= 1, 1, cappedAtThree)
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3.000000
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3.000000
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3.208202
3,A16605,3713.211107,2,6,0.900000,Early-bird,1.000000
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3.000000
...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3.000000
146,Z13534,3662.437527,3,6,0.900000,Early-bird,1.000000
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3.000000
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3.000000


# 2. 딕셔너리를 직접 만들고, 그것을 .map 함수 사용하기.

In [14]:
csData.head()

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
0,A13566,4273.9,3,6,1.679181,Big-Screen-lover,3.0
1,A14219,3642.44195,2,4,2.682023,Sleeping-dog,3.0
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3.208202
3,A16605,3713.211107,2,6,0.9,Early-bird,1.0
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3.0


In [15]:
# CUSTTYPE 조회
csData.CUSTTYPE

0      Big-Screen-lover
1          Sleeping-dog
2          Sleeping-dog
3            Early-bird
4          Sleeping-dog
             ...       
145    Big-Screen-lover
146          Early-bird
147    Big-Screen-lover
148    Big-Screen-lover
149          Early-bird
Name: CUSTTYPE, Length: 150, dtype: object

In [16]:
# CUSTTYPE 조회 (중복 제거)
csData.CUSTTYPE.drop_duplicates()

0    Big-Screen-lover
1        Sleeping-dog
3          Early-bird
Name: CUSTTYPE, dtype: object

In [21]:
# Hand-written lookup table: customer type label -> integer code.
typeMap = dict(zip(['Big-Screen-lover', 'Sleeping-dog', 'Early-bird'], range(3)))

In [19]:
typeMap

{'Big-Screen-lover': 0, 'Sleeping-dog': 1, 'Early-bird': 2}

In [27]:
# Series.map with a dict replaces each value by the entry stored under that key.
customerTypeCodes = csData.CUSTTYPE.map(typeMap)
csData['CUSTTYPE_NEW'] = customerTypeCodes
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW,CUSTTYPE_NEW
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3.000000,0
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3.000000,1
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3.208202,1
3,A16605,3713.211107,2,6,0.900000,Early-bird,1.000000,2
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3.000000,1
...,...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3.000000,0
146,Z13534,3662.437527,3,6,0.900000,Early-bird,1.000000,2
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3.000000,0
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3.000000,0


In [22]:
csData['CUSTTYPE_NEW'] = csData.CUSTTYPE.map(typeMap)

In [24]:
# Pairwise correlation between the numeric columns (covered in the seaborn class).
# The numeric columns are selected explicitly because the frame also holds
# string columns (CUSTID, CUSTTYPE), which recent pandas versions no longer
# drop silently inside corr().
csData.select_dtypes(include="number").corr()

Unnamed: 0,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,PRODUCTAGE_NEW,CUSTTYPE_NEW
AVGPRICE,1.0,0.237539,0.06916,0.282558,0.30565,-0.305342
EMI,0.237539,1.0,0.487092,-0.348045,-0.261191,0.239052
DEVICECOUNT,0.06916,0.487092,1.0,-0.700726,-0.638511,0.370031
PRODUCTAGE,0.282558,-0.348045,-0.700726,1.0,0.809729,-0.632862
PRODUCTAGE_NEW,0.30565,-0.261191,-0.638511,0.809729,1.0,-0.743607
CUSTTYPE_NEW,-0.305342,0.239052,0.370031,-0.632862,-0.743607,1.0


### 3개니까 딕셔너리를 그냥 바로 썼는데 이게 100개라면...?

# 3. sklearn의 LabelEncoder 사용하기.

In [39]:
from sklearn.preprocessing import LabelEncoder

In [40]:
# Same idea as writing `columnList = []`: create (initialize) an object of the
# chosen type before using it.
cstypeEn = LabelEncoder()

In [41]:
# sklearn's LabelEncoder does automatically what the hand-written dictionary
# {'Big-Screen-lover': 0, 'Sleeping-dog': 1, 'Early-bird': 2} did above:
# identical labels are found and given the same integer code.
cstypeEn.fit_transform(csData.CUSTTYPE)

array([0, 2, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 0,
       0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 0,
       1, 1, 2, 2, 1, 0, 0, 1, 1, 1, 0, 1, 2, 2, 0, 0, 1, 1, 1, 0, 1, 2,
       0, 1, 1, 0, 2, 1, 2, 2, 1, 2, 0, 1, 1, 1, 1, 2, 1, 2, 1, 0, 2, 1,
       0, 2, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 0, 2, 2, 0, 1, 0, 0, 2, 0,
       2, 2, 2, 0, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 0, 1, 1,
       1, 1, 0, 1, 2, 0, 2, 0, 2, 0, 1, 1, 0, 0, 1, 0, 0, 1])

In [42]:
# CUSTTYPE_NEW is the manual mapping from before; CUSTTYPE_LE is the mapping chosen by LabelEncoder.
# The codes appear to follow the alphabetical order of the labels (the output
# shows Big-Screen-lover=0, Early-bird=1, Sleeping-dog=2). The order cannot be
# chosen here; choosing it would mean writing the dictionary by hand again,
# which defeats the purpose.
csData["CUSTTYPE_LE"] = cstypeEn.fit_transform(csData.CUSTTYPE)
csData.head()

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,CUSTTYPE_LE
0,A13566,4273.9,3,6,1.679181,Big-Screen-lover,0
1,A14219,3642.44195,2,4,2.682023,Sleeping-dog,2
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,2
3,A16605,3713.211107,2,6,0.9,Early-bird,1
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,2


In [43]:
# Map the integer codes back to the original labels.
decodedLabels = cstypeEn.inverse_transform(csData["CUSTTYPE_LE"])
csData["CUSTTYPE_DE"] = decodedLabels
csData.head()

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,CUSTTYPE_LE,CUSTTYPE_DE
0,A13566,4273.9,3,6,1.679181,Big-Screen-lover,0,Big-Screen-lover
1,A14219,3642.44195,2,4,2.682023,Sleeping-dog,2,Sleeping-dog
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,2,Sleeping-dog
3,A16605,3713.211107,2,6,0.9,Early-bird,1,Early-bird
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,2,Sleeping-dog


In [44]:
# Encoding with LabelEncoder and decoding again must give back the original
# CUSTTYPE, so no row may differ: an empty result is the expected outcome.
roundTripMismatch = csData.CUSTTYPE != csData.CUSTTYPE_DE
csData.loc[roundTripMismatch]

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,CUSTTYPE_LE,CUSTTYPE_DE


In [45]:
targetColumns = ['CUSTTYPE','CUSTTYPE_LE']

In [46]:
csData.loc[:,targetColumns].drop_duplicates()

Unnamed: 0,CUSTTYPE,CUSTTYPE_LE
0,Big-Screen-lover,0
1,Sleeping-dog,2
3,Early-bird,1


In [47]:
# Another option: pd.get_dummies, which yields one 0/1 indicator column per category.
# csData is re-read from the CSV here, so it goes back to the raw columns.
csData = pd.read_csv('../exampleCode/dataset/customerdata.csv')
pd.get_dummies(csData.CUSTTYPE)

Unnamed: 0,Big-Screen-lover,Early-bird,Sleeping-dog
0,1,0,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,0,1
...,...,...,...
145,1,0,0
146,0,1,0
147,1,0,0
148,1,0,0


# 4. 직접 함수를 만들고, 그것을 .apply 함수 사용하기.

In [70]:
def normalizeFunc(inValue):
    """Map a value below 3 to the smallest of 1, 2, 3 that is strictly greater than it.

    Parameters
    ----------
    inValue : number
        Value to normalize, e.g. 1.2.

    Returns
    -------
    number
        1 if inValue < 1, 2 if inValue < 2, 3 if inValue < 3; otherwise
        inValue itself, unchanged.
    """
    # Upper bounds are exclusive and are checked from the smallest one up.
    for upperBound in (1, 2, 3):
        if inValue < upperBound:
            return upperBound
    return inValue

In [72]:
# Run normalizeFunc on every EMI value and store the result in a new column.
emiNormalized = csData.EMI.apply(normalizeFunc)
csData['EMI_NEW'] = emiNormalized
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3,2.000000
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3,3.000000
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3,3.208202
3,A16605,3713.211107,2,6,0.900000,Early-bird,3,1.000000
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3,3.000000
...,...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3,3.000000
146,Z13534,3662.437527,3,6,0.900000,Early-bird,3,1.000000
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3,3.000000
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3,2.000000


# 5. .loc 사용하기.

In [73]:
csData.loc[csData.PRODUCTAGE < 1].head()

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW
3,A16605,3713.211107,2,6,0.9,Early-bird,3,1.0
6,B18816,3213.911446,2,6,0.9,Early-bird,3,1.0
13,D11795,2991.014703,2,6,0.8,Early-bird,3,1.0
23,E19404,3764.940414,2,6,0.9,Early-bird,3,1.0
24,F10222,3191.065822,2,6,0.9,Early-bird,3,1.0


In [74]:
# The same rule as the normalizeFunc function above, written as one .loc
# assignment per value range.
age = csData.PRODUCTAGE
csData.loc[age < 1, "PRODUCTAGE_NEW"] = 1
csData.loc[(age >= 1) & (age < 2), "PRODUCTAGE_NEW"] = 2
csData.loc[(age >= 2) & (age < 3), "PRODUCTAGE_NEW"] = 3
csData.loc[age >= 3, "PRODUCTAGE_NEW"] = age
csData.head()

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW
0,A13566,4273.9,3,6,1.679181,Big-Screen-lover,3,2.0
1,A14219,3642.44195,2,4,2.682023,Sleeping-dog,3,3.0
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3,3.208202
3,A16605,3713.211107,2,6,0.9,Early-bird,3,1.0
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3,3.0


In [75]:
# Build a composite key of the form "<CUSTID>_<CUSTTYPE>".
custIdWithSeparator = csData['CUSTID'] + '_'
csData['KEYVAL'] = custIdWithSeparator + csData['CUSTTYPE']
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW,KEYVAL
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3,2.000000,A13566_Big-Screen-lover
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3,3.000000,A14219_Sleeping-dog
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3,3.208202,A15312_Sleeping-dog
3,A16605,3713.211107,2,6,0.900000,Early-bird,3,1.000000,A16605_Early-bird
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3,3.000000,B10634_Sleeping-dog
...,...,...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3,3.000000,Z13253_Big-Screen-lover
146,Z13534,3662.437527,3,6,0.900000,Early-bird,3,1.000000,Z13534_Early-bird
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3,3.000000,Z16428_Big-Screen-lover
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3,2.000000,Z16735_Big-Screen-lover


In [97]:
# Same result as the next cell, but when the column dtypes have not been
# unified beforehand, astype(str) is needed as well.
# .str[1:] is required because CUSTID is a Series, not a single Python string:
# the .str accessor applies the slice to every element, whereas slicing the
# Series directly with [1:] selects rows instead of characters.
csData['KEYVAL'] = csData['CUSTID'].astype(str).str[1:] + '_' + csData['CUSTTYPE']
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW,KEYVAL
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3,2.000000,13566_Big-Screen-lover
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3,3.000000,14219_Sleeping-dog
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3,3.208202,15312_Sleeping-dog
3,A16605,3713.211107,2,6,0.900000,Early-bird,3,1.000000,16605_Early-bird
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3,3.000000,10634_Sleeping-dog
...,...,...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3,3.000000,13253_Big-Screen-lover
146,Z13534,3662.437527,3,6,0.900000,Early-bird,3,1.000000,13534_Early-bird
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3,3.000000,16428_Big-Screen-lover
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3,2.000000,16735_Big-Screen-lover


In [95]:
# If the dtypes were unified at the start, it is better to leave astype out.
csData['KEYVAL'] = csData['CUSTID'].str[1:] + '_' + csData['CUSTTYPE']
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW,KEYVAL
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3,2.000000,13566_Big-Screen-lover
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3,3.000000,14219_Sleeping-dog
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3,3.208202,15312_Sleeping-dog
3,A16605,3713.211107,2,6,0.900000,Early-bird,3,1.000000,16605_Early-bird
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3,3.000000,10634_Sleeping-dog
...,...,...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3,3.000000,13253_Big-Screen-lover
146,Z13534,3662.437527,3,6,0.900000,Early-bird,3,1.000000,13534_Early-bird
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3,3.000000,16428_Big-Screen-lover
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3,2.000000,16735_Big-Screen-lover


In [99]:
# Without .str (wrong example): [1:] slices the Series itself, i.e. it drops
# row 0 instead of the first character. The CUSTID prefix therefore stays in
# place, and row 0 ends up with no KEYVAL (see the output).
csData['KEYVAL'] = csData['CUSTID'][1:] + '_' + csData['CUSTTYPE']
csData

Unnamed: 0,CUSTID,AVGPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,EMI_NEW,PRODUCTAGE_NEW,KEYVAL
0,A13566,4273.900000,3,6,1.679181,Big-Screen-lover,3,2.000000,
1,A14219,3642.441950,2,4,2.682023,Sleeping-dog,3,3.000000,A14219_Sleeping-dog
2,A15312,3653.884565,2,5,3.208202,Sleeping-dog,3,3.208202,A15312_Sleeping-dog
3,A16605,3713.211107,2,6,0.900000,Early-bird,3,1.000000,A16605_Early-bird
4,B10634,3391.074215,2,4,2.453656,Sleeping-dog,3,3.000000,B10634_Sleeping-dog
...,...,...,...,...,...,...,...,...,...
145,Z13253,3678.800000,2,5,2.463670,Big-Screen-lover,3,3.000000,Z13253_Big-Screen-lover
146,Z13534,3662.437527,3,6,0.900000,Early-bird,3,1.000000,Z13534_Early-bird
147,Z16428,3516.500000,2,5,2.371301,Big-Screen-lover,3,3.000000,Z16428_Big-Screen-lover
148,Z16735,3300.100000,2,5,1.704942,Big-Screen-lover,3,2.000000,Z16735_Big-Screen-lover


In [83]:
# A plain Python number supports chained comparisons such as 1 < a < 6.
a = 3
if 1 < a < 6:
    print('True')
else:
    print('False')

True


In [109]:
type(csData.PRODUCTAGE)

pandas.core.series.Series

In [132]:
# A chained comparison does not work on a Series: Python evaluates
# `1 <= s < 2` as `(1 <= s) and (s < 2)`, and `and` needs one True/False for
# the whole Series, which pandas refuses to guess (ValueError).
# The error is caught and printed so that the notebook keeps running; the
# working form is (s >= 1) & (s < 2), as in the earlier .loc cell.
try:
    csData.loc[csData.PRODUCTAGE < 1, "PRODUCTAGE_NEW"] = 1
    csData.loc[1 <= csData.PRODUCTAGE < 2, "PRODUCTAGE_NEW"] = 2
    csData.loc[2 <= csData.PRODUCTAGE < 3, "PRODUCTAGE_NEW"] = 3
    csData.loc[3 <= csData.PRODUCTAGE, "PRODUCTAGE_NEW"] = csData.PRODUCTAGE
except ValueError as chainedComparisonError:
    print("ValueError:", chainedComparisonError)
csData.head()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

# P.61 프로젝트 실습
### 1. 불량 데이터 처리
### kopo_channel_seasonality_new.csv 자료를 담은 selloutData 변수에서 QTY 컬럼이 음수(반품)인 경우는 0으로, 양수인 경우는 기존 QTY값을 유지하는 로직을 적용하여 QTY_NEW 컬럼을 추가하세요.

In [136]:
# 1. Load the data and take a first look at it.
import pandas as pd

url = '../exampleCode/dataset/kopo_channel_seasonality_new.csv'
selloutData = pd.read_csv(url)

selloutData.head()

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY
0,A60,PRODUCT4,201402,71.0
1,A60,PRODUCT59,201402,22275.0
2,A60,PRODUCT34,201402,4463.0
3,A60,PRODUCT47,201402,0.0
4,A60,PRODUCT56,201402,23.0


In [137]:
selloutData.dtypes

REGIONID     object
PRODUCT      object
YEARWEEK      int64
QTY         float64
dtype: object

In [138]:
# 2. Put the rule into a function so that it can be passed to apply.
def qtyNormalize(inValue):
    """Replace a negative quantity (a return) with 0 and keep any other value.

    Parameters
    ----------
    inValue : number
        A single QTY value, e.g. -3.

    Returns
    -------
    number
        0 when inValue is negative, otherwise inValue unchanged.
    """
    return 0 if inValue < 0 else inValue

In [139]:
# 3. Apply the function defined above to every QTY value.
selloutData['QTY_NEW'] = selloutData['QTY'].apply(qtyNormalize)
selloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW
0,A60,PRODUCT4,201402,71.0,71.0
1,A60,PRODUCT59,201402,22275.0,22275.0
2,A60,PRODUCT34,201402,4463.0,4463.0
3,A60,PRODUCT47,201402,0.0,0.0
4,A60,PRODUCT56,201402,23.0,23.0
...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,824.0
124654,A10,PRODUCT56,201630,275.0,275.0
124655,A10,PRODUCT61,201630,0.0,0.0
124656,A10,PRODUCT12,201630,15021.0,15021.0


In [140]:
# 4. Verification: no negative QTY may be left with a non-zero QTY_NEW (expect no rows).
wasNegative = selloutData.QTY < 0
selloutData[wasNegative & (selloutData.QTY_NEW != 0)]

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW


### 2. 데이터 통합
### selloutData 자료에서 YEAR, WEEK 컬럼을 생성하고 WEEK 가 52 이하인 데이터만 조회한 후 refinedSelloutData 변수에 담으세요.

In [93]:
# 1. Load the data and take a look at the last rows.
import pandas as pd

url = '../exampleCode/dataset/kopo_channel_seasonality_new.csv'
selloutData = pd.read_csv(url)

selloutData.tail()

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY
124653,A10,PRODUCT60,201630,824.0
124654,A10,PRODUCT56,201630,275.0
124655,A10,PRODUCT61,201630,0.0
124656,A10,PRODUCT12,201630,15021.0
124657,A10,PRODUCT1,201630,568.0


In [84]:
# How to use it
def yearweekSeparation(inValue, option):
    """Return the year part or the week part of a YEARWEEK value as a string.

    Parameters
    ----------
    inValue : int or str
        YEARWEEK value such as 201402; it is converted with str() first.
    option : str
        "year" selects the year part; any other value selects the week part.

    Returns
    -------
    str
        Everything except the last two characters for "year", otherwise the
        last two characters.
    """
    yearweekText = str(inValue)

    # Slice from the right so that years of any length (1, 999, 10000, ...) work.
    if option == "year":
        return yearweekText[:-2]
    return yearweekText[-2:]

In [92]:
# The keyword argument given to apply is forwarded to yearweekSeparation;
# any option other than "year" returns the week part.
yearweekColumn = selloutData.YEARWEEK
selloutData["YEAR"] = yearweekColumn.apply(yearweekSeparation, option="year")
selloutData["WEEK"] = yearweekColumn.apply(yearweekSeparation, option="")
selloutData.tail()

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,YEAR,WEEK
124653,A10,PRODUCT60,201630,824.0,2016,30
124654,A10,PRODUCT56,201630,275.0,2016,30
124655,A10,PRODUCT61,201630,0.0,2016,30
124656,A10,PRODUCT12,201630,15021.0,2016,30
124657,A10,PRODUCT1,201630,568.0,2016,30


In [91]:
# WEEK is converted to int before it is compared against 52.
weekNumber = selloutData.WEEK.astype(int)
refinedSelloutData = selloutData.loc[weekNumber <= 52]
refinedSelloutData.tail()

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,YEAR,WEEK
124653,A10,PRODUCT60,201630,824.0,2016,30
124654,A10,PRODUCT56,201630,275.0,2016,30
124655,A10,PRODUCT61,201630,0.0,2016,30
124656,A10,PRODUCT12,201630,15021.0,2016,30
124657,A10,PRODUCT1,201630,568.0,2016,30


### --------------------------------------- 메 모 ---------------------------------------

In [73]:
# 1. Load the data and take a first look at it.
import pandas as pd

url = '../exampleCode/dataset/kopo_channel_seasonality_new.csv'
selloutData = pd.read_csv(url)

selloutData.head()

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY
0,A60,PRODUCT4,201402,71.0
1,A60,PRODUCT59,201402,22275.0
2,A60,PRODUCT34,201402,4463.0
3,A60,PRODUCT47,201402,0.0
4,A60,PRODUCT56,201402,23.0


In [74]:
# Variant that returns both parts as a list. The function itself works fine,
# but a problem shows up when its result is assigned with
# selloutData["YEAR"] = selloutData.YEARWEEK.apply( yearweekSeparation );
# concat has to be used instead (covered later).
def yearweekSeparation(inValue):
    """Split a YEARWEEK value into its year and week parts.

    Parameters
    ----------
    inValue : int or str
        YEARWEEK value such as 201402; it is converted with str() first.

    Returns
    -------
    list of str
        [year, week], where week is the last two characters and year is
        everything before them.
    """
    yearweekText = str(inValue)

    # Slice from the right so that years of any length (1, 999, 10000, ...) work.
    return [yearweekText[:-2], yearweekText[-2:]]

In [75]:
# Each element of the resulting Series is a [year, week] list.
yearweekSep = selloutData["YEARWEEK"].apply(yearweekSeparation)
yearweekSep

0         [2014, 02]
1         [2014, 02]
2         [2014, 02]
3         [2014, 02]
4         [2014, 02]
             ...    
124653    [2016, 30]
124654    [2016, 30]
124655    [2016, 30]
124656    [2016, 30]
124657    [2016, 30]
Name: YEARWEEK, Length: 124658, dtype: object

In [76]:
# Plain Python list of the [year, week] pairs, in row order.
yearweekSepList = yearweekSep.tolist()

In [80]:
# Turn the [year, week] pairs into a two-column frame and attach it to the
# right of selloutData.
yearweekSepDF = pd.DataFrame(yearweekSepList, columns=['YEAR', 'WEEK'])
result = pd.concat([selloutData, yearweekSepDF], axis="columns")
result

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,YEAR,WEEK
0,A60,PRODUCT4,201402,71.0,2014,02
1,A60,PRODUCT59,201402,22275.0,2014,02
2,A60,PRODUCT34,201402,4463.0,2014,02
3,A60,PRODUCT47,201402,0.0,2014,02
4,A60,PRODUCT56,201402,23.0,2014,02
...,...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,2016,30
124654,A10,PRODUCT56,201630,275.0,2016,30
124655,A10,PRODUCT61,201630,0.0,2016,30
124656,A10,PRODUCT12,201630,15021.0,2016,30


In [44]:
# How to use it
def yearweekSeparation(inValue, option, arg2_preFix):
    """Return the year or week part of a YEARWEEK value with a prefix prepended.

    Parameters
    ----------
    inValue : int or str
        YEARWEEK value such as 201402; it is converted with str() first.
    option : str
        "year" selects the year part; any other value selects the week part.
    arg2_preFix : str
        Text placed in front of the selected part.

    Returns
    -------
    str
        arg2_preFix followed by the selected part.
    """
    yearweekText = str(inValue)

    # Slice from the right so that years of any length (1, 999, 10000, ...) work.
    selectedPart = yearweekText[:-2] if option == "year" else yearweekText[-2:]
    return arg2_preFix + selectedPart

In [50]:
# Both keyword arguments are forwarded by apply to yearweekSeparation.
selloutData["YEAR"] = selloutData["YEARWEEK"].apply(
    yearweekSeparation,
    option="year",
    arg2_preFix="Hello_",
)

In [51]:
selloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,YEAR
0,A60,PRODUCT4,201402,71.0,Hello_2014
1,A60,PRODUCT59,201402,22275.0,Hello_2014
2,A60,PRODUCT34,201402,4463.0,Hello_2014
3,A60,PRODUCT47,201402,0.0,Hello_2014
4,A60,PRODUCT56,201402,23.0,Hello_2014
...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,Hello_2016
124654,A10,PRODUCT56,201630,275.0,Hello_2016
124655,A10,PRODUCT61,201630,0.0,Hello_2016
124656,A10,PRODUCT12,201630,15021.0,Hello_2016


### --------------------------------------- 메 모 ---------------------------------------

In [158]:
selloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,YEAR
0,A60,PRODUCT4,201402,71.0,"[2014, 02]"
1,A60,PRODUCT59,201402,22275.0,"[2014, 02]"
2,A60,PRODUCT34,201402,4463.0,"[2014, 02]"
3,A60,PRODUCT47,201402,0.0,"[2014, 02]"
4,A60,PRODUCT56,201402,23.0,"[2014, 02]"
...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,"[2016, 30]"
124654,A10,PRODUCT56,201630,275.0,"[2016, 30]"
124655,A10,PRODUCT61,201630,0.0,"[2016, 30]"
124656,A10,PRODUCT12,201630,15021.0,"[2016, 30]"


In [166]:
# 2. apply 함수 적용을 위한 로직을 함수로 구현하기.

def yearweekToYear(inValue):
    """Return the year part of a YEARWEEK value as a string.

    Parameters
    ----------
    inValue : int or str
        YEARWEEK value such as 201402; it is converted with str() first.

    Returns
    -------
    str
        Everything except the last two characters, e.g. "2014".
    """
    # Slice from the right so that years of any length (1, 999, 10000, ...) work.
    return str(inValue)[:-2]

def yearweekToWeek(inValue):
    """Return the week part of a YEARWEEK value as a string.

    Parameters
    ----------
    inValue : int or str
        YEARWEEK value such as 201402; it is converted with str() first.

    Returns
    -------
    str
        The last two characters, e.g. "02".
    """
    # Slice from the right so that years of any length (1, 999, 10000, ...) work.
    week = str(inValue)[-2:]
    return week

In [169]:
# 3. Apply the two functions defined above to the YEARWEEK column.
yearweekColumn = selloutData.YEARWEEK
selloutData['YEAR'] = yearweekColumn.apply(yearweekToYear)
selloutData['WEEK'] = yearweekColumn.apply(yearweekToWeek)
selloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,YEAR,WEEK
0,A60,PRODUCT4,201402,71.0,2014,02
1,A60,PRODUCT59,201402,22275.0,2014,02
2,A60,PRODUCT34,201402,4463.0,2014,02
3,A60,PRODUCT47,201402,0.0,2014,02
4,A60,PRODUCT56,201402,23.0,2014,02
...,...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,2016,30
124654,A10,PRODUCT56,201630,275.0,2016,30
124655,A10,PRODUCT61,201630,0.0,2016,30
124656,A10,PRODUCT12,201630,15021.0,2016,30


In [None]:
# Select the YEARWEEK column by its name. Passing the column itself
# (selloutData.YEARWEEK) would use its values (201402, ...) as column labels,
# and none of those exist in the frame.
selloutData.loc[:, "YEARWEEK"]

In [None]:
# 4. Verification
# NOTE(review): QTY_NEW is only created in the first exercise, and the cells in
# between re-read selloutData from the CSV, so this check presumably needs that
# column to be rebuilt first. Confirm before running.


selloutData[(selloutData.QTY < 0) & (selloutData.QTY_NEW != 0)]

In [None]:
# apply
def normalizeFunc(inValue):
    """Map a value below 3 to the smallest of 1, 2, 3 that is strictly greater than it.

    Parameters
    ----------
    inValue : number
        Value to normalize, e.g. 1.2.

    Returns
    -------
    number
        1 if inValue < 1, 2 if inValue < 2, 3 if inValue < 3; otherwise
        inValue itself, unchanged.
    """
    if inValue < 1:
        return 1
    if inValue < 2:
        return 2
    if inValue < 3:
        return 3
    return inValue

# Run normalizeFunc on every EMI value and store the result in a new column.
csData['EMI_NEW'] = csData['EMI'].apply(normalizeFunc)
csData

In [None]:
# map: translate each CUSTTYPE label into its integer code through a dictionary.
typeMap = {
    'Big-Screen-lover': 0,
    'Sleeping-dog': 1,
    'Early-bird': 2,
}
csData['CUSTTYPE_NEW'] = csData['CUSTTYPE'].map(typeMap)

In [None]:
# Verification: rows with PRODUCTAGE <= 1 whose PRODUCTAGE_NEW is not 1 (expect no rows).
shouldBeOne = csData.PRODUCTAGE <= 1
csData[shouldBeOne & (csData.PRODUCTAGE_NEW != 1)]