In [2]:
import pandas as pd

## DDA
- 데이터의 구조와 타입을 확인 (정형 데이터의 구조 확인 Index - Columns - Values)
    - Index : 데이터의 순서 / 데이터의 개수를 확인 (데이터의 개수에 따라 분석 기법이 달라짐)
    - Columns : 데이터의 항목 / 각 데이터 항목의 타입 확인 (데이터 타입에 따라 분석 기법이 달라짐)
        - 연속형 : int / float
        - 범주형 : str / object
        - 날짜형/순서형 : datetime --> 실무에서 중요. 주차별 판매량, 평일 판매량 등
    - Value : 데이터 값 / 데이터의 형식 확인
        - EX. 날짜 데이터를 2024-05-09 VS 2024년 5월 9일 등 형식이 다를 수 있음 -> Python에서 데이터 전처리/분석 할 때 오류 발생의 원인
        - 결측값 (Missing Value) : 데이터 수집이나 처리과정에서 누락이나 오류로 인해 발생한 비어있는 값 (None, NaN, Null, Na 등으로 표현) 

In [3]:
df1 = pd.read_csv('data/01_Data.csv')
print(df1.shape)
df1.head()

(51301, 20)


Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,9.0,새마을금고
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
2,3,66756657,렌탈,일반계약,홈쇼핑/방송,2019-02-28,60,CMS,DES-1,96900,개인,48.0,경기도,경기도,계약확정,0,없음,여자,8.0,우리은행
3,4,66423450,멤버십,멤버십3유형,재계약,2019-05-13,12,CMS,DES-1,66900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,5.0,농협회원조합
4,5,66423204,멤버십,멤버십3유형,재계약,2019-05-10,12,CMS,DES-1,66900,개인,60.0,경기도,경기도,기간만료,12,있음,남자,8.0,농협회원조합


In [None]:
df1.info

In [4]:
# 기술통계량확인
df1.describe()

Unnamed: 0,Index,Member_ID,Term,Amount_Month,Age,Overdue_count,Credit_Rank
count,51301.0,51301.0,51301.0,51301.0,44329.0,51301.0,42520.0
mean,25651.703612,62664320.0,55.639149,93994.974289,50.024093,0.161381,3.42881
std,14809.828628,12161460.0,12.009915,15304.263988,10.983877,1.122193,2.213453
min,1.0,25687980.0,12.0,54603.0,25.0,0.0,0.0
25%,12826.0,66431480.0,60.0,81900.0,42.0,0.0,1.0
50%,25652.0,66765780.0,60.0,96900.0,49.0,0.0,3.0
75%,38477.0,66781600.0,60.0,98400.0,57.0,0.0,5.0
max,51302.0,66969860.0,60.0,215700.0,102.0,15.0,10.0


In [5]:
df1['State'].unique()

array(['계약확정', '기간만료', '해약확정', '해약진행중'], dtype=object)

In [6]:
df1['State'].value_counts()

State
계약확정     50620
해약확정       622
기간만료        45
해약진행중       14
Name: count, dtype: int64

In [7]:
# 범주형데이터 기술통계
df1.describe(include=object)

# top : 최빈값 , freq : 최빈값 개수

Unnamed: 0,Sales_Type,Contract_Type,Channel,Datetime,Payment_Type,Product_Type,Customer_Type,Address1,Address2,State,Overdue_Type,Gender,Bank
count,51301,51301,51301,51301,51301,51301,51299,51299,51299,51301,51301,51301,48542
unique,2,9,16,577,5,6,2,8,14,4,2,2,47
top,렌탈,프로모션계약,영업방판,2019-01-31,CMS,DES-1,개인,경기도,경기도,계약확정,없음,여자,롯데카드
freq,46483,15811,23767,1167,32825,39133,46263,18353,14883,50620,49110,35602,9516


## 데이터전처리
    - 데이터 추출
    - 데이터 정렬
    - 데이터 요약
    - 데이터 필터
    - 날짜 데이터 처리
    - 데이터 병합 / 재구조화
    - 결측값

In [8]:
# 데이터 추출 (행 단위 추출 / 열 단위 추출)

df1.head() # 상위 n개의 데이터 추출
df1.tail() # 하위 n개의 데이터 추출

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
51296,51298,66579515,렌탈,프로모션계약,대형마트A,2019-03-01,60,CMS,DES-3A,96900,개인,47.0,경기도,경기도,계약확정,0,없음,남자,,기업은행
51297,51299,66799558,렌탈,일반계약,대형마트A,2019-04-01,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,8.0,새마을금고
51298,51300,66799197,렌탈,프로모션계약,영업방판,2019-04-01,39,카드이체,ERA,120900,개인,65.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드
51299,51301,66792778,렌탈,일반계약,홈쇼핑/방송,2020-02-06,60,카드이체,DES-1,96900,개인,54.0,서울특별시,서울특별시,계약확정,0,없음,여자,2.0,롯데카드
51300,51302,66799607,렌탈,일반계약,홈쇼핑/방송,2019-04-24,60,CMS,DES-1,96900,개인,53.0,서울특별시,서울특별시,계약확정,0,없음,여자,8.0,신한은행


In [9]:
# 행 단위 추출
df1.iloc[100:110] # iloc (index location)

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
100,101,66758264,렌탈,패키지계약,홈쇼핑/방송,2019-01-01,60,CMS,DES-1,81900,개인,38.0,전라도,광주광역시,계약확정,0,없음,남자,2.0,광주은행
101,102,66758264,렌탈,패키지계약,홈쇼핑/방송,2019-02-09,60,CMS,DES-2,81900,개인,38.0,전라도,광주광역시,계약확정,0,없음,남자,2.0,광주은행
102,103,66758918,렌탈,일반계약,전문매장H,2019-01-31,60,CMS,DES-1,96900,개인,36.0,전라도,광주광역시,계약확정,0,없음,여자,1.0,국민은행
103,104,66437196,렌탈,프로모션계약,영업방판,2019-01-31,60,CMS,DES-3A,90900,개인,44.0,전라도,광주광역시,계약확정,0,없음,여자,2.0,농협회원조합
104,105,66420849,렌탈,프로모션계약,영업방판,2019-02-21,60,CMS,DES-2,90900,개인,50.0,전라도,광주광역시,계약확정,0,없음,남자,3.0,외환은행
105,106,66758580,렌탈,패키지계약,영업방판,2019-06-11,60,CMS,DES-1,134700,개인,53.0,전라도,광주광역시,해약확정,0,없음,남자,2.0,농협중앙회
106,107,66282355,렌탈,교체계약,영업방판,2019-07-14,60,CMS,DES-1,96900,개인,63.0,경기도,경기도,계약확정,0,없음,여자,1.0,우리은행
107,108,66758599,렌탈,패키지계약,홈쇼핑/방송,2019-01-01,60,CMS,DES-1,81900,개인,46.0,경기도,경기도,계약확정,0,없음,남자,3.0,신한은행
108,109,66758599,렌탈,패키지계약,홈쇼핑/방송,2019-06-22,60,CMS,DES-2,81900,개인,46.0,경기도,경기도,계약확정,0,없음,남자,3.0,신한은행
109,110,25742697,렌탈,일반계약,영업방판,2019-08-16,60,CMS,DES-1,96900,사업자,42.0,경기도,경기도,계약확정,0,없음,여자,,국민은행


함수 기능을 사용할 때 : 소괄호()

데이터 구조 자체에서 접근할 때 : 대괄호[] df1.iloc[n:n] or 소괄호 사용X df1.columus

In [None]:
# 열 단위 추출
df1['Amount_Month'] # 하나의 열을 확인할 때 --> 시리즈형태로 추출

In [None]:
df1[['Amount_Monte','State']] # 하나 이상 열 확인할 때 --> DataFrame으로 추출

In [11]:
df1[['Amount_Month','State']].head(10)

Unnamed: 0,Amount_Month,State
0,96900,계약확정
1,102900,계약확정
2,96900,계약확정
3,66900,계약확정
4,66900,기간만료
5,90900,계약확정
6,98400,계약확정
7,80400,계약확정
8,102900,계약확정
9,105900,계약확정


In [12]:
# 정렬
df1.sort_values(by='Amount_Month') # 오름차순, default
df1.sort_values(by='Amount_Month', ascending=False) # 내림차순

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
19773,19775,25740247,렌탈,프로모션계약,전단홍보,2019-11-14,39,가상계좌,MMC,215700,사업자,,경상도,경상도,계약확정,0,없음,여자,,
28147,28149,25704137,렌탈,프로모션계약,전단홍보,2019-07-25,39,CMS,MMC,215700,사업자,,경상도,경상도,계약확정,0,없음,여자,,국민은행
19413,19415,25710320,렌탈,프로모션계약,영업방판,2020-01-29,39,CMS,MMC,215700,사업자,,경기도,경기도,계약확정,0,없음,여자,,기업은행
19412,19414,25710321,렌탈,프로모션계약,영업방판,2019-09-15,39,CMS,MMC,215700,사업자,,경기도,경기도,계약확정,0,없음,여자,,기업은행
37522,37524,25733781,렌탈,프로모션계약,영업방판,2020-07-02,39,CMS,MMC,215700,사업자,,경기도,경기도,계약확정,0,없음,여자,,기업은행
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45304,45306,66793756,멤버십,멤버십2유형,전문매장H,2019-10-07,36,무통장,DES-1,54603,개인,,충청도,충청도,계약확정,0,없음,여자,,
50428,50430,66797402,멤버십,멤버십2유형,전문매장H,2019-12-27,36,무통장,DES-1,54603,개인,,경기도,경기도,계약확정,0,없음,여자,,
45302,45304,66793342,멤버십,멤버십2유형,전문매장H,2019-04-03,36,무통장,DES-1,54603,개인,,강원도,강원도,계약확정,0,없음,여자,,
41153,41155,66790864,멤버십,멤버십2유형,전문매장H,2019-02-16,36,무통장,DES-1,54603,개인,,경기도,경기도,계약확정,0,없음,여자,,


In [None]:
# 월렌탈 비용이 높은 상위 100명의 계약명단을 뽑아 엑셀로 저장
df1.sort_values(by='Amount_Month',ascending=False).head(100).to_excel('result.xlsx')

In [16]:
# 데이터 필터(특정 조건에 해당하는 데이터를 추출)
# 층별화 : 범주화 데이터 또는 연속형 데이터의 특정 구간에 따라 테이터의 통계적 특성이 달라질 수 있음

# 월렌탈 비용이 10만원 이상인 명단을 추출
cond1 = df1['Amount_Month']>=100000
df1.loc[cond1]

# df1[df1['Amount_Month']>=100000] 도 가능하지만 조건이 많아질 때 코드 더럽


Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
8,9,66758007,렌탈,일반계약,영업방판,2019-06-30,60,카드이체,DES-3A,102900,개인,62.0,경상도,경상도,계약확정,0,없음,여자,2.0,롯데카드
9,10,66756702,렌탈,프로모션계약,영업방판,2019-06-30,60,CMS,DES-1,105900,개인,51.0,경상도,경상도,계약확정,0,없음,여자,1.0,신한은행
19,20,66755490,렌탈,프로모션계약,영업방판,2020-01-17,60,카드이체,DES-1,105900,개인,58.0,경상도,부산광역시,계약확정,0,없음,여자,1.0,롯데카드
21,22,66758336,렌탈,프로모션계약,전문매장Z,2020-04-25,60,CMS,DES-1,111900,개인,72.0,경상도,부산광역시,계약확정,0,없음,남자,9.0,농협회원조합
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51286,51288,66799302,렌탈,프로모션계약,영업방판,2019-05-21,60,카드이체,DES-1,105900,개인,39.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드
51287,51289,66444051,렌탈,교체계약,대형마트A,2019-09-29,60,CMS,DES-1,102900,개인,90.0,서울특별시,서울특별시,계약확정,0,없음,남자,,신한은행
51288,51290,66799492,렌탈,프로모션계약,영업방판,2019-02-24,60,카드이체,DES-1,105900,개인,52.0,경기도,경기도,계약확정,0,없음,남자,,삼성카드
51291,51293,25731743,렌탈,일반계약,영업방판,2020-02-12,60,CMS,DES-3A,102900,사업자,,경기도,경기도,계약확정,10,있음,여자,,농협중앙회


In [17]:
# 월렌탈 비용이 10만원 이상이면서 계약기간이 60개월 이상인 고객 명단 추출

cond1 = df1['Amount_Month']>=100000
cond2 = df1['Term'] >= 60 
df1[cond1&cond2] # 두 조건 모두 만족하는 경우
df1[cond1|cond2] # 두 조건 중 하나이상 

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
8,9,66758007,렌탈,일반계약,영업방판,2019-06-30,60,카드이체,DES-3A,102900,개인,62.0,경상도,경상도,계약확정,0,없음,여자,2.0,롯데카드
9,10,66756702,렌탈,프로모션계약,영업방판,2019-06-30,60,CMS,DES-1,105900,개인,51.0,경상도,경상도,계약확정,0,없음,여자,1.0,신한은행
19,20,66755490,렌탈,프로모션계약,영업방판,2020-01-17,60,카드이체,DES-1,105900,개인,58.0,경상도,부산광역시,계약확정,0,없음,여자,1.0,롯데카드
21,22,66758336,렌탈,프로모션계약,전문매장Z,2020-04-25,60,CMS,DES-1,111900,개인,72.0,경상도,부산광역시,계약확정,0,없음,남자,9.0,농협회원조합
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51279,51281,66798237,렌탈,패키지계약,직영계열사A,2019-03-09,60,CMS,DES-3A,101400,개인,,경기도,인천광역시,계약확정,0,없음,여자,4.0,국민은행
51286,51288,66799302,렌탈,프로모션계약,영업방판,2019-05-21,60,카드이체,DES-1,105900,개인,39.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드
51287,51289,66444051,렌탈,교체계약,대형마트A,2019-09-29,60,CMS,DES-1,102900,개인,90.0,서울특별시,서울특별시,계약확정,0,없음,남자,,신한은행
51288,51290,66799492,렌탈,프로모션계약,영업방판,2019-02-24,60,카드이체,DES-1,105900,개인,52.0,경기도,경기도,계약확정,0,없음,남자,,삼성카드


### 데이터전처리

In [5]:
cond1 = df1['Channel'] == '대형마트A'
df1.loc[cond1]

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
1342,1343,66423896,멤버십,멤버십3유형,대형마트A,2019-01-05,12,CMS,DES-1,66900,개인,65.0,경상도,경상도,계약확정,0,없음,여자,4.0,농협회원조합
3658,3659,66423932,멤버십,멤버십3유형,대형마트A,2019-10-03,12,CMS,DES-1,66900,개인,46.0,경기도,경기도,계약확정,0,없음,여자,3.0,농협중앙회
3762,3763,66424323,멤버십,멤버십3유형,대형마트A,2019-12-20,12,CMS,DES-1,66900,개인,48.0,경기도,경기도,계약확정,0,없음,남자,2.0,농협회원조합
5255,5256,25701294,멤버십,멤버십3유형,대형마트A,2019-05-03,12,CMS,DES-1,66900,사업자,,경상도,대구광역시,계약확정,0,없음,여자,,농협중앙회
7059,7060,66425806,멤버십,멤버십3유형,대형마트A,2019-04-28,12,CMS,DES-1,66900,개인,39.0,경상도,부산광역시,계약확정,0,없음,여자,2.0,부산은행
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51289,51291,66441028,렌탈,교체계약,대형마트A,2020-01-16,60,CMS,DES-2,78900,개인,56.0,경기도,경기도,계약확정,0,없음,남자,1.0,롯데카드
51293,51295,66799369,렌탈,일반계약,대형마트A,2020-07-10,60,CMS,DES-1,96900,개인,74.0,경기도,경기도,계약확정,0,없음,여자,1.0,국민은행
51294,51296,66796007,렌탈,프로모션계약,대형마트A,2019-03-11,60,카드이체,DES-3A,90900,개인,43.0,경기도,경기도,계약확정,0,없음,남자,,BC카드
51296,51298,66579515,렌탈,프로모션계약,대형마트A,2019-03-01,60,CMS,DES-3A,96900,개인,47.0,경기도,경기도,계약확정,0,없음,남자,,기업은행


In [9]:
df1['Channel'].value_counts()


Channel
영업방판       23767
홈쇼핑/방송     12977
대형마트A       4725
전문매장H       3227
재계약         2193
대형마트E       1497
전문매장Z       1206
전단홍보         560
홈쇼핑/인터넷      375
대형마트H        225
직영유통사        188
자체홈페이지       120
대형마트C        119
대형마트N         83
직영계열사A        20
직영계열사B        19
Name: count, dtype: int64

In [None]:
# isin() : 특정 리스트 내에 있는 값이 포함되어 있으면 True, 없으면 False
df1['Channel'].isin(['영업판매','대형마트A','홈쇼핑/방송'])

In [10]:
df1['Channel'].value_counts().index[0:3]

Index(['영업방판', '홈쇼핑/방송', '대형마트A'], dtype='object', name='Channel')

In [12]:
cond1 = df1['Channel'].isin(df1['Channel'].value_counts().index[0:3])
df1[cond1]

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,9.0,새마을금고
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
2,3,66756657,렌탈,일반계약,홈쇼핑/방송,2019-02-28,60,CMS,DES-1,96900,개인,48.0,경기도,경기도,계약확정,0,없음,여자,8.0,우리은행
5,6,66165241,렌탈,프로모션계약,영업방판,2019-07-29,60,CMS,DES-3A,90900,개인,53.0,경상도,경상도,계약확정,0,없음,여자,5.0,농협중앙회
6,7,66751017,렌탈,패키지계약,홈쇼핑/방송,2019-01-30,60,카드이체,DES-1,98400,개인,66.0,경상도,경상도,계약확정,0,없음,남자,1.0,롯데카드
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51296,51298,66579515,렌탈,프로모션계약,대형마트A,2019-03-01,60,CMS,DES-3A,96900,개인,47.0,경기도,경기도,계약확정,0,없음,남자,,기업은행
51297,51299,66799558,렌탈,일반계약,대형마트A,2019-04-01,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,8.0,새마을금고
51298,51300,66799197,렌탈,프로모션계약,영업방판,2019-04-01,39,카드이체,ERA,120900,개인,65.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드
51299,51301,66792778,렌탈,일반계약,홈쇼핑/방송,2020-02-06,60,카드이체,DES-1,96900,개인,54.0,서울특별시,서울특별시,계약확정,0,없음,여자,2.0,롯데카드


In [13]:
# 데이터 요약
df1.pivot_table(index='Product_Type', values='Amount_Month',aggfunc='mean')

Unnamed: 0_level_0,Amount_Month
Product_Type,Unnamed: 1_level_1
DES-1,94424.545166
DES-2,87556.390618
DES-3A,94706.900529
DES-R4,129444.954128
ERA,119744.720497
MMC,189450.0


In [14]:
# 제품군 별, 계약 유형 별 월렌탈비용의 평균
df1.pivot_table(index=['Sales_Type','Product_Type'], values='Amount_Month',aggfunc='mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,Amount_Month
Sales_Type,Product_Type,Unnamed: 2_level_1
렌탈,DES-1,98694.923613
렌탈,DES-2,87901.899247
렌탈,DES-3A,94752.196532
렌탈,DES-R4,130616.25
렌탈,ERA,119744.720497
렌탈,MMC,189450.0
멤버십,DES-1,63161.70657
멤버십,DES-2,57725.242718
멤버십,DES-3A,75900.0
멤버십,DES-R4,75900.0


In [17]:
# 제품군 별, 계약 유형 별,(행) 날짜별(열) 월렌탈비용의 평균
df1.pivot_table(index=['Sales_Type','Product_Type'], 
                columns='Datetime', values='Amount_Month',aggfunc=['mean','sum'],fill_value=0, margins=True)

# index : 구분하고자하는 범주형 데이터
# columns : 구분하고자하는 범주형 데이터
# values : 계산하고자하는 연속형 데이터
# aggfunc : 계산하고자하는 통계량
# fill_value : 결측값 대신 넣을 값
# margins : 모든 통계량의 합

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Datetime,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06,2019-01-07,2019-01-08,2019-01-09,2019-01-10,...,2020-07-30,2020-07-31,2020-08-01,2020-08-02,2020-08-03,2020-08-04,2020-08-05,2020-08-06,2020-08-07,All
Sales_Type,Product_Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
렌탈,DES-1,96512.903226,100182.857143,97735.714286,100041.666667,98688.75,98118.75,97832.307692,97435.227273,98835.714286,99323.529412,...,5973000,7940700,4685700,2247300,2157300,2126400,2089200,1992600,1746600,3398066220
렌탈,DES-2,90175.0,86400.0,87115.384615,84207.692308,86427.272727,86169.230769,84983.333333,87150.0,90000.0,85164.705882,...,1033800,1761000,1314000,531900,554400,515400,697260,439500,87900,781711590
렌탈,DES-3A,90900.0,93300.0,94900.0,90900.0,90900.0,0.0,90900.0,102900.0,94900.0,90900.0,...,278700,296700,187800,0,0,90900,0,90900,90900,196705560
렌탈,DES-R4,0.0,0.0,0.0,126900.0,0.0,0.0,0.0,0.0,0.0,135900.0,...,144900,0,0,0,0,0,126900,0,0,41797200
렌탈,ERA,0.0,120900.0,120900.0,120900.0,0.0,0.0,120900.0,0.0,120900.0,120900.0,...,120900,362700,0,0,241800,0,120900,0,0,77115600
렌탈,MMC,0.0,0.0,0.0,0.0,0.0,185700.0,0.0,185700.0,0.0,0.0,...,0,0,0,0,0,0,0,0,185700,22734000
멤버십,DES-1,69900.0,66900.0,66900.0,66900.0,66900.0,71400.0,66900.0,0.0,66900.0,66900.0,...,383322,730365,671079,188403,192039,125643,245382,247758,58743,297049506
멤버십,DES-2,0.0,0.0,55200.0,0.0,0.0,0.0,55200.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,5945700
멤버십,DES-3A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,379500
멤버십,DES-R4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,531300


In [20]:
df1['Datetime_dt'] = pd.to_datetime(df1['Datetime'])
df1

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,...,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank,Datetime_dt
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,...,42.0,경기도,경기도,계약확정,0,없음,여자,9.0,새마을금고,2019-05-06
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,...,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드,2020-02-20
2,3,66756657,렌탈,일반계약,홈쇼핑/방송,2019-02-28,60,CMS,DES-1,96900,...,48.0,경기도,경기도,계약확정,0,없음,여자,8.0,우리은행,2019-02-28
3,4,66423450,멤버십,멤버십3유형,재계약,2019-05-13,12,CMS,DES-1,66900,...,39.0,경기도,경기도,계약확정,0,없음,남자,5.0,농협회원조합,2019-05-13
4,5,66423204,멤버십,멤버십3유형,재계약,2019-05-10,12,CMS,DES-1,66900,...,60.0,경기도,경기도,기간만료,12,있음,남자,8.0,농협회원조합,2019-05-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51296,51298,66579515,렌탈,프로모션계약,대형마트A,2019-03-01,60,CMS,DES-3A,96900,...,47.0,경기도,경기도,계약확정,0,없음,남자,,기업은행,2019-03-01
51297,51299,66799558,렌탈,일반계약,대형마트A,2019-04-01,60,CMS,DES-1,96900,...,42.0,경기도,경기도,계약확정,0,없음,여자,8.0,새마을금고,2019-04-01
51298,51300,66799197,렌탈,프로모션계약,영업방판,2019-04-01,39,카드이체,ERA,120900,...,65.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드,2019-04-01
51299,51301,66792778,렌탈,일반계약,홈쇼핑/방송,2020-02-06,60,카드이체,DES-1,96900,...,54.0,서울특별시,서울특별시,계약확정,0,없음,여자,2.0,롯데카드,2020-02-06


In [21]:
df1['Year'] = df1['Datetime_dt'].dt.year # 데이터구조에서 가져오는 것이므로 소괄호 없쥬
df1['Month'] = df1['Datetime_dt'].dt.month
df1['Day_of_Week'] = df1['Datetime_dt'].dt.day_name() # 데이터가져와서 변환(요일로)해아하닌깐 소괄호 있쥬
df1['Week'] = df1['Datetime_dt'].dt.isocalendar().week
df1.head(2)

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,...,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank,Datetime_dt,Year,Month,Day_of_Week,Week
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,...,0,없음,여자,9.0,새마을금고,2019-05-06,2019,5,Monday,19
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,...,0,없음,남자,2.0,현대카드,2020-02-20,2020,2,Thursday,8


In [28]:
# 파생변수 생성
# 파생변수 : 기존에 있는 데이터(항목)을 이용해 계산한 새로운 항목
df1['Total_Amount'] = df1['Term'] * df1['Amount_Month']

In [29]:
df1.pivot_table(index='Product_Type', columns='Year',values='Total_Amount',aggfunc='sum')

Year,2019,2020
Product_Type,Unnamed: 1_level_1,Unnamed: 2_level_1
DES-1,167173962120,42612206976
DES-2,35856775800,8887944960
DES-3A,9361251360,2384535600
DES-R4,2007975600,506232000
ERA,2454835500,552672900
MMC,684063900,202562100


In [None]:
# loc 함수 활용해 파생 변수 생성
# loc[조건, '파생변수명'] = '입력하고자 하는 값'
cond2 = df1['Age'] >= 50
cond1 = df1['Age'] >= 60


## 순서중요 (수정이 되므로)
# 
df1.loc[cond1, '연령대'] = '60대이상'
df1.loc[cond2, '연령대'] = '50대'

array([ 42.,  39.,  48.,  60.,  53.,  66.,  62.,  51.,  35.,  50.,  59.,
        61.,  32.,  87.,  58.,  43.,  72.,  nan,  40.,  73.,  38.,  57.,
        63.,  29.,  65.,  37.,  34.,  67.,  27.,  55.,  41.,  69.,  36.,
        56.,  68.,  77.,  83.,  54.,  52.,  49.,  44.,  46.,  79.,  45.,
        75.,  74.,  47.,  76.,  31.,  64.,  71.,  28.,  33.,  30.,  80.,
        88.,  70.,  82.,  90.,  92.,  84.,  85.,  81.,  26.,  86.,  78.,
        89.,  91.,  96.,  99.,  94.,  93., 102.,  95.,  25.])

In [7]:
# apply 구문을 이용해 파생 변수 생성
# 한줄씩 가지고와서 

# serise --> 데이터의구조 즉,상자
# object, str, int --> 데이터의타입


def func2(row) :
    
    if pd.isnull(row) : # NaN도 str형으로 변환해서 'n대'로 return하기 때문
        return row
    else : return str(row)[0] + '0대'

    # if row is Null : row는 하나의 데이터값이므로 is null이 안됨
    # if row == 'nan' : null값은 float형이므로 str형이 안됨
    # if row == np.nan : numpy에서의 null과 pandas에서의 null이 호환이 안되는것


df1['연령대_apply3'] = df1['Age'].apply(func2)
df1['연령대_apply3'].value_counts()

연령대_apply3
40대    16283
50대    12522
n0대     6972
30대     6798
60대     5947
70대     1737
80대      576
20대      407
90대       57
10대        2
Name: count, dtype: int64

In [None]:
# apply 구문을 이용해 파생 변수 생성
# 한줄씩 가지고와서 
def func2(row) :
    if pd.isnull(row) : # NaN도 str형으로 변환해서 'n대'로 return하기 때문
        return row
    else : return str(row)[0] + '0대'

df1['연령대_apply'] = df1['Age'].apply(func2)
df1['연령대_apply'].value_counts()