<a href="https://colab.research.google.com/github/nohjuhyeon/study_AIs/blob/main/docs/quests/MLs/RentalOfContractType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 지도학습(Supervised Learning)
- 목표 변수(target, label, Y) 존재
- 설명 변수(feature, x)

## 데이터

In [181]:
import pandas as pd
df_ROCT = pd.read_csv('/content/RentalCarOfContractType.csv')
df_ROCT

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0
2,66756657,렌탈,Normal,홈쇼핑/방송,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,8.0,우리은행,정상,60.0,2363.0
3,66423450,멤버십,TAS,렌탈재계약,2019-10-20,12,CMS,K1,66900,계약확정,0,없음,5.0,농협은행,정상,60.0,2449.0
4,66423204,멤버십,TAS,렌탈재계약,2019-10-20,12,CMS,K1,66900,해약확정,12,있음,8.0,농협은행,해약,51.0,1942.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51299,66579515,,Promotion,R관리방판,2020-02-03,60,무통장,K3,96900,계약확정,0,없음,,롯데카드,정상,,
51300,66799558,렌탈,Normal,영업방판,2020-02-03,60,카드이체,K1,96900,해약확정,0,없음,8.0,롯데카드,해약,39.0,1753.0
51301,66799197,렌탈,Promotion,홈쇼핑/방송,2020-02-03,39,무통장,K5,120900,해약확정,0,없음,1.0,,해약,51.0,2217.0
51302,66792778,,Normal,렌탈총판,2020-02-03,60,카드이체,K1,96900,계약확정,0,없음,2.0,신한카드,정상,64.0,2588.0


In [182]:
df_ROCT.isnull().sum()

id                       0
type_of_contract         4
type_of_contract2        1
channel                  0
datetime                 0
Term                     0
payment_type             0
product                  1
amount                   0
state                    0
overdue_count            0
overdue                  2
credit rating         8783
bank                  2760
cancellation            25
age                  10795
Mileage              10795
dtype: int64

### features,target 선택
- target : age
- features : amount, overdue_count, credit rating

### 데이터 전처리(Pre-Processing)

In [183]:
# 컬럼 추출
df_ROCT_extract = df_ROCT.loc[:,['age','amount','overdue_count']]
df_ROCT_extract[:2]

Unnamed: 0,age,amount,overdue_count
0,43.0,96900,0
1,62.0,102900,0


In [184]:
# 결측치와 이상치 확인
df_ROCT_extract.isnull().sum()

age              10795
amount               0
overdue_count        0
dtype: int64

In [185]:
df_ROCT_extract = df_ROCT_extract.dropna(subset=['age']).copy()
df_ROCT_extract.isnull().sum()

age              0
amount           0
overdue_count    0
dtype: int64

In [186]:
# 공백 여부 확인 : X
condition_age = "age == ''"
condition_amount = "amount ==''"
condition_overdue = "overdue_count ==''"
conditions = f"{condition_age} or {condition_amount} or {condition_overdue}"
df_ROCT_extract.query(conditions)

Unnamed: 0,age,amount,overdue_count


### 데이터 분리 : train과 test set 분리
- target과 labels(features)분리
- train 과 test set 분리

In [187]:
target = df_ROCT_extract['age']
features = df_ROCT_extract[['amount', 'overdue_count']]
target.shape,features.shape

((40509,), (40509, 2))

In [188]:
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features,target,test_size=0.3,random_state=2)
features_train.shape,features_test.shape,target_train.shape,target_test.shape
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

((28356, 2), (12153, 2), (28356,), (12153,))

## 모델

### 목표변수 따른 모델 선택 - 연속형
- 목표변수
- 설명변수

In [189]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [190]:
# 학습 진행
model.fit(features_train,target_train)

## 평가

In [191]:
from sklearn.metrics import r2_score
target_train_predict = model.predict(features_train)
target_train.shape,target_train_predict.shape # 원래 정답과 학습 후 정답
# https://scikit-learn.org/0.17/modules/generated/sklearn.metrics.r2_score.html
# r2_score 사각형의 넓이를 갖고 평가

((28356,), (28356,))

In [192]:
# train data set에 대한 평가
r2_score(target_train, target_train_predict)

4.722363321685297e-05

In [193]:
target_test_predict = model.predict(features_test)
target_test.shape,target_test_predict.shape

((12153,), (12153,))

In [194]:
r2_score(target_test,target_test_predict)

-0.0002949132366203422

## 서비스
- 사용자 입력 시 주의사항 : 학습 때 사용한 포멧을 그대로 유지


In [195]:
df_ROCT[df_ROCT['age'].isna()]['amount'].iloc[0]

80400

In [196]:
df_ROCT['age'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
51299     True
51300    False
51301    False
51302    False
51303    False
Name: age, Length: 51304, dtype: bool

In [197]:
# 사용자 입력 :  [['amount','overdue_count','credit_rating']]
# model.predict([df_ROCT[df_ROCT['age'].isna()]['amount'].loc[0],df_ROCT[df_ROCT['age'].isna()]['overdue_count'][0],df_ROCT[df_ROCT['age'].isna()]['credit_rating']][0])
model.predict(df_ROCT[df_ROCT['age'].isna()][['amount','overdue_count']])
df_ROCT.loc[df_ROCT['age'].isna(),'age'] = model.predict(df_ROCT[df_ROCT['age'].isna()][['amount','overdue_count']])
df_ROCT.loc[df_ROCT['age'].isna(),'age']

Series([], Name: age, dtype: float64)

In [198]:
df_ROCT.isnull().sum()

id                       0
type_of_contract         4
type_of_contract2        1
channel                  0
datetime                 0
Term                     0
payment_type             0
product                  1
amount                   0
state                    0
overdue_count            0
overdue                  2
credit rating         8783
bank                  2760
cancellation            25
age                      0
Mileage              10795
dtype: int64

In [126]:
# 재 사용 위해 model을 파일로 저장
import pickle # 메모리 인스턴스클래스를 이진파일로 저장

In [206]:
with open('RentalCarOfContractType.pkl','wb') as pickle_file:
  pickle.dump(obj=model,file=pickle_file)

In [207]:
df_ROCT.to_csv('RentalCarOfContractType_ageRegression.csv',index=False,encoding='utf-8')