<a href="https://colab.research.google.com/github/sjungmin98/study_data_analytics/blob/main/docs/quests/MLs/RentalCarOfContactType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 데이터

In [1]:
import pandas as pd
# Load the rental-contract dataset from the working directory.
df_RCOCT = pd.read_csv('RentalCarOfContractType.csv')
# Preview the first two rows.
df_RCOCT.head(2)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0


### features, target 선택
- target : 'age'
- features : 'amount', 'overdue_count'

### 데이터 전처리(Pre-Processing)

In [2]:
# Keep only rows where both model features are present.
feature_columns = ['amount', 'overdue_count']
df_RCOCT = df_RCOCT.dropna(subset=feature_columns, how='any')

In [3]:
# Rows with a known 'age' form the supervised training pool.
train_data = df_RCOCT.dropna(subset=['age'])

In [4]:
# Narrow the frame to the two features plus the target for inspection.
df_RCOCT_extract = df_RCOCT[['amount', 'overdue_count', 'age']]
df_RCOCT_extract.head(2)

Unnamed: 0,amount,overdue_count,age
0,96900,0,43.0
1,102900,0,62.0


In [5]:
# Check for missing values and outliers: count nulls per column.
df_RCOCT_extract.isna().sum()
# TODO: blank (empty-string) values still need to be checked.

amount               0
overdue_count        0
age              10795
dtype: int64

### 데이터 분리
- target(label)과 features 분리
- train과 test set 분리

In [6]:
# Split the training pool into the predictor matrix and the target vector.
features = train_data.loc[:, ['amount', 'overdue_count']]
target = train_data.loc[:, 'age']

# Both must have the same number of rows.
target.shape, features.shape

((40509,), (40509, 2))

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(features, target, test_size=0.3, random_state=2)
features_train, features_test, target_train, target_test = split_parts
tuple(part.shape for part in split_parts)

((28356, 2), (12153, 2), (28356,), (12153,))

In [8]:
# Remove rows with missing values from each split.
# Features and target are filtered with ONE shared mask per split so that X and y
# always stay row-aligned. Calling dropna() on each object independently would
# drop different rows from each and silently desynchronise (or mis-size) the pair
# as soon as any NaN is present. The test target is filtered too, so the later
# evaluation on the test split compares arrays of equal length.
train_complete = features_train.notna().all(axis=1) & target_train.notna()
features_train = features_train[train_complete]
target_train = target_train[train_complete]

test_complete = features_test.notna().all(axis=1) & target_test.notna()
features_test = features_test[test_complete]
target_test = target_test[test_complete]

# Invariant: every feature row has exactly one matching target value.
assert len(features_train) == len(target_train)
assert len(features_test) == len(target_test)

features_train.shape, features_test.shape, target_train.shape

((28356, 2), (12153, 2), (28356,))

## 모델

### 목표변수에 따른 모델 선택 - 연속형

In [9]:
from sklearn.linear_model import LinearRegression
# Linear regression with default settings; chosen because the target ('age') is continuous.
model = LinearRegression()

In [10]:
# Train the model on the training split ('amount', 'overdue_count' -> 'age').
model.fit(features_train, target_train)

### 결측치 채우기

In [11]:
# Locate the rows whose 'age' is missing and collect their feature values.
age_is_missing = df_RCOCT['age'].isna()
missing_age_index = df_RCOCT[age_is_missing].index
missing_age_features = df_RCOCT.loc[
    missing_age_index,
    ['amount', 'overdue_count'],
]

In [12]:
# Predict 'age' for the rows where it is missing, using the fitted model.
predicted_age = model.predict(missing_age_features)

In [24]:
# Write the predictions back into the rows whose 'age' was missing.
# NOTE(review): the recorded R² for this model is ~0 on both splits, and the imputed
# values shown below are all close to 46.6 — confirm this imputation is acceptable.
df_RCOCT.loc[missing_age_index, 'age'] = predicted_age
# Display the imputed values for a quick visual check.
predicted_age

array([46.62089417, 46.53443319, 46.60030822, ..., 46.57560508,
       46.68517996, 46.57560508])

## 평가

In [19]:
from sklearn.metrics import r2_score
# Predictions on the training split.
target_train_predict = model.predict(features_train)
target_train.shape, target_train_predict.shape  # true labels vs. model predictions

((28356,), (28356,))

In [20]:
# R² score on the training split.
r2_score(target_train, target_train_predict)

4.722363321685297e-05

In [25]:
# Rounded training R², computed from the live variables instead of a literal pasted
# from an earlier output (a pasted value goes stale whenever data or model change).
# Four decimals are kept so the displayed value still carries some information.
round(r2_score(target_train, target_train_predict), 4)

0

In [26]:
# Predictions on the held-out test split.
# NOTE(review): despite its name, `target_train_predict` holds TEST-set predictions from
# this point on and overwrites the training-set predictions computed earlier. Re-running
# the training-set evaluation after this cell would pair arrays of different lengths.
# Consider a dedicated name for the test predictions.
target_train_predict = model.predict(features_test)
target_test.shape, target_train_predict.shape

((12153,), (12153,))

In [27]:
# R² score on the held-out test split.
# Predictions are recomputed from `features_test` right here, so the score does not
# depend on what `target_train_predict` currently holds. That name is assigned both
# training-set and test-set predictions in this notebook, so its content depends on
# the order in which cells were executed.
r2_score(target_test, model.predict(features_test))

-0.0002949132366203422

## 서비스
- 사용자 입력 시 주의 사항 : 학습 때 사용한 포맷을 그대로 유지

In [32]:
# Take the row labelled 5 as a sample "user input" for the service demo.
sample_row_label = 5
amount_value = df_RCOCT.at[sample_row_label, 'amount']
overdue_count_value = df_RCOCT.at[sample_row_label, 'overdue_count']
amount_value, overdue_count_value

(90900, 0)

In [35]:
# User input: one row with the columns ['amount', 'overdue_count'], in that order.
# The model was fitted on a DataFrame with these column names, so the input is passed
# in the same format rather than as a bare nested list. The values come from the
# variables looked up in the previous cell instead of being retyped by hand.
user_input = pd.DataFrame(
    [[amount_value, overdue_count_value]],
    columns=['amount', 'overdue_count'],
)
model.predict(user_input)



array([46.59207384])

In [37]:
# Save the model to a file so it can be reused.
import pickle  # serializes an in-memory object to a file

In [38]:
# Persist the fitted estimator so it can be loaded later without retraining.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
with open('RentalCarOfContactType.pkl', mode='wb') as model_file:
  pickle.dump(model, model_file)

In [39]:
# Export the frame (with 'age' filled in) without the index column; 'utf-8-sig' writes a BOM.
# NOTE(review): the file name starts with 'Letal', which looks like a typo for 'Rental' —
# confirm whether any downstream reader depends on this exact name before renaming.
df_RCOCT.to_csv('LetalCarOfContractType_age_nonnull.csv', index=False, encoding='utf-8-sig')