# 2유형 머신러닝

### 아답터 2유형 풀이
- https://www.youtube.com/watch?v=QtWhHCuVIxA

In [None]:
# Use print() to display output, e.g. print(df.head()).
# No working-directory setup (getcwd(), chdir()) is needed.
# Local drive paths (C: etc.) are not accessible from this environment.

import pandas as pd

train = pd.read_csv("data/customer_train.csv")
test = pd.read_csv("data/customer_test.csv")

# Practical-exam example: predict the total purchase amount.

# Preprocessing: separate the features from the target column.
X = train.drop(['총구매액'], axis=1)
y = train['총구매액']

# Stack train and test so one-hot encoding produces identical columns for
# both, then drop the member-ID column.
X_full = pd.concat([X, test], axis=0)
X_full = X_full.drop(['회원ID'], axis=1)

print(X_full.shape)

# Missing values: fill the refund amount with 0.
X_full['환불금액'] = X_full['환불금액'].fillna(0)

# Author's note: a further step is skipped for the random forest
# (presumably feature scaling -- TODO confirm).

# Encode categorical columns with get_dummies (one-hot).
X_full = pd.get_dummies(X_full)
print(X_full.shape)

# Split back by position: the first len(train) rows are the training rows.
X_train = X_full[:train.shape[0]]
X_test = X_full[train.shape[0]:]
print(X_train.shape, X_test.shape)

# Hold out 20% for validation. random_state is fixed so the split (and the
# reported score) is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=0
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Train and validate. The forest is seeded as well; without it the
# bootstrap/feature sampling differs per run and predictions keep changing.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)

# Evaluation on the validation split.
from sklearn.metrics import root_mean_squared_error, r2_score
rmse = root_mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
print(rmse, r2)

# Predict the test rows and write the single-column submission file.
y_pred = model.predict(X_test)
result = pd.DataFrame(y_pred, columns=['pred'])
result.to_csv('result.csv', index=False)

# Read the file back to check what was written.
result = pd.read_csv('result.csv')
print(result)
# Submission reference:
# the code below is only an example; adapt variable names as needed.
# pd.DataFrame변수.to_csv("result.csv", index=False)


### 퇴근후딴짓 2유형 풀이

In [None]:
# Use print() to display output, e.g. print(df.head()).
# No working-directory setup (getcwd(), chdir()) is needed.
# Local drive paths (C: etc.) are not accessible from this environment.

import pandas as pd

train = pd.read_csv("data/customer_train.csv")
test = pd.read_csv("data/customer_test.csv")

# Practical-exam example: predict the total purchase amount.

# EDA: show every column when printing wide frames.
pd.set_option('display.max_columns', None)

# Preprocessing: pull the target out of train, fill missing refunds with 0.
target = train.pop('총구매액')
train['환불금액'] = train['환불금액'].fillna(0)
test['환불금액'] = test['환불금액'].fillna(0)

# Option 1: label encoding. Author's note: the train categories of the
# main-product column include all test categories, so the frames do not
# need to be combined first.
from sklearn.preprocessing import LabelEncoder
cols = ['주구매상품', '주구매지점']
for col in cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    # transform() raises on a label that was not seen in train.
    test[col] = le.transform(test[col])


# Option 2: one-hot encoding. If train/test categories differ the frames
# must be combined first. (Author's workflow: try this first and fall back
# to option 1 if it errors.)
# df = pd.concat([train, test])
# df = pd.get_dummies(df)

# # After concat the frames have to be split apart again.
# train = df.iloc[:len(train)]
# test = df.iloc[len(train):]
# print(train.shape, test.shape)

# Validation split.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Train and evaluate. The forest is seeded so repeated runs give the same
# predictions; an unseeded forest resamples rows/features on every fit.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_val)

from sklearn.metrics import root_mean_squared_error, r2_score
print("rmse:", root_mean_squared_error(y_val, pred))
print("r2:", r2_score(y_val, pred))

# Submission: predict the test rows, write result.csv and read it back.
pred = model.predict(test)
submit = pd.DataFrame({'pred': pred})
submit.to_csv('result.csv', index=False)
print(pd.read_csv('result.csv'))

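- 레이블 인코딩, 원핫인코딩 모두 실행할 때마다 예측 값이 달라짐: RandomForestRegressor(및 train_test_split)에 random_state를 지정하지 않으면 학습이 매번 무작위로 진행되기 때문. random_state를 고정하면 같은 결과가 재현됨.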

### 1. 환자의 당뇨병 여부 예측
- 예측할 컬럼: Outcome 0: 정상, 1: 당뇨병
- 성능은 ROC-AUC 평가지표에 따라 채점

In [11]:

# Load the diabetes train/test CSVs from the bigdata_analyst_cert GitHub repository.
import pandas as pd 
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/diabetes_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/diabetes_test.csv")

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Exploratory analysis: the same report for train, then for test
# (shape, first row, dtypes, total number of missing values).
for label, frame in (("train:", train), ("test:", test)):
    print(label, frame.shape)
    print(frame.head(1))
    print(frame.info())

    print('결측치')
    print(frame.isnull().sum().sum())

# Class frequencies of the target.
print('==tartget 빈도==')
print(train['Outcome'].value_counts())

# Preprocessing: take the target column out of the feature frame.
target = train.pop('Outcome')

# Every feature is numeric and nothing is missing, so no label/one-hot
# encoding is needed; all-numeric data can be scaled instead.
# The scaler is fitted on train only and then applied to both frames.
scaler = MinMaxScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)
print('==분할된 데이터 크기==')
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Fit the model.
model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0)
model.fit(X_train, y_train)

# ROC-AUC is computed from probabilities, hence predict_proba.
pred = model.predict_proba(X_val)
print("예측:", pred[:5])

# Column 1 of predict_proba is passed as the score for ROC-AUC.
roc_auc = roc_auc_score(y_val, pred[:, 1])
print('roc-auc:', roc_auc)

# Score the test rows and build the single-column submission frame.
pred = model.predict_proba(test)
submit = pd.DataFrame({'pred': pred[:, 1]})
submit.head(5)

# submit.to_csv("result.csv", index=False)


train: (614, 9)
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            1      118             58             36       94  33.3   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.261   23        0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               614 non-null    int64  
 1   Glucose                   614 non-null    int64  
 2   BloodPressure             614 non-null    int64  
 3   SkinThickness             614 non-null    int64  
 4   Insulin                   614 non-null    int64  
 5   BMI                       614 non-null    float64
 6   DiabetesPedigreeFunction  614 non-null    float64
 7   Age                       614 non-null    int64  
 8   Outcome                   614 non-null    int64  
dtypes: float64(2), int64(7)
memory usage:

Unnamed: 0,pred
0,0.183228
1,0.253687
2,0.146948
3,0.060001
4,0.075731


### 2. 이직 여부 예측

In [21]:
# Load the HR train/test CSVs from the bigdata_analyst_cert GitHub repository.
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_test.csv")

In [22]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11750 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15012 non-null  object 
 6   education_level         14961 non-null  object 
 7   major_discipline        13045 non-null  object 
 8   experience              15272 non-null  object 
 9   company_size            10539 non-null  object 
 10  company_type            10383 non-null  object 
 11  last_new_job            14984 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [23]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3832 entries, 0 to 3831
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             3832 non-null   int64  
 1   city                    3832 non-null   object 
 2   city_development_index  3832 non-null   float64
 3   gender                  2900 non-null   object 
 4   relevent_experience     3832 non-null   object 
 5   enrolled_university     3760 non-null   object 
 6   education_level         3737 non-null   object 
 7   major_discipline        3300 non-null   object 
 8   experience              3821 non-null   object 
 9   company_size            2681 non-null   object 
 10  company_type            2635 non-null   object 
 11  last_new_job            3751 non-null   object 
 12  training_hours          3832 non-null   int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 389.3+ KB


In [24]:
# Goal: predict `target`. There are 10 categorical (object) columns and
# 4 numeric ones, and missing values are present.
# Count the distinct values per column with nunique().
train.nunique()

enrollee_id               15326
city                        123
city_development_index       93
gender                        3
relevent_experience           2
enrolled_university           3
education_level               5
major_discipline              6
experience                   22
company_size                  8
company_type                  6
last_new_job                  6
training_hours              241
target                        2
dtype: int64

In [25]:
# Distinct values per column in the test frame, for comparison with train.
test.nunique()

enrollee_id               3832
city                       113
city_development_index      87
gender                       3
relevent_experience          2
enrolled_university          3
education_level              5
major_discipline             6
experience                  22
company_size                 8
company_type                 6
last_new_job                 6
training_hours             235
dtype: int64

In [None]:
# There are categorical columns and train/test differ in their number of
# categories, so combine train and test, one-hot encode once, and split
# the result back into train and test.
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_test.csv")
target = train.pop('target')

# Missing values: replace them with the placeholder category 'X'.
train = train.fillna('X')
test = test.fillna('X')

# One-hot encode the combined frame so both parts share the same columns.
full = pd.concat([train, test])
full_dummies = pd.get_dummies(full)

# The first n_train rows of the combined frame are the training rows.
n_train = len(train)

train = full_dummies[:n_train]
test = full_dummies[n_train:]

# Validation split.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# Train and evaluate. The forest is seeded so the score and the submitted
# probabilities are reproducible from run to run.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

pred = model.predict_proba(X_val)
print(pred)

# ROC-AUC: predict_proba returns one probability column per class (two
# here); column 1 is used as the score.
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_val, pred[:, 1])
print('roc_auc:', roc_auc)

# Result: probabilities for the test rows as a single 'pred' column.
pred = model.predict_proba(test)
submit = pd.DataFrame({
    'pred': pred[:, 1]
})
print(submit.head(3))

[[0.65 0.35]
 [0.33 0.67]
 [0.74 0.26]
 ...
 [0.54 0.46]
 [0.96 0.04]
 [0.45 0.55]]
roc_auc: 0.7724341640093463
   pred
0  0.22
1  0.43
2  0.54


### 신용카드 신청자의 미래 신용 예측

In [None]:
# During EDA, check missing values, numeric vs. categorical columns, and class imbalance.
# Scored by the F1 metric.
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/creditcard_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/creditcard_test.csv")

In [28]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25519 entries, 0 to 25518
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25519 non-null  int64  
 1   CODE_GENDER          25519 non-null  object 
 2   FLAG_OWN_CAR         25519 non-null  object 
 3   FLAG_OWN_REALTY      25519 non-null  object 
 4   CNT_CHILDREN         25519 non-null  int64  
 5   AMT_INCOME_TOTAL     25519 non-null  float64
 6   NAME_INCOME_TYPE     25519 non-null  object 
 7   NAME_EDUCATION_TYPE  25519 non-null  object 
 8   NAME_FAMILY_STATUS   25519 non-null  object 
 9   NAME_HOUSING_TYPE    25519 non-null  object 
 10  DAYS_BIRTH           25519 non-null  int64  
 11  DAYS_EMPLOYED        25519 non-null  int64  
 12  FLAG_MOBIL           25519 non-null  int64  
 13  FLAG_WORK_PHONE      25519 non-null  int64  
 14  FLAG_PHONE           25519 non-null  int64  
 15  FLAG_EMAIL           25519 non-null 

In [29]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7591 entries, 0 to 7590
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   7591 non-null   int64  
 1   CODE_GENDER          7591 non-null   object 
 2   FLAG_OWN_CAR         7591 non-null   object 
 3   FLAG_OWN_REALTY      7591 non-null   object 
 4   CNT_CHILDREN         7591 non-null   int64  
 5   AMT_INCOME_TOTAL     7591 non-null   float64
 6   NAME_INCOME_TYPE     7591 non-null   object 
 7   NAME_EDUCATION_TYPE  7591 non-null   object 
 8   NAME_FAMILY_STATUS   7591 non-null   object 
 9   NAME_HOUSING_TYPE    7591 non-null   object 
 10  DAYS_BIRTH           7591 non-null   int64  
 11  DAYS_EMPLOYED        7591 non-null   int64  
 12  FLAG_MOBIL           7591 non-null   int64  
 13  FLAG_WORK_PHONE      7591 non-null   int64  
 14  FLAG_PHONE           7591 non-null   int64  
 15  FLAG_EMAIL           7591 non-null   i

In [33]:
# List the names exposed by sklearn.metrics (handy for looking up a metric function's name).
import sklearn.metrics
dir(sklearn.metrics)

['ConfusionMatrixDisplay',
 'PrecisionRecallDisplay',
 'RocCurveDisplay',
 'SCORERS',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_classification',
 '_pairwise_fast',
 '_plot',
 '_ranking',
 '_regression',
 '_scorer',
 'accuracy_score',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'auc',
 'average_precision_score',
 'balanced_accuracy_score',
 'brier_score_loss',
 'calinski_harabasz_score',
 'check_scoring',
 'classification_report',
 'cluster',
 'cohen_kappa_score',
 'completeness_score',
 'confusion_matrix',
 'consensus_score',
 'coverage_error',
 'davies_bouldin_score',
 'dcg_score',
 'euclidean_distances',
 'explained_variance_score',
 'f1_score',
 'fbeta_score',
 'fowlkes_mallows_score',
 'get_scorer',
 'hamming_loss',
 'hinge_loss',
 'homogeneity_completeness_v_measure',
 'homogeneity_score',
 'jaccard_score',
 'label_ranking_average_precision_score',
 'label_rank

In [36]:
# Frequency of each STATUS value (target class balance).
train['STATUS'].value_counts()

0    25085
1      434
Name: STATUS, dtype: int64

In [None]:
# Open question from the author: when a categorical column has different
# categories in train and test, must it be dropped, or is dropping only
# done for performance? One-hot encoding is the easiest way not to care.
# For every object column, report whether train and test hold the same
# set of values.
cols = train.select_dtypes(include='object').columns
for col in cols:
    set_train, set_test = set(train[col]), set(test[col])
    message = '카테고리 동일함' if set_train == set_test else '!!! 카테고리 동일하지 않음'
    print(col, message)
        


CODE_GENDER 카테고리 동일함
FLAG_OWN_CAR 카테고리 동일함
FLAG_OWN_REALTY 카테고리 동일함
NAME_INCOME_TYPE 카테고리 동일함
NAME_EDUCATION_TYPE 카테고리 동일함
NAME_FAMILY_STATUS 카테고리 동일함
NAME_HOUSING_TYPE 카테고리 동일함
OCCUPATION_TYPE !!! 카테고리 동일하지 않음


In [None]:
# Dropping the mismatched column does not seem to be necessary; try it out.
# During EDA, check missing values, numeric vs. categorical columns, and class imbalance.
# Scored by the F1 metric.
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/creditcard_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/creditcard_test.csv")

target = train.pop('STATUS')

# There are categorical columns, so one-hot encode with pd.get_dummies:
# combine train and test, encode once, then split them apart again so both
# parts end up with identical columns.
full = pd.concat([train, test])
full_dummies = pd.get_dummies(full)
n_train = len(train)
train = full_dummies[:n_train]
test = full_dummies[n_train:]
print(train.shape, test.shape)

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# Train and evaluate. The forest is seeded so the F1 score and the
# submitted labels do not change between runs.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_val)

# F1 is computed on hard class labels, hence predict() rather than predict_proba().
from sklearn.metrics import f1_score
print(f1_score(y_val, pred))

# Build the submission frame (writing the file is left commented out).
pred = model.predict(test)
submit = pd.DataFrame({'pred': pred})
#submit.to_csv('result.csv', index=False)

(25519, 55) (7591, 55)
0.2758620689655172


## 다중분류

### 신용 등급 예측
- f1-macro 지표에 따라 채점

In [None]:

# 1. Problem definition
# Metric: f1 macro
# target: Credit_Score
# Final file: result.csv (a single column, pred)

# 2. Load libraries and data
import pandas as pd

# Local-file variant:
# train = pd.read_csv("score_train.csv")
# test = pd.read_csv("score_test.csv")
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/score_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/score_test.csv")

In [46]:
# (rows, columns) of the training frame.
train.shape

(4198, 21)

In [47]:
# (rows, columns) of the test frame.
test.shape

(1499, 20)

In [48]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Delay_from_due_date       4198 non-null   float64
 1   Num_of_Delayed_Payment    4198 non-null   float64
 2   Num_Credit_Inquiries      4198 non-null   float64
 3   Credit_Utilization_Ratio  4198 non-null   float64
 4   Credit_History_Age        4198 non-null   float64
 5   Payment_of_Min_Amount     4198 non-null   object 
 6   Amount_invested_monthly   4198 non-null   float64
 7   Monthly_Balance           4198 non-null   float64
 8   Credit_Mix                4198 non-null   object 
 9   Payment_Behaviour         4198 non-null   object 
 10  Age                       4198 non-null   float64
 11  Annual_Income             4198 non-null   float64
 12  Num_Bank_Accounts         4198 non-null   float64
 13  Num_Credit_Card           4198 non-null   float64
 14  Interest

In [49]:
# Total number of missing values across the whole training frame.
train.isnull().sum().sum()

0

In [52]:
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/score_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/score_test.csv")
# The target, Credit_Score, has dtype object, i.e. it is categorical.
# It must therefore be excluded from feature encoding: one-hot encoding
# would split it into several columns. Label encoding the target would
# work, but the 0/1/2 codes would have to be mapped back to the original
# labels (good / standard / poor) before submission.
# The random forest classifier accepts string class labels directly, so
# the target is left unencoded here.

# Preprocessing: remove the target first so get_dummies only sees features.
target = train.pop('Credit_Score')

train = pd.get_dummies(train)
test = pd.get_dummies(test)
# train and test were encoded separately, so make test carry exactly the
# training columns in the same order: a category missing from test becomes
# an all-zero column and a test-only category is dropped. This is a no-op
# when both frames already have the same categories.
test = test.reindex(columns=train.columns, fill_value=0)

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# Seed the forest so the score and the submitted labels are reproducible.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_val)

# Multi-class F1 with macro averaging, as required by the task.
from sklearn.metrics import f1_score
f1 = f1_score(y_val, pred, average='macro')
print('f1:', f1)

# Build the submission frame (writing the file is left commented out).
pred = model.predict(test)
submit = pd.DataFrame({'pred': pred})
#submit.to_csv('result.csv', index=False)

f1: 0.6958813106865384


### 약물 종류 예측

In [54]:
import pandas as pd

train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/drug_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/drug_test.csv")

# 3. Exploratory data analysis (EDA)
print("===== 데이터 정보(자료형) =====")
print(train.info())

print("\n ===== train 결측치 수 =====")
print(train.isnull().sum().sum())

print("\n ===== test 결측치 수 =====")
print(test.isnull().sum().sum())

# Number of distinct values in each categorical column, train then test.
print("\n ===== train/test 카테고리별 수 =====")
print(train[['Sex', 'BP', 'Cholesterol']].nunique())
print(test[['Sex', 'BP', 'Cholesterol']].nunique())

print("\n ===== target 빈도 =====")
print(train['Drug'].value_counts())


# 4. Preprocessing
# One-hot encoding; the target is removed first so it is not encoded.
target = train.pop('Drug')
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# Equal category *counts* do not guarantee equal categories. Since the two
# frames are encoded separately, force test onto the training columns
# (missing ones filled with 0, extras dropped). No-op when they match.
test = test.reindex(columns=train.columns, fill_value=0)

# 5. Validation split
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# 6. Train and evaluate
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)

from sklearn.metrics import f1_score
f1 = f1_score(y_val, pred, average='macro')
print('\n f1-macro:', f1)

# 7. Predict the test rows and write the result file
pred = rf.predict(test)
submit = pd.DataFrame({'pred': pred})
submit.to_csv("result.csv", index=False)

===== 데이터 정보(자료형) =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          100 non-null    int64  
 1   Sex          100 non-null    object 
 2   BP           100 non-null    object 
 3   Cholesterol  100 non-null    object 
 4   Na_to_K      100 non-null    float64
 5   Drug         100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB
None

 ===== train 결측치 수 =====
0

 ===== test 결측치 수 =====
0

 ===== train/test 카테고리별 수 =====
Sex            2
BP             3
Cholesterol    2
dtype: int64
Sex            2
BP             3
Cholesterol    2
dtype: int64

 ===== target 빈도 =====
DrugY    41
drugX    34
drugA    13
drugB     8
drugC     4
Name: Drug, dtype: int64

 f1-macro: 1.0


## 회귀 연습문제

In [55]:
# Load the flight train/test CSVs from the bigdata_analyst_cert GitHub repository.
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch8/flight_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch8/flight_test.csv")

# Task: build a model on the training data that predicts the ticket price, apply it to the evaluation data and save the predictions as a CSV.
# Scored by RMSE.


# Summary statistics of the numeric columns.
train.describe()

Unnamed: 0,duration,days_left,price
count,10505.0,10505.0,10505.0
mean,12.225536,26.050547,20650.139838
std,7.182264,13.539947,22570.924117
min,0.83,1.0,1105.0
25%,6.75,15.0,4755.0
50%,11.25,26.0,7455.0
75%,16.17,38.0,42457.0
max,40.5,49.0,110936.0


In [61]:
# Baseline: the `flight` column is dropped because its categories differ
# between train and test.
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch8/flight_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch8/flight_test.csv")
target = train.pop('price')

# Drop the column whose categories differ.
train = train.drop('flight', axis=1)
test = test.drop('flight', axis=1)

# One-hot encoding. Encoding each frame separately (kept below for
# reference) was replaced by encoding the combined frame.
# train=pd.get_dummies(train)
# test=pd.get_dummies(test)

# Encode train and test together so both parts share the same columns,
# then split them apart again by position.
full = pd.concat([train, test])
full_dummies = pd.get_dummies(full)
n_train = len(train)
train = full_dummies[:n_train]
test = full_dummies[n_train:]
print(train.shape, test.shape)

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# Seed the forest so the RMSE and the submitted predictions are reproducible.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_val)

# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, so the root is taken
# explicitly; this works on every scikit-learn version.
from sklearn.metrics import mean_squared_error
result = mean_squared_error(y_val, pred) ** 0.5
print('rmse', result)

# Build the submission frame (writing the file is left commented out).
pred = model.predict(test)
submit = pd.DataFrame({
    'pred': pred
})
#submit.to_csv('result.csv', index=False)

(10505, 37) (4502, 37)
rmse 4349.574952340415


#### 중고차 가격 예측

In [62]:
# 1. Problem definition
# Metric: RMSLE
# target: Price
# Final file: result.csv (a single column, pred)

# 2. Load libraries and data
import pandas as pd
# Local-file variant:
# train = pd.read_csv("car_train.csv")
# test = pd.read_csv("car_test.csv")
train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch8/car_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch8/car_test.csv")

# 3. Exploratory data analysis (EDA)
print("===== 데이터 크기 =====")
print(train.shape, test.shape)

# First row of each frame.
for title, frame in (
    ("\n ===== train 데이터 샘플 =====", train),
    ("\n ===== test 데이터 샘플 =====", test),
):
    print(title)
    print(frame.head(1))

print("\n ===== 데이터 정보(자료형) =====")
print(train.info())

# Total number of missing values in each frame.
for title, frame in (
    ("\n ===== train 결측치 수 =====", train),
    ("\n ===== test 결측치 수 =====", test),
):
    print(title)
    print(frame.isnull().sum().sum())

# For every object column, report whether train and test contain the
# same set of values.
print("\n ===== 카테고리 비교 =====")
cols = train.select_dtypes(include='object').columns
for col in cols:
    set_train = set(train[col])
    set_test = set(test[col])
    same = (set_train == set_test)
    print(col, "\t카테고리 동일함" if same else "\t카테고리 동일하지 않음")

print("\n ===== target 기술 통계 =====")
print(train['Price'].describe())

===== 데이터 크기 =====
(6732, 17) (5772, 16)

 ===== train 데이터 샘플 =====
   Price Levy Manufacturer   Model  Prod. year Category Leather interior  \
0  13956  603        LEXUS  RX 450        2015     Jeep              Yes   

  Fuel type Engine volume    Mileage  Cylinders Gear box type Drive wheels  \
0    Hybrid           3.5  143619 km        6.0     Automatic          4x4   

    Doors       Wheel  Color  Airbags  
0  04-May  Left wheel  Black       12  

 ===== test 데이터 샘플 =====
  Levy Manufacturer   Model  Prod. year Category Leather interior Fuel type  \
0  730    SSANGYONG  Actyon        2016     Jeep              Yes    Petrol   

  Engine volume   Mileage  Cylinders Gear box type Drive wheels   Doors  \
0           1.6  70940 km        4.0     Automatic        Front  04-May   

        Wheel  Color  Airbags  
0  Left wheel  Black        4  

 ===== 데이터 정보(자료형) =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6732 entries, 0 to 6731
Data columns (total 17 columns):
 #   Colu

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separate the target from the features.
target = train.pop('Price')

# Label encoding: each object column is encoded on the combined
# train+test frame, so the encoder sees every category of both parts.
combined = pd.concat([train, test])
cols = train.select_dtypes(include='object').columns

for col in cols:
    le = LabelEncoder()
    le.fit(combined[col])
    combined[col] = le.transform(combined[col])

# Split the combined frame back by position: training rows come first.
n_train = len(train)
train, test = combined.iloc[:n_train], combined.iloc[n_train:]

# 5. Validation split
X_tr, X_val, y_tr, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# 6. Train and evaluate
rf = RandomForestRegressor(random_state=0)
rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)

# RMSLE: square root of the mean squared logarithmic error.
result = mean_squared_log_error(y_val, pred) ** 0.5
print('rmsle:', result)

# 7. Predict the test rows and write the result file
pred = rf.predict(test)
submit = pd.DataFrame(pred, columns=['pred'])
submit.to_csv("result.csv", index=False)

rmsle: 1.1008952910276844
