# 1. 분류_당뇨병_머신러닝

## (1) import

In [1]:
import pandas as pd

# 전처리
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 모델
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

# 예측
from sklearn.metrics import accuracy_score, classification_report

# 모델 저장
import pickle

## (2) 데이터 확인

In [3]:
# Directory prefix prepended to the CSV file names; an empty string leaves the bare file name.
folder_path = ""

In [None]:
# Load the training and test tables from the CSV files under folder_path.
train, test = (
    pd.read_csv(folder_path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Show the first rows of the training table, then of the test table.
for _preview in (train, test):
    display(_preview.head())

In [None]:
## Separate the features from the label
# The label is the "Outcome" column; every other column is a feature.
y = train["Outcome"]
X_train = train.drop("Outcome", axis="columns")
# Independent copy of the test table, so `test` itself stays as loaded.
X_test = test.copy()

## (3) 데이터 전처리

### 1. 레이블링 ( 문자 -> 숫자로 인코딩 )

In [None]:
# Encode the string-valued 'Sex' column as integer codes.
# The encoder is fitted on the training data only and then reused (transform
# without fit) on the test data, so both tables share one category -> code mapping.
encoder = LabelEncoder()
# Assign back into the column: rebinding X_train / X_test to the encoder output
# would replace the whole feature table with a single 1-D array.
X_train['Sex'] = encoder.fit_transform(X_train['Sex'])
X_test['Sex'] = encoder.transform(X_test['Sex'])

### 2. 스케일링 ( 표준화 )

In [None]:
# Standardize the features: the scaler is fitted on the training data only,
# then the same fitted scaler transforms both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## (4) 학습

In [None]:
# Split into training and validation parts (20% held out, fixed seed).
# NOTE: this rebinds X_test, so from here on X_test / y_test are the hold-out
# split of train.csv, not the preprocessed test.csv features.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Training: fit a logistic-regression classifier with default settings on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

## (5) 예측 및 평가

In [None]:
# Predict on the hold-out split and compare against its true labels.
y_pred = model.predict(X_test)

report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Print the overall accuracy followed by the per-class report.
print(f"모델 정확도: {accuracy}")
print("분류 보고서:\n", report)

## (6) test 데이터 예측

In [None]:
# Persist the fitted model with pickle.
# NOTE: the file content is a pickle despite the ".h5" extension, so it must be
# read back with pickle.load rather than an HDF5 reader.
filename = "01073002902_1.h5"

# The context manager closes the file handle even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Predict on the real test set.
# The model was trained on label-encoded, standardized features, so the raw
# test table has to pass through the same fitted encoder and scaler first.
# A copy is used so the `test` table itself keeps its original values.
X_submit = test.copy()
X_submit['Sex'] = encoder.transform(X_submit['Sex'])
pred = model.predict(scaler.transform(X_submit))

In [None]:
# Store the predictions in the test table as the "Outcome" column
test["Outcome"] = pred

## (7) 결과 제출

In [None]:
# Write the test table (now including the predictions) to the submission CSV, without the index column.
test.to_csv(path_or_buf="01073002902_1.csv", index=False)

# 2. 분류_타이타닉생존자_신경망_머신러닝

https://pinkwink.kr/1119 참고

## (1) import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 학습
from sklearn.model_selection import train_test_split

# 신경망 모델
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Reshape, Permute

# 머신러닝 모델
from sklearn.linear_model import LogisticRegression

# 평가
from sklearn.metrics import accuracy_score, classification_report

## (2) 데이터 확인

In [9]:
# Directory prefix prepended to the CSV file names; an empty string leaves the bare file name.
folder_path = ""

In [None]:
# Load the training and test tables from the CSV files under folder_path.
train, test = (
    pd.read_csv(folder_path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

## (2) 데이터 전처리

### 1. 문자 -> 숫자 인코딩

In [None]:
# Replace the 'sex' strings with numeric codes in both tables: female -> 1, male -> 0.
for _table in (train, test):
    _table['sex'] = _table['sex'].map({'female': 1, 'male': 0})

### 2. int -> float 형으로 변환 ( 이후 신경망 학습 단계에서 데이터를 float32 배열로 넘기므로, 수치형 컬럼을 미리 실수형으로 통일 )

In [None]:
# Cast the listed columns to float in the training table, then in the test table.
# Both tables are indexed with the same list, so each must contain all of these columns.
cols_to_float = ['survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare']
for _table in (train, test):
    _table[cols_to_float] = _table[cols_to_float].astype('float')

### 3. Null 값이 있는 행 삭제

In [None]:
# Drop every row that has a missing value in any of the listed columns, for both tables.
train, test = (
    table.dropna(subset=['age', 'sibsp', 'parch', 'fare']) for table in (train, test)
)

## (3) 학습

In [None]:
# Separate features and label in the training data (only the needed columns: pclass, sex, age, sibsp, parch, fare)
# NOTE(review): the columns are picked by position, so this assumes positions 0,3,4,5,6,8 hold those six columns — confirm against the CSV header
X_train = train.values[:, [0,3,4,5,6,8]]
y = train["survived"]

# NOTE(review): this rebinds `test` from a DataFrame to a NumPy array holding the same six positional columns
test = test.values[:, [0,3,4,5,6,8]]

In [None]:
# Hold out 10% of the rows for validation, with a fixed seed for a repeatable partition.
# From here on X_train / y_train hold only the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y,
    test_size=0.1,
    random_state=7,
)

### 1. 신경망

In [None]:
# Display the shapes of the four split arrays as a quick sanity check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Binary classifier: one hidden layer of 255 ReLU units followed by a single sigmoid output.
model = Sequential()
# The keyword must be spelled `input_shape`; the misspelled form is rejected by
# Keras as an unknown argument. The input width is the number of feature columns.
model.add(Dense(255, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# NOTE(review): 'mse' is kept from the original; 'binary_crossentropy' is the
# usual loss for a sigmoid output — consider switching.
model.compile(loss='mse', optimizer='Adam', metrics=['accuracy'])
model.summary()

In [None]:
# Convert all four split parts to float32 NumPy arrays.
X_train, y_train, X_test, y_test = (
    np.array(part, dtype=np.float32) for part in (X_train, y_train, X_test, y_test)
)

### 2. 머신러닝

In [None]:
# Re-create the train/validation split from the full feature matrix.
# X_train was already cut down to the training portion by the earlier split, so
# splitting it again together with the full-length `y` fails with a
# sample-count mismatch. The same positional columns are taken from `train`.
X_all = train.values[:, [0,3,4,5,6,8]].astype(float)
# Same test_size and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.1, random_state=7)

In [None]:
# Fit a logistic-regression classifier with default settings on the training split.
# NOTE(review): this rebinds `model`, replacing the Keras network built earlier;
# the neural-network cells and the machine-learning cells cannot share one session as written.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

## (4) 예측 및 평가

In [None]:
# Neural network: train for 500 epochs, evaluating on the hold-out split alongside training.
hist = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=500)

# Draw the loss and accuracy histories (training and validation) on one figure.
curve_names = ['loss', 'val_loss', 'accuracy', 'val_accuracy']
plt.figure(figsize=(12, 8))
for curve in curve_names:
    plt.plot(hist.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# Machine learning: predict on the hold-out split and compare against its true labels.
y_pred = model.predict(X_test)

report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Print the overall accuracy followed by the per-class report.
print(f"모델 정확도: {accuracy}")
print("분류 보고서:\n", report)

## (5) test 데이터 예측

In [None]:
# Neural network: predict every test row in one batched call.
# Calling model.predict once per row repeats the per-call overhead for each
# sample; a single call over the whole (n_rows, 6) matrix avoids that.
features = np.asarray(test, dtype=np.float32).reshape(-1, 6)
probabilities = model.predict(features)

# Threshold the sigmoid output at 0.5 to get one 0/1 label per row, as plain Python ints.
result = (probabilities.reshape(-1) >= 0.5).astype(int).tolist()

In [None]:
# `test` was rebound to a plain NumPy array when the features were extracted by
# position, and an array cannot take a named column, so rebuild a DataFrame first.
# Column names follow the feature list stated at the extraction step
# (pclass, sex, age, sibsp, parch, fare) — NOTE(review): confirm against the CSV header.
test = pd.DataFrame(test, columns=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare'])
test['result'] = result

In [None]:
# Machine learning: predict labels for the test features
y_pred = model.predict(test)
# Same as above..?
# NOTE(review): unlike the neural-network cells, these predictions are only kept in y_pred and are not attached to `test` for the submission file

## (6) 결과 제출

In [None]:
# Persist the model and write the submission file.
# NOTE(review): pickle fits the scikit-learn model; for the Keras network,
# model.save() is the usual way to produce an .h5 file — confirm which model is submitted.
# The context manager closes the file handle even if pickling raises.
with open('01073002902_1.h5', 'wb') as model_file:
    pickle.dump(model, model_file)

# Wrapping in pd.DataFrame keeps a DataFrame's columns and also covers the case
# where `test` is still the NumPy array from feature extraction, which has no to_csv.
pd.DataFrame(test).to_csv('01073002902_1.csv', index=False)

# 3. 회귀_당뇨병

## (1) import

In [None]:
import pandas as pd

# 학습
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# 평가
from sklearn.metrics import mean_squared_error
from math import sqrt

# 제출
import pickle

## (2) 데이터 확인

In [12]:
# Directory prefix prepended to the CSV file names; an empty string leaves the bare file name.
folder_path = ""

In [None]:
# Load the training and test tables from the CSV files under folder_path.
train, test = (
    pd.read_csv(folder_path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

## (3) 데이터 전처리

In [14]:
## Is the data already numerically encoded?
## Is the data already scaled?
## -> if so, skip this step

## (4) 학습

In [None]:
# Separate the features from the label in the training data: the label is
# the "target" column and every other column is a feature.
y = train["target"]
X_train = train.drop("target", axis="columns")
# Preview the first rows of the features, then of the label.
for _preview in (X_train, y):
    display(_preview.head())

In [None]:
# Hold out 30% of the rows for validation, with a fixed seed.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking as (X_train, y_train, X_test, y_test) would bind the validation
# features to y_train and the training labels to X_test.
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.3, random_state=42)

In [None]:
# Fit a linear regression on all feature columns of the training split.
multi_regressor = LinearRegression()
multi_regressor.fit(X=X_train, y=y_train)

## (5) 평가

In [None]:
# Predict the target for the hold-out split
multi_test_pred = multi_regressor.predict(X_test)

In [None]:
# The trained model has predicted the hold-out split (named "test" here, but it
# is really the validation set); compare those predictions with the true values
# to score the model.
# Arguments follow the documented (y_true, y_pred) order; the MSE value is the
# same either way because the error is squared.
multi_test_mse = mean_squared_error(y_test, multi_test_pred)

In [None]:
# Print the RMSE (square root of the MSE); save the model if it meets the target score
print(f"Multi Regression Test is {sqrt(multi_test_mse)}")

## (6) 결과 제출

In [None]:
# Persist the fitted regressor with pickle.
# NOTE: the file content is a pickle despite the ".h5" extension.
# The context manager closes the file handle even if pickling raises.
with open('01073002902_1.h5', 'wb') as model_file:
    pickle.dump(multi_regressor, model_file)

In [None]:
# Predict on the real test data (the table loaded from test.csv)
# NOTE(review): assumes test.csv has the same feature columns the model was trained on — confirm
multi_test_pred = multi_regressor.predict(test)

In [None]:
# Store the predictions as the 'target' column, then write the table to the
# submission CSV without the index column.
test['target'] = multi_test_pred
test.to_csv(path_or_buf='01073002902_1.csv', index=False)