In [1]:
# 전처리 과정
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the movie metadata table.
df = pd.read_csv('../data/movie_info.csv')

# Load the four review/rating shards; df1..df4 stay bound as separate frames.
df1, df2, df3, df4 = map(pd.read_csv, [
    '../data/review-rating1.csv',
    '../data/review-rating2.csv',
    '../data/review-rating3.csv',
    '../data/review-rating4.csv',
])

# Stack the shards and drop every row that has a missing value in any column.
df5 = pd.concat([df1, df2, df3, df4]).dropna()

# Derive the production year from the release-date column ('개봉일').
# Rule (unchanged): take the text before the first "."; if that piece is still
# longer than four characters, take the text before its first "-" instead.
# This covers dot-separated and dash-separated date strings.
#
# The vectorized .str accessor replaces the per-row Python loop, so a missing
# release date propagates as NaN instead of raising AttributeError on a float.
# Single-character patterns are treated as literals by Series.str.split, so
# "." is not interpreted as a regex wildcard.
release_head = df['개봉일'].str.split('.').str[0]
df['제작연도'] = release_head.where(
    release_head.str.len() <= 4,          # NaN compares False -> falls through to NaN
    release_head.str.split('-').str[0],
)

# Columns kept for modelling: six numeric features plus the target rating.
numeric_cols = ['제작연도', '누적매출액', '누적관객수', '스크린수', '상영횟수', '평균평점', '실제평점']

# One chained pipeline:
#   1. right-join movie metadata onto the reviews (every row of df5 is kept),
#   2. drop rows whose cumulative-sales value is missing,
#   3. keep only the numeric columns,
#   4. cast the production year to int64.
df = (
    df.merge(df5, how='right', on='영화ID')
      .dropna(subset=['누적매출액'])
      .loc[:, numeric_cols]
      .astype({'제작연도': 'int64'})
)

# Persist the cleaned table as CSV (without the index column).
df.to_csv("../data/review_info.csv", index=False)

# Separate features from the target, then hold out 20% of the rows for testing.
feature_cols = ['제작연도', '누적매출액', '누적관객수', '스크린수', '상영횟수', '평균평점']
target_col = '실제평점'
X, y = df[feature_cols], df[target_col]

# The split is stratified on the target values and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [3]:
# 하이퍼파라미터 튜닝
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from keras import models
from keras import layers

# 모델 생성하는 함수
def create_network(optimizer="rmsprop"):
    """Build and compile a small fully connected network with one linear output.

    The input width is taken from the column count of the notebook-level
    ``X_train``, so ``X_train`` must be defined before this is called.

    Args:
        optimizer: Optimizer passed straight through to ``compile``.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MSE metric).
    """
    n_features = X_train.shape[1]
    stack = [
        layers.Dense(units=32, activation="relu", input_shape=(n_features,)),
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=1, activation="linear"),
    ]
    network = models.Sequential(stack)
    network.compile(optimizer=optimizer, loss='mse', metrics=["mse"])
    return network

# Wrap the Keras builder for scikit-learn.
# The network is a regressor (single linear output, MSE loss) and the search is
# scored with neg_mean_squared_error, so it has to be wrapped with
# KerasRegressor; KerasClassifier would treat the ratings as class labels.
from scikeras.wrappers import KerasRegressor

neural_network = KerasRegressor(model=create_network, verbose=0)

# Scaler and network live in one pipeline, so the scaler is refit on the
# training part of every CV fold.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', neural_network),  # step name kept so the 'clf__*' parameter keys stay the same
])

# Hyperparameter search space.
params = {
    'clf__epochs': [10, 15],
    'clf__batch_size': [32, 64, 128],
    'clf__optimizer': ["rmsprop", "adam"],
}

# Run the search on the training split only: X_test / y_test are used later for
# the final evaluation and must not influence hyperparameter selection.
grid = GridSearchCV(pipe, params, scoring='neg_mean_squared_error', cv=3)
grid_result = grid.fit(X_train, y_train)

# Report the best parameter combination and its (negated) mean squared error.
print(grid_result.best_params_)
print(grid_result.best_score_)

In [None]:
# 최적의 파라미터로 모델 마무리
from keras import models
from keras import layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

# Standardize the features the same way the tuning pipeline did; training on
# the raw columns would not match the setup the hyperparameters were tuned for.
# The scaler is fit on the training split only and reused for the test split.
# NOTE: the saved .h5 file does not contain the scaler, so any later inference
# must apply this same scaler before calling the model.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the model (same architecture as create_network, with named layers).
model = models.Sequential()
model.add(layers.Dense(units=32, activation="relu", input_shape=(X_train_scaled.shape[1],), name="Hidden-1"))
model.add(layers.Dense(units=16, activation="relu", name="Hidden-2"))
model.add(layers.Dense(units=1, activation="linear", name="Output"))
model.compile(loss='mse', optimizer="rmsprop", metrics=["mse"])
model.summary()

# Train: stop after 3 epochs without val_loss improvement and keep only the
# checkpoint with the lowest val_loss on disk.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3)
mc = ModelCheckpoint('../model/review_num.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
hist = model.fit(X_train_scaled, y_train, batch_size=32, epochs=10, validation_split=0.2, callbacks=[es, mc])

# Evaluate the best checkpoint on the held-out (scaled) test split.
loaded_model = load_model('../model/review_num.h5')
loaded_model.evaluate(X_test_scaled, y_test)

In [None]:
# 모델 학습 과정 표시
import matplotlib.pyplot as plt
# Draw training loss and validation loss per epoch on a single axes.
fig, ax = plt.subplots(figsize=(12, 8))
for curve in ('loss', 'val_loss'):
    ax.plot(hist.history[curve])
ax.legend(['loss', 'val_loss'])
ax.grid()
plt.show()