<a href="https://colab.research.google.com/github/sgu20191816/gpt_dacon/blob/main/gpt_dacon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the cells
# below read and write files under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Load the training data, the test data and the submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/open/train.csv',
        '/content/drive/MyDrive/open/test.csv',
        '/content/drive/MyDrive/open/sample_submission.csv',
    )
)

# Target labels of the training rows.
y = train['label']

# Turn the documents into TF-IDF vectors (English stop words removed,
# unigrams + bigrams, vocabulary capped at 30000 terms). The vocabulary is
# fitted on the training text only and then reused for the test text.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,2),
    max_features=30000,
)
X = vectorizer.fit_transform(train['text'])
X_test = vectorizer.transform(test['text'])

# Split the training set into 10 stratified folds. shuffle=True with an
# integer random_state makes every call to skf.split(X, y) yield the same
# folds, which the later loops rely on.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Train one LinearSVC per fold and keep its predictions on the held-out part.
# svc_preds[i] holds the validation predictions of fold i, in skf.split order.
# After the loop, svc_model is the model fitted on the last fold.
svc_preds = []
for train_idx, val_idx in skf.split(X, y):
  # split() yields positional indices, so index the label Series with .iloc
  # instead of relying on its index labels happening to equal row positions.
  X_train, y_train = X[train_idx], y.iloc[train_idx]
  X_val = X[val_idx]
  svc_model = LinearSVC(C=0.1)
  svc_model.fit(X_train, y_train)
  svc_preds.append(svc_model.predict(X_val))

# Train one XGBoost model per fold and keep its predictions on the held-out
# part. xgb_preds[i] holds the validation predictions of fold i, in skf.split
# order. After the loop, xgb_model is the model fitted on the last fold.
xgb_preds = []
for train_idx, val_idx in skf.split(X, y):
  # split() yields positional indices, so index the label Series with .iloc
  # instead of relying on its index labels happening to equal row positions.
  X_train, y_train = X[train_idx], y.iloc[train_idx]
  X_val, y_val = X[val_idx], y.iloc[val_idx]
  # early_stopping_rounds is an estimator parameter: passing it to fit() was
  # deprecated in xgboost 1.6 and removed in 2.0, where it raises a TypeError.
  xgb_model = XGBClassifier(
      max_depth=6,
      n_estimators=200,
      learning_rate=0.05,
      objective='multi:softmax',
      early_stopping_rounds=50,
  )
  # The validation fold is listed last in eval_set because early stopping
  # monitors the last entry.
  xgb_model.fit(
      X_train,
      y_train,
      eval_set=[(X_train, y_train), (X_val, y_val)],
      verbose=2,
  )
  xgb_preds.append(xgb_model.predict(X_val))


In [None]:
# Build the meta-model training set from the out-of-fold predictions of the
# LinearSVC and XGBoost base models.
#
# svc_preds / xgb_preds hold one prediction array per fold (in skf.split
# order). Each is flattened into a single column with one row per training
# sample; wrapping the list of folds in a DataFrame directly would instead
# produce one row per fold, which does not line up with meta_y.
def _flatten_folds(fold_preds):
  """Concatenate per-fold prediction arrays into one Series, in fold order."""
  return pd.concat([pd.Series(p) for p in fold_preds], ignore_index=True)

meta_X = pd.DataFrame({
    'svc': _flatten_folds(svc_preds),
    'xgb': _flatten_folds(xgb_preds),
})
# True labels in the same fold order as the flattened predictions. skf was
# created with a fixed random_state, so this split reproduces the folds that
# the base-model loops iterated over.
meta_y = pd.concat(
    [y.iloc[val_idx] for _, val_idx in skf.split(X, y)],
    ignore_index=True,
)

# Train the meta model on the base models' predictions.
meta_model = XGBClassifier(max_depth=6, n_estimators=200, learning_rate=0.05, objective='multi:softmax')
meta_model.fit(meta_X, meta_y)

# Predict the test data with the meta model.
# NOTE(review): svc_model / xgb_model are the objects left bound by the fold
# loops, i.e. the models trained on the last fold only. Consider refitting
# the base models on the full training set before predicting the test set.
svc_test_preds = svc_model.predict(X_test)
xgb_test_preds = xgb_model.predict(X_test)
# Use the same column names as meta_X so the features line up by name.
meta_X_test = pd.DataFrame({
    'svc': svc_test_preds,
    'xgb': xgb_test_preds,
})
test_preds = meta_model.predict(meta_X_test)

# Write the submission file: put the meta-model predictions into the
# 'label' column of the template and save it as CSV without the index.
submission = submission.assign(label=test_preds)
submission.to_csv(
    '/content/drive/MyDrive/open/submission.csv',
    index=False,
)
