In [25]:
import pandas as pd

# Load data
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [26]:
# Check data
print(train_data.head())
print(test_data.head())
print(sample_submission.head())

   id        현재가      전일비    액면가     시가총액      상장주식수  외국인비율          거래량  \
0   0   2351.703  116.656  500.0  832.124  35392.000  1.658  2706392.610   
1   1  11687.402   51.515    0.0  304.111   2600.000  0.490    50847.441   
2   2   3100.000    0.000  500.0  969.000  31257.000  0.370        0.000   
3   3   9408.961  254.445    0.0  292.195   3105.573  1.905   132966.463   
4   4   2226.067  114.968  100.0  605.343  27191.000  2.551   522215.695   

      PER    ROE  label  
0  19.116   6.50      0  
1     NaN    NaN      0  
2   7.088  18.71      1  
3     NaN    NaN      0  
4     NaN    NaN      2  
   id        현재가      전일비     액면가       시가총액     상장주식수  외국인비율         거래량  \
0   0  65198.863  304.482     0.0   1212.443    1860.0  0.081    4793.170   
1   1   6406.595  171.084   500.0   3711.403   57931.0  1.209  282689.080   
2   2   4555.651  217.911   500.0   1596.002   35038.0  5.360  214691.924   
3   3  25048.754  490.409  5000.0  26874.700  107291.0  4.048  418903.868   
4 

In [27]:

# Preprocessing

# Check missing values
# print(train_data.isnull().sum())
# print(test_data.isnull().sum())

# Drop missing values
# train_data = train_data.dropna()
# test_data = test_data.dropna()

# 필요시 결측치 처리 (예: 평균값으로 채우기)
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

# Check missing values again
print(train_data.isnull().sum())
print(test_data.isnull().sum())

id       0
현재가      0
전일비      0
액면가      0
시가총액     0
상장주식수    0
외국인비율    0
거래량      0
PER      0
ROE      0
label    0
dtype: int64
id       0
현재가      0
전일비      0
액면가      0
시가총액     0
상장주식수    0
외국인비율    0
거래량      0
PER      0
ROE      0
dtype: int64


In [28]:
# Feature and Lable 분리
train_features = train_data.drop(columns=['label'], axis=1) # label은 예측할 값이라고 가정
train_labels = train_data['label']

In [29]:
# Data Normalization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_data)



## Step3 모델 선택 및 학습
Random Forest 모델을 사용해 예측 진행 

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Cross-validation
cv_scores = cross_val_score(model, train_features_scaled, train_labels, cv=5, scoring='neg_mean_absolute_error')

print(f'Cross-validated MSE: {-cv_scores.mean()}')

Cross-validated MSE: 0.8656897959183674


In [31]:
# 길이 확인
print(f"Lenght of test data: {len(test_data)}")
print(f"Lenght of predictions: {len(predictions)}")
print(f"Lenght of sample_submission: {len(sample_submission)}")

Lenght of test data: 1207
Lenght of predictions: 750
Lenght of sample_submission: 1207


In [32]:
# 필요시 test_data를 sample_submission의 길이에 맞춰 조정
if len(test_data) != len(predictions):
  test_data = test_data.head(len(sample_submission))

In [33]:
# 데이터 정규화 (변경된 test_data를 사용)
test_features_scaled = scaler.transform(test_data)

# Train the model
model.fit(train_features_scaled, train_labels)

# Predict the test data
predictions = model.predict(test_features_scaled)

In [34]:
# Save the predictions to a CSV file
sample_submission['label'] = predictions

In [35]:
# Save the predictions to a CSV file
sample_submission['label'] = predictions
sample_submission.to_csv('./data/submission.csv', index=False)

In [37]:
# MAE

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_features_scaled, train_labels, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
val_predictions = model.predict(X_val)

mae = mean_absolute_error(y_val, val_predictions)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.8518775510204082
