In [4]:
import pandas as pd

# train_data = pd.read_csv('./data/train.csv')
# test_data = pd.read_csv('./data/test.csv')
# sample_submission = pd.read_csv('./data/sample_submission.csv')

# Load datasets
train_path = './data/train.csv'
test_path = './data/test.csv'
sample_submission_path = './data/sample_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission_df = pd.read_csv(sample_submission_path)

# Display the first few rows of each dataset
print("Train DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())
print("\nSample Submission DataFrame:")
print(sample_submission_df.head())

Train DataFrame:
   id        현재가      전일비    액면가     시가총액      상장주식수  외국인비율          거래량  \
0   0   2351.703  116.656  500.0  832.124  35392.000  1.658  2706392.610   
1   1  11687.402   51.515    0.0  304.111   2600.000  0.490    50847.441   
2   2   3100.000    0.000  500.0  969.000  31257.000  0.370        0.000   
3   3   9408.961  254.445    0.0  292.195   3105.573  1.905   132966.463   
4   4   2226.067  114.968  100.0  605.343  27191.000  2.551   522215.695   

      PER    ROE  label  
0  19.116   6.50      0  
1     NaN    NaN      0  
2   7.088  18.71      1  
3     NaN    NaN      0  
4     NaN    NaN      2  

Test DataFrame:
   id        현재가      전일비     액면가       시가총액     상장주식수  외국인비율         거래량  \
0   0  65198.863  304.482     0.0   1212.443    1860.0  0.081    4793.170   
1   1   6406.595  171.084   500.0   3711.403   57931.0  1.209  282689.080   
2   2   4555.651  217.911   500.0   1596.002   35038.0  5.360  214691.924   
3   3  25048.754  490.409  5000.0  26874.700 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Preprocessing
def preprocess_data(df):
    df = df.copy()
    # Handle missing values
    df = df.fillna(df.mean())
    # Convert categorical columns to numerical if there are any (assuming 'label' is the target)
    if 'label' in df.columns:
        df = pd.get_dummies(df, columns=[col for col in df.columns if df[col].dtype == 'object' and col != 'label'])
    else:
        df = pd.get_dummies(df, columns=[col for col in df.columns if df[col].dtype == 'object'])
    return df

train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# Split features and target
X = train_df.drop('label', axis=1)
y = train_df['label']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Validation
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')

# Predictions on test set
test_predictions = model.predict(test_df)

# Prepare submission
submission_df = sample_submission_df.copy()
submission_df['label'] = test_predictions
submission_df['label'] = submission_df['label'].astype(int)

# Save submission
submission_df.to_csv('./data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Validation Accuracy: 0.6285714285714286
Submission file saved as 'submission.csv'
