In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer            # SimpleImputer: 결측값을 특정 값(평균, 중앙값 등)으로 대체
from sklearn.preprocessing import OneHotEncoder


In [7]:
# Load datasets
train_path = './data/train.csv'
test_path = './data/test.csv'
sample_submission_path = './data/sample_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission_df = pd.read_csv(sample_submission_path)

In [8]:
train_df.head(5)

Unnamed: 0,id,현재가,전일비,액면가,시가총액,상장주식수,외국인비율,거래량,PER,ROE,label
0,0,2351.703,116.656,500.0,832.124,35392.0,1.658,2706392.61,19.116,6.5,0
1,1,11687.402,51.515,0.0,304.111,2600.0,0.49,50847.441,,,0
2,2,3100.0,0.0,500.0,969.0,31257.0,0.37,0.0,7.088,18.71,1
3,3,9408.961,254.445,0.0,292.195,3105.573,1.905,132966.463,,,0
4,4,2226.067,114.968,100.0,605.343,27191.0,2.551,522215.695,,,2


In [9]:
# Feature Engineering - Create new features or transformations if necessary

# def feature_engineering(df):
#     df = df.copy()
#     # Example: Create a feature that is the log of a numeric column (assuming 'price' exists)
#     if 'price' in df.columns:
#         df['log_price'] = np.log1p(df['price'])
#     return df

In [12]:
import numpy as np

def feature_engineering(df):
    df = df.copy()
    
    # Log transform for '현재가' (assuming '현재가' is the price)
    if '현재가' in df.columns:
        df['log_현재가'] = np.log1p(df['현재가'])
    
    # Log transform for '시가총액' (Market Cap) and '거래량' (Trading Volume)
    if '시가총액' in df.columns:
        df['log_시가총액'] = np.log1p(df['시가총액'])
    if '거래량' in df.columns:
        df['log_거래량'] = np.log1p(df['거래량'])
    
    # Impute missing values for 'PER' and 'ROE'
    for col in ['PER', 'ROE']:
        df[col] = df[col].fillna(df[col].median())
    
    # Create interaction features
    if '시가총액' in df.columns and '상장주식수' in df.columns:
        df['평균주가'] = df['시가총액'] / df['상장주식수']
    if '외국인비율' in df.columns and '거래량' in df.columns:
        df['외국인_거래량'] = df['외국인비율'] * df['거래량']
    
    # Create binary feature for missing 'PER' and 'ROE'
    df['PER_missing'] = df['PER'].isnull().astype(int)
    df['ROE_missing'] = df['ROE'].isnull().astype(int)
    
    return df

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

test_df.head(5)


Unnamed: 0,id,현재가,전일비,액면가,시가총액,상장주식수,외국인비율,거래량,PER,ROE,log_현재가,log_시가총액,log_거래량,평균주가,외국인_거래량,PER_missing,ROE_missing
0,0,65198.863,304.482,0.0,1212.443,1860.0,0.081,4793.17,7.922,4.38,11.085213,7.101217,8.475156,0.651851,388.2468,0,0
1,1,6406.595,171.084,500.0,3711.403,57931.0,1.209,282689.08,17.645,22.02,8.765239,8.219435,12.552106,0.064066,341771.1,0,0
2,2,4555.651,217.911,500.0,1596.002,35038.0,5.36,214691.924,-216.934,4.38,8.424343,7.375883,12.276964,0.045551,1150749.0,0,0
3,3,25048.754,490.409,5000.0,26874.7,107291.0,4.048,418903.868,-1.54,-117.79,10.128619,10.198978,12.945399,0.250484,1695723.0,0,0
4,4,6526.482,22.684,500.0,1165.662,17858.0,0.959,10646.274,-25.1,-1.96,8.783777,7.061902,9.273059,0.065274,10209.78,0,0


In [None]:
# Preprocessing
numeric_features = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist() 
numeric_features.remove('label')  # Exclude target column

categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()

In [13]:




# Define column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Split features and target
X = train_df.drop('label', axis=1)
y = train_df['label']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f'Best Parameters: {grid_search.best_params_}')

# Validation
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')

# Predictions on test set
test_predictions = best_model.predict(test_df)

# Prepare submission
submission_df = sample_submission_df.copy()
submission_df['label'] = test_predictions
submission_df['label'] = submission_df['label'].astype(int)

# Save submission
submission_df.to_csv('./data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Validation Accuracy: 0.6612244897959184
Submission file saved as 'submission.csv'
