In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer            # SimpleImputer: 결측값을 특정 값(평균, 중앙값 등)으로 대체
from sklearn.preprocessing import OneHotEncoder


In [7]:
# Load datasets
train_path = './data/train.csv'
test_path = './data/test.csv'
sample_submission_path = './data/sample_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission_df = pd.read_csv(sample_submission_path)

test_df.head(2)

Unnamed: 0,id,현재가,전일비,액면가,시가총액,상장주식수,외국인비율,거래량,PER,ROE
0,0,65198.863,304.482,0.0,1212.443,1860.0,0.081,4793.17,,
1,1,6406.595,171.084,500.0,3711.403,57931.0,1.209,282689.08,17.645,22.02


In [8]:
# Preprocess data (handle missing values, select features)
train_df_cleaned = train_df.dropna()
test_df_cleaned = test_df.dropna()

test_df_cleaned.head(2)

Unnamed: 0,id,현재가,전일비,액면가,시가총액,상장주식수,외국인비율,거래량,PER,ROE
1,1,6406.595,171.084,500.0,3711.403,57931.0,1.209,282689.08,17.645,22.02
3,3,25048.754,490.409,5000.0,26874.7,107291.0,4.048,418903.868,-1.54,-117.79


In [9]:
# Feature Engineering - Create new features or transformations if necessary

# def feature_engineering(df):
#     df = df.copy()
#     # Example: Create a feature that is the log of a numeric column (assuming 'price' exists)
#     if 'price' in df.columns:
#         df['log_price'] = np.log1p(df['price'])
#     return df

# Feature Engineering - Create new features or transformations if necessary
def feature_engineering(df):
    df = df.copy()
    # Example: Create a feature that is the log of a numeric column (assuming '현재가' exists)
    if '현재가' in df.columns:
        df['log_현재가'] = np.log1p(df['현재가'])
    return df


train_df = feature_engineering(train_df_cleaned)
test_df = feature_engineering(test_df_cleaned)

test_df.head(5)

Unnamed: 0,id,현재가,전일비,액면가,시가총액,상장주식수,외국인비율,거래량,PER,ROE,log_현재가
1,1,6406.595,171.084,500.0,3711.403,57931.0,1.209,282689.08,17.645,22.02,8.765239
3,3,25048.754,490.409,5000.0,26874.7,107291.0,4.048,418903.868,-1.54,-117.79,10.128619
4,4,6526.482,22.684,500.0,1165.662,17858.0,0.959,10646.274,-25.1,-1.96,8.783777
5,5,7709.72,54.738,500.0,662.718,8600.0,0.27,4675.547,10.066,3.85,8.950367
9,9,4430.173,27.614,500.0,632.812,14289.0,1.587,59864.262,8.67,9.83,8.39642


In [10]:
# Preprocessing
numeric_features = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist() 
numeric_features.remove('label')  # Exclude target column

categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()

In [11]:
# Define column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

In [12]:
## Split features and target
# selected_features = ['현재가', '액면가', '상장주식수', '외국인비율', '거래량']
# X = train_df_cleaned[selected_features]
# X = train_df.drop('label', axis=1)
X = train_df.drop(columns=['id', 'label'])
y = train_df['label']

# from sklearn.preprocessing import MinMaxScaler
## Apply Min-Max Scaling
# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)
# X_test_scaled = scaler.transform(test_df_cleaned[selected_features])


## Split into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [13]:
# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f'Best Parameters: {grid_search.best_params_}')

ValueError: 
All the 540 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'id'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_indexing.py", line 361, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'id'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 472, in fit
    Xt = self._fit(X, y, routed_params)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 409, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 1329, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/compose/_column_transformer.py", line 969, in fit_transform
    self._validate_column_callables(X)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/compose/_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/Users/sangjilee/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_indexing.py", line 369, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [None]:
# Validation
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')

# Predictions on test set
test_predictions = best_model.predict(test_df)

Validation Accuracy: 0.6314102564102564


In [None]:
print(test_df.shape)
print(sample_submission_df.shape)

NameError: name 'test_df' is not defined

In [None]:
# Prepare submission
submission_df = sample_submission_df.copy()
submission_df['label'] = test_predictions
submission_df['label'] = submission_df['label'].astype(int)

In [None]:
# Save submission
submission_df.to_csv('/content/drive/MyDrive/data/submission.csv', index=False)
print("Submission file saved as 'submission_3rd(2).csv'")

Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Validation Accuracy: 0.6314102564102564


ValueError: Length of values (750) does not match length of index (1207)