In [3]:
# import pandas as pd

# # Load datasets
# train_path = './data/train.csv'
# test_path = './data/test.csv'
# sample_submission_path = './data/sample_submission.csv'

# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)
# sample_submission_df = pd.read_csv(sample_submission_path)

# # Display the first few rows of each dataset
# print("Train DataFrame:")
# print(train_df.head())
# print("\nTest DataFrame:")
# print(test_df.head())
# print("\nSample Submission DataFrame:")
# print(sample_submission_df.head())

In [4]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Load datasets
train_path, test_path, sample_submission_path = (
    './data/train.csv',
    './data/test.csv',
    './data/sample_submission.csv',
)

# Read the three CSV files in the same order as the paths above
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

# Feature Engineering - Create new features or transformations if necessary
def feature_engineering(df):
    """Return a copy of ``df`` with derived feature columns added.

    Adds ``log_price`` (``log1p`` of ``price``) when a ``price`` column is
    present. A frame without that column comes back as an unmodified copy.
    The input frame itself is never mutated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to derive features from.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus any derived ones.
    """
    df = df.copy()
    # Example: Create a feature that is the log of a numeric column (assuming 'price' exists)
    if 'price' in df.columns:
        # log1p(x) == log(1 + x), so a price of 0 maps to 0 instead of -inf
        df['log_price'] = np.log1p(df['price'])
    return df

# Apply the same feature engineering to both frames so they expose the same derived columns
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# Split features and target
# The target is separated first so the feature lists below are derived from X and
# can never contain 'label', whatever dtype the label column has.
X = train_df.drop('label', axis=1)
y = train_df['label']

# Preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Fail fast: the column transformer selects these columns by name, so a column
# missing from test_df would otherwise only surface at prediction time, after
# the whole hyperparameter search has already run.
missing_in_test = [
    col for col in numeric_features + categorical_features
    if col not in test_df.columns
]
if missing_in_test:
    raise ValueError(f"test_df is missing feature columns: {missing_in_test}")

# Define column transformer
# Numeric columns: mean-impute then standardise.
# Categorical columns: mode-impute then one-hot encode; categories unseen during
# fit are ignored (all-zero row) instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Split into training and validation sets (80/20, stratified on the label, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Model pipeline: preprocessing followed by a random forest classifier
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
]
model = Pipeline(steps=pipeline_steps)

# Hyperparameter tuning
# Keys follow the '<step name>__<parameter>' form, so every entry targets the
# 'classifier' step of the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search: 3 * 4 * 3 * 3 = 108 candidates, each scored with 5-fold CV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f'Best Parameters: {grid_search.best_params_}')

# Validation: score the selected pipeline on the held-out split
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')

# Predictions on test set
test_predictions = best_model.predict(test_df)

# Prepare submission: start from the sample layout, fill in the predicted
# labels and store them as integers. assign() returns a new frame, so
# sample_submission_df itself is left untouched.
submission_df = (
    sample_submission_df
    .assign(label=test_predictions)
    .astype({'label': int})
)

# Save submission
submission_df.to_csv('./data/submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Validation Accuracy: 0.6693877551020408
Submission file saved as 'submission.csv'
