# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Load the datasets
# Both CSV files are read from the competition's Kaggle input directory.
train_path = '../input/playground-series-s6e1/train.csv'
test_path = '../input/playground-series-s6e1/test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# 2. Check column names for both train and test data
# Print the two column indexes one after the other so they can be compared.
for heading, frame in (
    ("Train Data Columns:", train_data),
    ("Test Data Columns:", test_data),
):
    print(heading, frame.columns)

# 3. Identify categorical columns (ignore 'id' column)
# Start from every object-dtype column in the training frame, then remove
# 'id' if it happens to be among them (errors='ignore' makes that a no-op
# when it is not).
object_columns = train_data.select_dtypes(include=['object']).columns
categorical_columns = object_columns.drop('id', errors='ignore')
print(f"Categorical columns: {categorical_columns}")

# 4. Label Encoding for categorical features
# One encoder instance is reused: it is re-fitted on each training column and
# immediately applied to the matching test column, so both frames share the
# same label -> integer mapping for that column.
label_encoder = LabelEncoder()
for column in categorical_columns:
    encoded_train = label_encoder.fit_transform(train_data[column])
    train_data[column] = encoded_train
    encoded_test = label_encoder.transform(test_data[column])
    test_data[column] = encoded_test

# 5. Split data into features (X) and target (y)
# The target is always excluded from the features; 'id' is excluded as well
# whenever the training frame has such a column.
y = train_data['exam_score']
non_feature_columns = ['exam_score']
if 'id' in train_data.columns:
    non_feature_columns = ['id'] + non_feature_columns
X = train_data.drop(columns=non_feature_columns)

# 6. Feature Scaling (Standardizing the features)
# The fitted scaler is kept so the same transformation can be applied to the
# test features later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 7. Train-test split (for evaluation purposes)
# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# 8. Initialize models
# Candidate regressors, keyed by the display name used in the RMSE report.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42)
}

# 9. Train models and evaluate them using RMSE
# Every model's validation RMSE is recorded so that the selection below is
# driven by the measured scores instead of an assumption.
validation_rmse = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    validation_rmse[model_name] = rmse
    print(f"{model_name} RMSE: {rmse}")

# 10. Choose the best model: the one with the lowest validation RMSE
# (on a tie, the model listed first in `models` wins).
best_model_name = min(validation_rmse, key=validation_rmse.get)
best_model = models[best_model_name]
print(f"Best model: {best_model_name}")

# 11. Train the final model on the full training data
# Refit on all rows (training + validation) before predicting on the test set.
best_model.fit(X_scaled, y)

# 12. Prepare the test data for predictions
# Select exactly the columns the scaler and model were fitted on, in the same
# order. This leaves 'id' out of the feature matrix when it is present and
# fails with a clear KeyError if the test file lacks a training feature.
X_test = test_data[X.columns]

X_test_scaled = scaler.transform(X_test)

# 13. Generate predictions
test_predictions = best_model.predict(X_test_scaled)

# 14. Prepare the submission file
# Use the 'id' column when the test file has one. Otherwise fall back to the
# row index so a missing 'id' does not raise after the predictions have
# already been computed.
if 'id' in test_data.columns:
    submission_ids = test_data['id']
else:
    print("Warning: 'id' column not found in test data; using the row index as id.")
    submission_ids = test_data.index

submission = pd.DataFrame({
    'id': submission_ids,
    'exam_score': test_predictions
})

# 15. Save the submission file (ensure the path is correct for Kaggle)
# Written to the current working directory without the DataFrame index.
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")
