In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [4]:

import os

# Step 1: Load dataset correctly
def load_data(train_path, test_path):
    """Load the comma-separated training and test files as header-less frames.

    The number of columns to read is the smallest comma-separated field count
    found on any non-blank line of the training file. Both files are then read
    with exactly that many leading columns, so longer rows are truncated to a
    common width.

    Parameters
    ----------
    train_path : str or path-like
        Path of the training file; it is also the file used to detect the
        column count.
    test_path : str or path-like
        Path of the test file; it is read with the column count detected from
        the training file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` with integer column labels.

    Raises
    ------
    ValueError
        If the training file contains no non-blank lines.
    """
    # Stream the file instead of materialising every line, and ignore blank /
    # whitespace-only lines: they would otherwise count as a single field and
    # collapse the detected width to 1, even though read_csv skips them.
    with open(train_path, "r", encoding="utf-8") as file:
        expected_columns = min(
            (len(line.split(",")) for line in file if line.strip()),
            default=None,
        )
    if expected_columns is None:
        raise ValueError(f"No data rows found in training file: {train_path}")

    # Load both datasets restricted to the common leading columns.
    train_df = pd.read_csv(train_path, sep=",", header=None, usecols=range(expected_columns))
    test_df = pd.read_csv(test_path, sep=",", header=None, usecols=range(expected_columns))

    print("✅ Train and test datasets loaded successfully with", expected_columns, "columns.")
    return train_df, test_df


In [6]:
# Step 2: Preprocess dataset
def preprocess_data(train_df, test_df):
    """Coerce the measurement columns of both frames to numeric dtypes.

    Columns from position 2 onward of ``train_df`` and from position 1 onward
    of ``test_df`` are passed through ``pd.to_numeric(..., errors='coerce')``,
    so values that cannot be parsed become NaN. The leading columns are left
    untouched.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; its first two columns are kept as they are.
    test_df : pandas.DataFrame
        Test frame; its first column is kept as it is.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New ``(train_df, test_df)`` frames with the same index, column labels
        and column order. The input frames are not modified.
    """
    def _coerce_from(df, first_col):
        # Build a new frame rather than assigning through ``.iloc[:, k:] = ...``.
        # A slice assignment writes into the existing columns and can leave
        # them with ``object`` dtype, in which case a later numeric-dtype
        # selection would silently drop the converted columns.
        leading = df.iloc[:, :first_col]
        numeric = df.iloc[:, first_col:].apply(pd.to_numeric, errors='coerce')
        return pd.concat([leading, numeric], axis=1)

    return _coerce_from(train_df, 2), _coerce_from(test_df, 1)

In [8]:
# Step 3: Extract features
def extract_features(df, first_feature_col=2):
    """Summarise each row's numeric measurements as four statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from position ``first_feature_col`` onward hold
        the measurements. Columns in that range that do not have a numeric
        dtype are ignored.
    first_feature_col : int, default 2
        Position of the first measurement column. The default skips the first
        two columns of the frame.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns ``mean``, ``std``, ``min`` and
        ``max`` (in that order) and a fresh 0-based index. NaN values are
        skipped by the reductions; ``std`` is the sample standard deviation.
    """
    df_numeric = df.iloc[:, first_feature_col:].select_dtypes(include=[np.number])

    # Vectorised row-wise reductions: same pandas defaults (skipna, ddof=1) as
    # calling the Series methods on every row, without a Python-level call per row.
    feature_df = pd.DataFrame({
        'mean': df_numeric.mean(axis=1),
        'std': df_numeric.std(axis=1),
        'min': df_numeric.min(axis=1),
        'max': df_numeric.max(axis=1),
    })

    # Drop the source index so the result is always positionally indexed.
    return feature_df.reset_index(drop=True)


In [10]:
# Step 4: Train model
def train_model(X_train, y_train):
    """Fit a standardising scaler and a logistic-regression classifier.

    The scaler is fitted on ``X_train``; the classifier (at most 500 solver
    iterations) is then fitted on the scaled features and ``y_train``.

    Returns
    -------
    tuple
        ``(model, scaler)`` — the fitted classifier and the fitted scaler.
    """
    feature_scaler = StandardScaler()
    feature_scaler.fit(X_train)
    scaled_features = feature_scaler.transform(X_train)

    classifier = LogisticRegression(max_iter=500)
    classifier.fit(scaled_features, y_train)
    return classifier, feature_scaler


In [12]:
# Step 5: Evaluate model
def evaluate_model(model, scaler, X_val, y_val):
    """Score a fitted model on held-out data.

    ``X_val`` is transformed with the given, already fitted ``scaler`` before
    prediction.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` where ``f1`` is the weighted-average F1 score.
    """
    predicted_labels = model.predict(scaler.transform(X_val))
    return (
        accuracy_score(y_val, predicted_labels),
        f1_score(y_val, predicted_labels, average='weighted'),
    )


In [14]:
# Step 6: Main execution
# Input files, relative to the notebook's working directory.
train_path = 'upload/train.txt'
test_path = 'upload/test.txt'

# Read both files, then coerce their measurement columns to numbers.
train_df, test_df = load_data(train_path, test_path)
train_df, test_df = preprocess_data(train_df, test_df)

# Per-row summary features; the training labels are taken from column 1.
# NOTE(review): preprocess_data treats test columns from index 1 onward as
# measurements, while extract_features(test_df) starts at index 2 — confirm
# the test file layout.
X_train = extract_features(train_df)
y_train = train_df.iloc[:, 1]
X_test = extract_features(test_df)

# Keep 20% of the training rows aside for validation (seeded split).
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion, then score on the held-out portion.
model, scaler = train_model(X_train_split, y_train_split)
accuracy, f1 = evaluate_model(model, scaler, X_val, y_val)
print(f'✅ Model Accuracy: {accuracy:.4f}')
print(f'✅ Model F1-score: {f1:.4f}')

# Scale the test features with the scaler fitted on the training portion,
# then predict a label for every test row.
X_test_scaled = scaler.transform(X_test)
test_predictions = model.predict(X_test_scaled)

# Submission table: a 0-based positional row number plus the predicted label
# (labels are written exactly as the model returns them).
submission_df = pd.DataFrame(
    {
        'index': np.arange(len(test_predictions)),
        'label': test_predictions,
    }
)

# Write the submission without the DataFrame's own index column.
submission_df.to_csv("submission.csv", index=False)
print("✅ Submission file 'submission.csv' created successfully!")

# Show the first rows as a quick sanity check.
print("🔍 First few rows of submission file:\n", submission_df.head())

✅ Train and test datasets loaded successfully with 98 columns.
✅ Model Accuracy: 0.7441
✅ Model F1-score: 0.7414
✅ Submission file 'submission.csv' created successfully!
🔍 First few rows of submission file:
    index        label
0      0  cobblestone
1      1  cobblestone
2      2         dirt
3      3         dirt
4      4         dirt
