# Forest Cover Type ‚Äì Training Notebook
Algorithms: Logistic Regression, SVM, MLP Neural Network.

**Goal:** Achieve best accuracy using optimized model configurations.

## 1. Load Data

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv("covtype.csv")

print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())

# Separate features and target
X = df.drop(columns=["Cover_Type"])
y = df["Cover_Type"]

# Identify numerical columns (continuous features - need scaling)
numerical_cols = ['Elevation', 'Aspect', 'Slope', 
                  'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
                  'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
                  'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']

# Identify binary columns (already one-hot encoded - no scaling needed)
wilderness_cols = [col for col in X.columns if col.startswith('Wilderness_Area')]
soil_cols = [col for col in X.columns if col.startswith('Soil_Type')]
binary_cols = wilderness_cols + soil_cols

print(f"\nNumerical features ({len(numerical_cols)}): {numerical_cols}")
print(f"Binary features ({len(binary_cols)}): Wilderness_Area (4) + Soil_Type (40)")

# Check for missing values
print(f"\nMissing values: {df.isnull().sum().sum()}")

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create preprocessing pipeline - scale only numerical features
# Binary features (Wilderness_Area and Soil_Type) remain unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('bin', 'passthrough', binary_cols)
    ],
    remainder='drop'
)

# Fit and transform the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"\nTraining set size: {X_train_processed.shape[0]}")
print(f"Test set size: {X_test_processed.shape[0]}")
print(f"Number of features: {X_train_processed.shape[1]}")
print(f"Number of classes: {len(np.unique(y))}")
print(f"Class distribution:\n{y.value_counts().sort_index()}")


Dataset Shape: (581012, 55)

Column Names:
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40', 'Cover_Type']

Numerical features (10): ['Elevation', 'Asp

## 2. Logistic Regression

In [2]:
# Logistic Regression with optimized parameters
# Note: 'multi_class' parameter removed (deprecated in sklearn 1.3+)
# sklearn now automatically uses multinomial for multi-class problems
lr = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    n_jobs=-1,
    random_state=42
)
lr.fit(X_train_processed, y_train)
pred_lr = lr.predict(X_test_processed)

lr_accuracy = accuracy_score(y_test, pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print(classification_report(y_test, pred_lr))


Logistic Regression Accuracy: 0.7234
              precision    recall  f1-score   support

           1       0.71      0.70      0.70     42368
           2       0.75      0.80      0.77     56661
           3       0.68      0.80      0.73      7151
           4       0.60      0.43      0.50       549
           5       0.17      0.01      0.01      1899
           6       0.50      0.28      0.36      3473
           7       0.74      0.56      0.63      4102

    accuracy                           0.72    116203
   macro avg       0.59      0.51      0.53    116203
weighted avg       0.71      0.72      0.71    116203



## 3. Support Vector Machine (LinearSVC)

In [3]:
# LinearSVC with optimized parameters
svm = LinearSVC(
    C=1.0,
    max_iter=2000,
    dual=True,
    random_state=42
)
svm.fit(X_train_processed, y_train)
pred_svm = svm.predict(X_test_processed)

svm_accuracy = accuracy_score(y_test, pred_svm)
print(f"SVM (LinearSVC) Accuracy: {svm_accuracy:.4f}")
print(classification_report(y_test, pred_svm))


SVM (LinearSVC) Accuracy: 0.7114
              precision    recall  f1-score   support

           1       0.71      0.68      0.69     42368
           2       0.73      0.80      0.76     56661
           3       0.61      0.87      0.72      7151
           4       0.62      0.20      0.30       549
           5       0.56      0.01      0.02      1899
           6       0.43      0.06      0.10      3473
           7       0.68      0.51      0.58      4102

    accuracy                           0.71    116203
   macro avg       0.62      0.45      0.46    116203
weighted avg       0.70      0.71      0.70    116203



## 4. Neural Network (MLPClassifier)

In [4]:
# MLP Neural Network with optimized hyperparameters
# Based on grid search results from reference, (100, 100) hidden layers work best
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=300,
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=True
)

print("Training MLP Neural Network...")
mlp.fit(X_train_processed, y_train)
pred_mlp = mlp.predict(X_test_processed)

mlp_accuracy = accuracy_score(y_test, pred_mlp)
print(f"\nMLP Neural Network Accuracy: {mlp_accuracy:.4f}")
print(classification_report(y_test, pred_mlp))


Training MLP Neural Network...
Iteration 1, loss = 0.57007138
Validation score: 0.794518
Iteration 2, loss = 0.45201054
Validation score: 0.821970
Iteration 3, loss = 0.40961381
Validation score: 0.835632
Iteration 4, loss = 0.38298840
Validation score: 0.839827
Iteration 5, loss = 0.36257820
Validation score: 0.849207
Iteration 6, loss = 0.34666147
Validation score: 0.860373
Iteration 7, loss = 0.33391534
Validation score: 0.862546
Iteration 8, loss = 0.32384433
Validation score: 0.868247
Iteration 9, loss = 0.31554927
Validation score: 0.871517
Iteration 10, loss = 0.30730135
Validation score: 0.876035
Iteration 11, loss = 0.30156292
Validation score: 0.874852
Iteration 12, loss = 0.29512467
Validation score: 0.873217
Iteration 13, loss = 0.28954229
Validation score: 0.883953
Iteration 14, loss = 0.28478956
Validation score: 0.883114
Iteration 15, loss = 0.28021918
Validation score: 0.881694
Iteration 16, loss = 0.27637071
Validation score: 0.886341
Iteration 17, loss = 0.27239547
Va

## 6. Model Comparison Summary

In [11]:
# Model Comparison Summary
import pandas as pd

results = {
    'Model': ['Logistic Regression', 'SVM (LinearSVC)', 'MLP Neural Network'],
    'Accuracy': [lr_accuracy, svm_accuracy, mlp_accuracy]
}

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Accuracy', ascending=False).reset_index(drop=True)

print("=" * 50)
print("MODEL COMPARISON - FOREST COVER TYPE PREDICTION")
print("=" * 50)
print(results_df.to_string(index=False))
print("=" * 50)
print(f"\nüèÜ Best Model: {results_df.iloc[0]['Model']}")
print(f"   Best Accuracy: {results_df.iloc[0]['Accuracy']:.4f} ({results_df.iloc[0]['Accuracy']*100:.2f}%)")

MODEL COMPARISON - FOREST COVER TYPE PREDICTION
              Model  Accuracy
 MLP Neural Network  0.922050
Logistic Regression  0.723381
    SVM (LinearSVC)  0.711410

üèÜ Best Model: MLP Neural Network
   Best Accuracy: 0.9221 (92.21%)
