<h1 style="text-align:center">Build and Evaluate Classification Models</h1>
<h2 style="text-align:center">Kaggle Submission- Support Vector Machines</h2>
<p style="text-align:center">Robert Evans</p>
<p style="text-align:center">School of Technology & Engineering, National University</p>
<p style="text-align:center">DDS-8555: Predictive Analysis</p>
<p style="text-align:center">Dr. Mohammad Yavarimanesh</p>
<p style="text-align:center">February 16, 2025</p>

## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

## Load Data

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

## Data Preparation

In [3]:
# Define features and target variable
X = train.drop(columns=['id', 'NObeyesdad'])  # Exclude target column
y = train['NObeyesdad']  # Target column

In [4]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
# Create a column transformer for one-hot encoding and scaling
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
])

# Create a pipeline with preprocessing and Support Vector Machine classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

## Build The Model

In [5]:
# Perform 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Train the model
pipeline.fit(X_train, y_train)

## Evaluate The Model

In [7]:
# Predict the testing dataset
y_pred = pipeline.predict(X_test)

In [8]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5754


In [9]:
print(classification_report(y_test, y_pred))

                     precision    recall  f1-score   support

Insufficient_Weight       0.62      0.52      0.57       524
      Normal_Weight       0.45      0.39      0.41       626
     Obesity_Type_I       0.45      0.52      0.48       543
    Obesity_Type_II       0.52      0.84      0.64       657
   Obesity_Type_III       0.81      1.00      0.89       804
 Overweight_Level_I       0.56      0.18      0.27       484
Overweight_Level_II       0.45      0.30      0.36       514

           accuracy                           0.58      4152
          macro avg       0.55      0.53      0.52      4152
       weighted avg       0.56      0.58      0.55      4152



## Competition Submission

In [10]:
test_X = test.drop(columns=['id'])

In [11]:
test_y = pipeline.predict(test_X)



In [12]:
submission = pd.DataFrame({'id':test['id'], 'NObeyesdad':test_y})

In [13]:
submission.to_csv('submission.csv', index=False)