<h1 style="text-align:center">Build and Evaluate Classification Models</h1>
<h2 style="text-align:center">Kaggle Submission- Quadratic Discriminant Analysis</h2>
<p style="text-align:center">Robert Evans</p>
<p style="text-align:center">School of Technology & Engineering, National University</p>
<p style="text-align:center">DDS-8555: Predictive Analysis</p>
<p style="text-align:center">Dr. Mohammad Yavarimanesh</p>
<p style="text-align:center">February 9, 2025</p>

## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

## Import Data

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

## Data Exploration

In [3]:
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation


## Data Preparation

In [5]:
# Define features and target variable
X = train.drop(columns=['id', 'NObeyesdad'])  # Exclude target column
y = train['NObeyesdad']  # Target column

In [6]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create a column transformer for one-hot encoding and scaling
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

# Create a pipeline with preprocessing and QDA
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', QuadraticDiscriminantAnalysis())
])

## Build The Model

In [7]:
# Perform 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Train the model
pipeline.fit(X_train, y_train)



## Evaluate the Model

In [9]:
# Predict the testing dataset
y_pred = pipeline.predict(X_test)

In [10]:
# Generate and display the summary report
print("Classification Report:")
print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.72      0.94      0.82       524
      Normal_Weight       0.76      0.50      0.60       626
     Obesity_Type_I       0.21      0.58      0.31       543
    Obesity_Type_II       0.76      0.97      0.86       657
   Obesity_Type_III       0.00      0.00      0.00       804
 Overweight_Level_I       0.72      0.30      0.43       484
Overweight_Level_II       0.49      0.51      0.50       514

           accuracy                           0.52      4152
          macro avg       0.52      0.54      0.50      4152
       weighted avg       0.50      0.52      0.48      4152



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Display accuracy score
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Accuracy Score: 0.5214354527938343


## Competition Submission

In [12]:
test_X = test.drop(columns=['id'])

In [13]:
test_y = pipeline.predict(test_X)



In [14]:
submission = pd.DataFrame({'id':test['id'], 'NObeyesdad':test_y})

In [15]:
submission.to_csv('submission.csv', index=False)