In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.neural_network import MLPClassifier

In [None]:
# 1. Load and Explore the Dataset
data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/train.csv')
# Replace with your dataset filename
print("Dataset Head:\n", data.head())
print("\nDataset Info:\n")
data.info()

Dataset Head:
    ID   Age  Gender  Height  Weight        CALC FAVC  FCVC  NCP  SCC SMOKE  \
0   1  21.0  Female    1.62    64.0          no   no   2.0  3.0   no    no   
1   2  21.0     NaN    1.52    56.0   Sometimes   no   3.0  3.0  yes   yes   
2   3   NaN    Male     NaN    77.0  Frequently   no   2.0  3.0   no    no   
3   4  27.0     NaN     NaN    87.0  Frequently   no   3.0  3.0   no   NaN   
4   5  22.0    Male    1.78    89.8   Sometimes   no   2.0  1.0   no    no   

   CH2O family_history_with_overweight  FAF  TUE       CAEC  \
0   2.0                            yes  0.0  1.0  Sometimes   
1   3.0                            yes  3.0  NaN  Sometimes   
2   2.0                            yes  2.0  NaN  Sometimes   
3   2.0                            NaN  2.0  0.0        NaN   
4   2.0                             no  0.0  0.0  Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_We

In [None]:
# 2. Data Preprocessing
# Handle missing values
# Select only numerical features for imputation
numerical_features = data.select_dtypes(include=np.number).columns

# Apply imputation to numerical features
imputer = SimpleImputer(strategy='mean')
data[numerical_features] = imputer.fit_transform(data[numerical_features])


In [None]:
# Now proceed with encoding categorical variables
label_encoders = {}
for name in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[name] = le.fit_transform(data[name])
    label_encoders[name] = le

In [None]:
# Standardize numerical features
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

print("\nProcessed Data Sample:\n", data_scaled.head())


Processed Data Sample:
          ID       Age    Gender        Height    Weight      CALC      FAVC  \
0 -1.730491 -0.345205 -1.211866 -7.843689e-01 -0.286452  0.951236 -2.030962   
1 -1.727370 -0.345205  1.265430 -1.947196e+00 -0.860371 -0.561475 -2.030962   
2 -1.724249  0.000000  0.026782  2.581996e-15  0.646165 -2.074186 -2.030962   
3 -1.721129  0.762081  1.265430  2.581996e-15  1.363563 -2.074186 -2.030962   
4 -1.718008 -0.160658  0.026782  1.076155e+00  1.564434 -0.561475 -2.030962   

       FCVC       NCP       SCC     SMOKE      CH2O  \
0 -0.617439  0.369811 -0.307692 -0.680188  0.073697   
1  1.200034  0.369811  3.250000  0.414027  1.658611   
2 -0.617439  0.369811 -0.307692 -0.680188  0.073697   
3  1.200034  0.369811 -0.307692  1.508242  0.073697   
4 -0.617439 -1.842067 -0.307692 -0.680188  0.073697   

   family_history_with_overweight       FAF       TUE      CAEC    MTRANS  \
0                        0.335919 -1.232457  0.499487 -0.345632  0.131353   
1              

In [None]:
# 3. Data Visualization
# Calculate statistics
mean_vals = data_scaled.mean()
median_vals = data_scaled.median()
std_dev_vals = data_scaled.std()
var_vals = data_scaled.var()
print("\nStatistics:\nMean:\n", mean_vals)
print("\nMedian:\n", median_vals)
print("\nStandard Deviation:\n", std_dev_vals)
print("\nVariance:\n", var_vals)


Statistics:
Mean:
 ID                                0.000000e+00
Age                               3.712746e-16
Gender                           -1.280257e-17
Height                            2.093220e-15
Weight                           -2.816566e-16
CALC                              5.921189e-17
FAVC                             -3.200643e-17
FCVC                             -4.160836e-17
NCP                               5.505106e-16
SCC                              -3.200643e-17
SMOKE                             4.800964e-17
CH2O                             -1.440289e-16
family_history_with_overweight    3.840772e-17
FAF                               1.280257e-17
TUE                              -2.560514e-17
CAEC                             -1.664334e-16
MTRANS                           -5.761157e-17
NObeyesdad                        5.121029e-17
dtype: float64

Median:
 ID                                0.000000e+00
Age                               0.000000e+00
Gender         

In [None]:
# 5. Classification
# Assuming 'NObeyesdad' is the target column based on later code
X = data_scaled.drop(columns=['NObeyesdad'])
y = data_scaled['NObeyesdad']

# Convert y to discrete values if it's continuous
if pd.api.types.is_numeric_dtype(y):
    # If continuous, convert to discrete using a threshold or binning strategy
    # For example, using a threshold of 0:
    y = (y > 0).astype(int)  # Values above 0 become 1, others become 0
    # Alternatively, use pd.qcut for binning into quantiles

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 5. Classification
# Check the actual column names in your DataFrame
print(data_scaled.columns)

# Assuming the target column is named 'NObeyesdad', adjust accordingly
X = data_scaled.drop(columns=['NObeyesdad'])
# Replace 'target' with the actual target column name
y = data_scaled['NObeyesdad']

# Convert y to discrete values if it's continuous
# Check if y has continuous values
if pd.api.types.is_numeric_dtype(y):
    # If continuous, convert to discrete using a threshold or binning strategy
    # For example, using a threshold of 0:
    y = (y > 0).astype(int)  # Values above 0 become 1, others become 0
    # Alternatively, use pd.qcut for binning into quantiles

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Index(['ID', 'Age', 'Gender', 'Height', 'Weight', 'CALC', 'FAVC', 'FCVC',
       'NCP', 'SCC', 'SMOKE', 'CH2O', 'family_history_with_overweight', 'FAF',
       'TUE', 'CAEC', 'MTRANS', 'NObeyesdad'],
      dtype='object')


In [None]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_preds))


Decision Tree Accuracy: 0.8198198198198198


In [None]:
y_test# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))


Logistic Regression Accuracy: 0.8603603603603603


In [None]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))


Random Forest Accuracy: 0.9144144144144144


In [None]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_preds = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_preds))


Naive Bayes Accuracy: 0.8153153153153153


In [None]:
# Cross Validation
cv_scores = cross_val_score(LogisticRegression(), X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Average Cross-Validation Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.58108108 0.81531532 0.8963964  0.94594595 0.9009009 ]
Average Cross-Validation Score: 0.8279279279279279


In [None]:
# 7. Neural Network
nn = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)
nn.fit(X_train, y_train)
nn_preds = nn.predict(X_test)
print("Neural Network Accuracy:", accuracy_score(y_test, nn_preds))
print("\nClassification Report:\n", classification_report(y_test, nn_preds))


Neural Network Accuracy: 0.8468468468468469

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87       129
           1       0.82      0.81      0.82        93

    accuracy                           0.85       222
   macro avg       0.84      0.84      0.84       222
weighted avg       0.85      0.85      0.85       222



In [None]:
# Result Analysis
results = {
    "Model": ["Decision Tree", "Logistic Regression", "Random Forest", "Naive Bayes", "Neural Network"], # Removed 'SVM' from Model list
    "Accuracy": [
        accuracy_score(y_test, dt_preds),
        accuracy_score(y_test, lr_preds),
        accuracy_score(y_test, rf_preds),
        accuracy_score(y_test, nb_preds),
        accuracy_score(y_test, nn_preds),
    ]
}
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:\n", results_df)


Model Performance Comparison:
                  Model  Accuracy
0        Decision Tree  0.819820
1  Logistic Regression  0.860360
2        Random Forest  0.914414
3          Naive Bayes  0.815315
4       Neural Network  0.846847


In [None]:
import pandas as pd
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest Model:\n", best_model)


Best Model:
 Model       Random Forest
Accuracy         0.914414
Name: 2, dtype: object


In [None]:
# prompt: load new csv file

import pandas as pd
new_data = "pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/test.csv')"

In [None]:
# prompt: prepocessing and null value delete this dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/train.csv')

# Handle missing values (numerical features)
numerical_cols = data.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# Encode categorical features
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Scale numerical features
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

X = data_scaled.drop('NObeyesdad', axis=1)
y = data_scaled['NObeyesdad']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# prompt: implement the random forest model in this new Dataset here and find out the accuraccy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Random Forest (already present in the provided code)
rf = RandomForestClassifier()

# Convert y_train to discrete values if it's continuous
if pd.api.types.is_numeric_dtype(y_train):
    # If continuous, convert to discrete using a threshold or binning strategy
    # For example, using a threshold of 0:
    y_train = (y_train > 0).astype(int)

In [None]:
# prompt: implement the random forest model in this new Dataset  here and find out the accuraccy

# Random Forest (already present in the provided code)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Convert y_test to discrete values if it's continuous, similar to y_train
if pd.api.types.is_numeric_dtype(y_test):
    y_test = (y_test > 0).astype(int)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))

Random Forest Accuracy: 0.9144144144144144


In [None]:
# prompt: now preddiction the dataset model

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the training dataset
train_data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/train.csv')

# Load the test dataset
test_data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/test.csv')

# Preprocessing for training data (same as before)
numerical_cols = train_data.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
train_data[numerical_cols] = imputer.fit_transform(train_data[numerical_cols])

categorical_cols = train_data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

#Fit the scaler only on the features used for training (excluding the target variable)
scaler = StandardScaler()
X_train = train_data.drop('NObeyesdad', axis=1)  # Drop the target variable before scaling
scaler.fit(X_train) #Fit the scaler on the training features
train_data_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns) # Transform the training data
# Extract the target variable after scaling
y_train = train_data['NObeyesdad']


# Convert y_train to discrete values if it's continuous
if pd.api.types.is_numeric_dtype(y_train):
    y_train = (y_train > 0).astype(int)

# Preprocessing for test data (using the same transformers from training)
# Important: Apply the same transformations to the test data as the training data
# Apply imputation to all numerical features at once
test_data[numerical_cols] = imputer.transform(test_data[numerical_cols]) # Impute all numerical columns

# Exclude 'NObeyesdad' from categorical_cols when processing test data
categorical_cols_test = [col for col in categorical_cols if col in test_data.columns]

for col in categorical_cols_test:  # Use categorical_cols_test here
    # Handle potential unseen values in the test set
    # If a category is not present in the training data, replace with '<unknown>'
    test_data[col] = test_data[col].map(lambda s: '<unknown>' if s not in label_encoders[col].classes_ else s)
    # Add the '<unknown>' category to the label encoder if it's not already there
    label_encoders[col].classes_ = np.append(label_encoders[col].classes_, '<unknown>')
    test_data[col] = label_encoders[col].transform(test_data[col])

#Transform the test data using the same scaler fitted on the training features
test_data_scaled = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns) # Use transform, not fit_transform
X_test = test_data_scaled.drop('NObeyesdad', axis=1, errors='ignore') # Assuming 'NObeyesdad' is also in the test set. If not, remove this line and adjust X_test. Ignore if the column is not present


# Train the model
rf = RandomForestClassifier()
rf.fit(X_train, y_train) #Train on the original training data

# Make predictions on

In [None]:
# prompt: now make a csv file on preddicted test dataset which i find the accuracy with the random forest model implement now give me a csv dataset which have  two row and column ID an NObeyesdad

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the training dataset
train_data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/train.csv')

# Load the test dataset
test_data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/test.csv')

# Preprocessing for training data
numerical_cols = train_data.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
train_data[numerical_cols] = imputer.fit_transform(train_data[numerical_cols])

categorical_cols = train_data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

scaler = StandardScaler()
X_train = train_data.drop('NObeyesdad', axis=1)
scaler.fit(X_train)
train_data_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
y_train = train_data['NObeyesdad']

if pd.api.types.is_numeric_dtype(y_train):
    y_train = (y_train > 0).astype(int)

# Preprocessing for test data
test_data[numerical_cols] = imputer.transform(test_data[numerical_cols])

categorical_cols_test = [col for col in categorical_cols if col in test_data.columns]
for col in categorical_cols_test:
    test_data[col] = test_data[col].map(lambda s: '<unknown>' if s not in label_encoders[col].classes_ else s)
    label_encoders[col].classes_ = np.append(label_encoders[col].classes_, '<unknown>')
    test_data[col] = label_encoders[col].transform(test_data[col])

test_data_scaled = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
X_test = test_data_scaled.drop('NObeyesdad', axis=1, errors='ignore')

# Train the model
rf = RandomForestClassifier()
rf.fit(train_data_scaled, y_train)

# Make predictions
rf_preds = rf.predict(X_test)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'ID': test_data.index, 'NObeyesdad': rf_preds})

# Save the predictions to a CSV file
predictions_df.to_csv('predicted_test_data.csv', index=True)

In [None]:
preddiction = RandomForestClassifier(test_data)

In [None]:
preddiction

In [1]:
# prompt: create a submission dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the training dataset
train_data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/train.csv')

# Load the test dataset
test_data = pd.read_csv('/content/drive/MyDrive/obesity-level-prediction-fall-2024/test.csv')

# Preprocessing for training data
numerical_cols = train_data.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
train_data[numerical_cols] = imputer.fit_transform(train_data[numerical_cols])

categorical_cols = train_data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

scaler = StandardScaler()
X_train = train_data.drop('NObeyesdad', axis=1)
scaler.fit(X_train)
train_data_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
y_train = train_data['NObeyesdad']

if pd.api.types.is_numeric_dtype(y_train):
    y_train = (y_train > 0).astype(int)

# Preprocessing for test data
test_data[numerical_cols] = imputer.transform(test_data[numerical_cols])

categorical_cols_test = [col for col in categorical_cols if col in test_data.columns]
for col in categorical_cols_test:
    test_data[col] = test_data[col].map(lambda s: '<unknown>' if s not in label_encoders[col].classes_ else s)
    label_encoders[col].classes_ = np.append(label_encoders[col].classes_, '<unknown>')
    test_data[col] = label_encoders[col].transform(test_data[col])

test_data_scaled = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
X_test = test_data_scaled

# Train the model
rf = RandomForestClassifier()
rf.fit(train_data_scaled, y_train)

# Make predictions
rf_preds = rf.predict(X_test)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'ID': test_data.index, 'NObeyesdad': rf_preds})

# Save the predictions to a CSV file
predictions_df.to_csv('Submittion_dataset.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/obesity-level-prediction-fall-2024/train.csv'

In [None]:
from google.colab import drive
drive.mount('/content/drive')