<a href="https://colab.research.google.com/github/sudeeksha03/EDA-obesity-lifestyle/blob/main/data_transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data transformation**

In [1]:
# Upload the dataset, label-encode its text columns and min-max scale every
# numeric column. After this cell `df` is entirely numeric.
from google.colab import files
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

uploaded = files.upload()
if not uploaded:
    # With an empty mapping the loop below never runs, `df` is never bound and
    # the cell would otherwise fail later with an unrelated NameError.
    raise ValueError(
        "No file was uploaded - re-run this cell and select the dataset CSV."
    )

# Every uploaded file is read and previewed; `df` ends up holding the last one.
for filename in uploaded:
    df = pd.read_csv(filename)
    print(f"{filename} loaded successfully.")
    display(df.head())

# Untouched snapshot of the loaded data: the steps below overwrite `df` in
# place, so without this the raw values can only be recovered by re-uploading.
df_raw = df.copy()

# Encode categorical columns
# One fitted LabelEncoder per text column is kept in `label_encoders` so the
# integer codes can be mapped back to the original labels later.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize numerical features
# 'number' selects every numeric dtype (not only int64/float64), so a column
# is never skipped just because it was stored with a different width.
# NOTE(review): at this point the label-encoded columns are numeric as well,
# so they - including the class label column - are rescaled together with the
# measured features. Confirm that this is intended before modelling.
scaler = MinMaxScaler()
numerical_cols = df.select_dtypes(include='number').columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

df.head()


Saving Obesity prediction.csv to Obesity prediction.csv
Obesity prediction.csv loaded successfully.


Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,0.0,0.148936,0.320755,0.186567,1.0,0.0,0.5,0.666667,0.666667,0.0,0.5,0.0,0.0,0.5,1.0,0.75,0.166667
1,0.0,0.148936,0.132075,0.126866,1.0,0.0,1.0,0.666667,0.666667,1.0,1.0,1.0,1.0,0.0,0.666667,0.75,0.166667
2,1.0,0.191489,0.660377,0.283582,1.0,0.0,0.5,0.666667,0.666667,0.0,0.5,0.0,0.666667,0.5,0.333333,0.75,0.166667
3,1.0,0.276596,0.660377,0.358209,0.0,0.0,1.0,0.666667,0.666667,0.0,0.5,0.0,0.666667,0.0,0.333333,1.0,0.833333
4,1.0,0.170213,0.622642,0.379104,0.0,0.0,0.5,0.0,0.666667,0.0,0.5,0.0,0.0,0.0,0.666667,0.75,1.0


 Understand Data Types & Nulls

In [2]:
# Structural overview: column names, non-null counts, dtypes and memory use
df.info()

# Count the missing entries in each column and print them under a heading
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          2111 non-null   float64
 1   Age             2111 non-null   float64
 2   Height          2111 non-null   float64
 3   Weight          2111 non-null   float64
 4   family_history  2111 non-null   float64
 5   FAVC            2111 non-null   float64
 6   FCVC            2111 non-null   float64
 7   NCP             2111 non-null   float64
 8   CAEC            2111 non-null   float64
 9   SMOKE           2111 non-null   float64
 10  CH2O            2111 non-null   float64
 11  SCC             2111 non-null   float64
 12  FAF             2111 non-null   float64
 13  TUE             2111 non-null   float64
 14  CALC            2111 non-null   float64
 15  MTRANS          2111 non-null   float64
 16  Obesity         2111 non-null   float64
dtypes: float64(17)
memory usage: 280.

 Rename Columns (if needed for consistency)

In [3]:
# Optional: bring the header into lower_snake_case.
# Each name is trimmed, lowercased and has its spaces turned into underscores.
snake_case_names = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns = snake_case_names
display(df.columns)


Index(['gender', 'age', 'height', 'weight', 'family_history', 'favc', 'fcvc',
       'ncp', 'caec', 'smoke', 'ch2o', 'scc', 'faf', 'tue', 'calc', 'mtrans',
       'obesity'],
      dtype='object')

Drop Irrelevant Columns

In [4]:
# Example: discard ID-like columns whose name starts with "unnamed"
# (matched case-insensitively); every other column is kept as is.
is_unnamed = df.columns.str.contains('^unnamed', case=False)
df = df.loc[:, ~is_unnamed]


Encode Categorical Variables

In [5]:
# Columns that are still stored with the object dtype count as categorical
object_frame = df.select_dtypes(include=['object'])
cat_cols = object_frame.columns
print("Categorical Columns:", cat_cols.tolist())

# One-hot encode those columns, dropping the first level of each to avoid a
# redundant indicator.
# NOTE(review): the recorded output of this cell lists no categorical columns,
# i.e. every text column had already been converted to numbers beforehand, so
# this step returned the frame unchanged in that run.
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)
display(df_encoded.head())


Categorical Columns: []


Unnamed: 0,gender,age,height,weight,family_history,favc,fcvc,ncp,caec,smoke,ch2o,scc,faf,tue,calc,mtrans,obesity
0,0.0,0.148936,0.320755,0.186567,1.0,0.0,0.5,0.666667,0.666667,0.0,0.5,0.0,0.0,0.5,1.0,0.75,0.166667
1,0.0,0.148936,0.132075,0.126866,1.0,0.0,1.0,0.666667,0.666667,1.0,1.0,1.0,1.0,0.0,0.666667,0.75,0.166667
2,1.0,0.191489,0.660377,0.283582,1.0,0.0,0.5,0.666667,0.666667,0.0,0.5,0.0,0.666667,0.5,0.333333,0.75,0.166667
3,1.0,0.276596,0.660377,0.358209,0.0,0.0,1.0,0.666667,0.666667,0.0,0.5,0.0,0.666667,0.0,0.333333,1.0,0.833333
4,1.0,0.170213,0.622642,0.379104,0.0,0.0,0.5,0.0,0.666667,0.0,0.5,0.0,0.0,0.0,0.666667,0.75,1.0


Handle Outliers or Scale Features

In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Every column with a numeric dtype takes part in the scaling
num_cols = df_encoded.select_dtypes(include=[np.number]).columns

# Standardise those columns (zero mean, unit variance) and write the result
# back into the same frame.
# NOTE(review): `scaler` replaces the scaler object created in the first cell,
# and the recorded output shows the `obesity` column being standardised along
# with the features - confirm both are intended.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_encoded[num_cols])
df_encoded[num_cols] = standardized_values

display(df_encoded.head())

Unnamed: 0,gender,age,height,weight,family_history,favc,fcvc,ncp,caec,smoke,ch2o,scc,faf,tue,calc,mtrans,obesity
0,-1.011914,-0.522124,-0.875589,-0.862558,0.472291,-2.759769,-0.785019,0.404153,0.300346,-0.1459,-0.013073,-0.218272,-1.188039,0.561997,1.419172,0.503337,-1.032796
1,-1.011914,-0.522124,-1.947599,-1.168077,0.472291,-2.759769,1.088342,0.404153,0.300346,6.853997,1.618759,4.581439,2.33975,-1.080625,-0.52116,0.503337,-1.032796
2,0.988227,-0.206889,1.054029,-0.36609,0.472291,-2.759769,-0.785019,0.404153,0.300346,-0.1459,-0.013073,-0.218272,1.16382,0.561997,-2.461491,0.503337,-1.032796
3,0.988227,0.423582,1.054029,0.015808,-2.117337,-2.759769,1.088342,0.404153,0.300346,-0.1459,-0.013073,-0.218272,1.16382,-1.080625,-2.461491,1.29628,1.016776
4,0.988227,-0.364507,0.839627,0.12274,-2.117337,-2.759769,-0.785019,-2.167023,0.300346,-0.1459,-0.013073,-0.218272,-1.188039,-1.080625,-0.52116,0.503337,1.529168


Save the Transformed Dataset

In [9]:
# Persist the transformed frame as CSV (row index omitted) in the current
# working directory, then confirm on stdout.
output_path = 'transformed_obesity_dataset.csv'
df_encoded.to_csv(output_path, index=False)
print("✅ Transformed data saved as 'transformed_obesity_dataset.csv'")


✅ Transformed data saved as 'transformed_obesity_dataset.csv'
