In [4]:
import pandas as pd

# i. Load the .csv file into a DataFrame
housing_df = pd.read_csv("housing.csv")

# ii. Display information of all columns.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
print("Information of all columns:")
housing_df.info()

# iii. Display statistical information of all numerical columns
print("\nStatistical information of numerical columns:")
print(housing_df.describe())

# iv. Display the count of unique labels for the "Ocean Proximity" column.
# Guarded by a column check so the heading is always followed by a result:
# either the label counts, or an explicit message when the loaded file does
# not contain that column.
print("\nCount of unique labels for 'Ocean Proximity' column:")
if "ocean_proximity" in housing_df.columns:
    print(housing_df["ocean_proximity"].value_counts())
else:
    print("Column 'ocean_proximity' not found in this dataset.")

# v. Display which attributes (columns) have a missing-value count greater than zero
print("\nColumns with missing values:")
missing_values = housing_df.isnull().sum()
print(missing_values[missing_values > 0])



Information of all columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB
None

Statistical information of numerical columns:
       Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
count       5000.000000          5000.000000                5000.000000   
mean       68583.108984             5.977222                   6.987792   
std 

In [5]:
import pandas as pd
import numpy as np

# Read both raw CSV files up front.
diabetes_df = pd.read_csv("/content/Dataset of Diabetes  (1).csv")
adult_income_df = pd.read_csv("/content/adult.csv")

# Report the per-column NaN count of each frame (heading line, then the counts).
# NOTE(review): isnull() only sees real NaN cells. If adult.csv marks missing
# entries with a placeholder string such as "?", they are not counted here --
# confirm against the raw file.
print("Missing values in Diabetes dataset:", diabetes_df.isnull().sum(), sep="\n")
print("\nMissing values in Adult Income dataset:", adult_income_df.isnull().sum(), sep="\n")

# Median imputation applies to numeric columns ONLY, so collect their names first.
numeric_cols_diabetes = diabetes_df.select_dtypes(include=np.number).columns
numeric_cols_adult = adult_income_df.select_dtypes(include=np.number).columns

# For each frame, replace NaN in every numeric column with that column's median.
diabetes_df[numeric_cols_diabetes] = diabetes_df[numeric_cols_diabetes].pipe(
    lambda numeric_part: numeric_part.fillna(numeric_part.median())
)
adult_income_df[numeric_cols_adult] = adult_income_df[numeric_cols_adult].pipe(
    lambda numeric_part: numeric_part.fillna(numeric_part.median())
)

Missing values in Diabetes dataset:
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64

Missing values in Adult Income dataset:
age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
diabetes_df = pd.read_csv("/content/Dataset of Diabetes  (1).csv")

# Check the first few rows of the dataset
print("Diabetes dataset (first few rows):")
print(diabetes_df.head())

# Step 1: Categorize the 'BMI' column into bins (Low, Normal, High).
# With right=False every bin is closed on the left and open on the right:
#   Low    = [0, 18.5)
#   Normal = [18.5, 25)
#   High   = [25, inf)
# The top edge is infinity rather than a fixed cap, so large BMI values are
# labelled 'High' instead of falling outside every bin (which would yield NaN
# and, after one-hot encoding, a row with all three indicator columns unset).
# The Normal/High edge is 25 so that a BMI of exactly 24.9 stays 'Normal'.
bins = [0, 18.5, 25, float("inf")]
labels = ['Low', 'Normal', 'High']
diabetes_df['BMI_Category'] = pd.cut(diabetes_df['BMI'], bins=bins, labels=labels, right=False)

# Step 2: One-Hot Encoding for 'BMI_Category' (replaces it with one indicator
# column per category)
diabetes_df = pd.get_dummies(diabetes_df, columns=['BMI_Category'])

# Step 3: Encoding 'Gender' column (if present) as integer codes.
# NOTE(review): LabelEncoder treats labels that differ only in case or
# surrounding whitespace as separate classes -- confirm the raw Gender values
# are clean before relying on the codes.
if 'Gender' in diabetes_df.columns:
    label_encoder = LabelEncoder()
    diabetes_df['Gender'] = label_encoder.fit_transform(diabetes_df['Gender'])

# Display the modified dataset
print("\nDiabetes dataset after encoding:")
print(diabetes_df.head())

Diabetes dataset (first few rows):
    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  

Diabetes dataset after encoding:
    ID  No_Pation  Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975       0   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221       1   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975       0   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656       0   50   4.7  46    4.9   4.2  0.9  