In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Original data
data = {
    'Gender': [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    'Age Range': [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2],
    'Head Size(cm^3)': [4512, 3738, 4261, 3777, 4177, 3585, 3785, 3559, 3613, 3982, 3443, 3993, 3640, 4208, 3832],
    'Brain Weight(grams)': [1530, 1297, 1335, 1282, 1590, 1300, 1400, 1255, 1355, 1375, 1340, 1380, 1355, 1522, 1208]
}

df = pd.DataFrame(data)

# Extracting numeric columns for scaling
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Min-Max scaling
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Display the normalized data
print(df)

    Gender  Age Range  Head Size(cm^3)  Brain Weight(grams)
0      0.0        0.0         1.000000             0.842932
1      0.0        0.0         0.275959             0.232984
2      0.0        0.0         0.765201             0.332461
3      0.0        0.0         0.312442             0.193717
4      0.0        0.0         0.686623             1.000000
5      0.0        0.0         0.132834             0.240838
6      0.0        0.0         0.319925             0.502618
7      0.0        0.0         0.108513             0.123037
8      0.0        1.0         0.159027             0.384817
9      1.0        1.0         0.504210             0.437173
10     1.0        1.0         0.000000             0.345550
11     1.0        1.0         0.514500             0.450262
12     1.0        1.0         0.184284             0.384817
13     1.0        1.0         0.715622             0.821990
14     1.0        1.0         0.363891             0.000000


In [12]:
import pandas as pd

# Load the dataset (replace 'path_to_file' with the actual path to the downloaded CSV file)
file_path ='/content/housing.csv'
housing_data = pd.read_csv(file_path)

# a) Describe the data
data_description = housing_data.describe()
print("Data Description:")
print(data_description)

# b) Find data type and shape of each column
data_info = housing_data.info()
print("\nData Types and Shape:")
print(data_info)

# c) Find null values and fill them with '0' or mean of the column
null_values = housing_data.isnull().sum()
print("\nNull Values:")
print(null_values)

# Assuming you want to fill null values with the mean of each column
housing_data_filled = housing_data.fillna(housing_data.mean())


Data Description:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.499900   
25

  housing_data_filled = housing_data.fillna(housing_data.mean())


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the dataset (replace 'path_to_file' with the actual path to the downloaded CSV file)
file_path = '/content/housing.csv'
housing_data = pd.read_csv(file_path)

# Identify non-numeric columns
non_numeric_columns = housing_data.select_dtypes(exclude=['float64', 'int64']).columns

# Separate features and target variable
features = housing_data.drop(columns=['median_house_value'])
target = housing_data['median_house_value']

# Separate numeric and non-numeric features
numeric_features = features.drop(columns=non_numeric_columns)
categorical_features = features[non_numeric_columns]

# Apply one-hot encoding to categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
categorical_features_encoded = pd.DataFrame(encoder.fit_transform(categorical_features), columns=encoder.get_feature_names_out(categorical_features.columns))

# Concatenate numeric and encoded categorical features
features_processed = pd.concat([numeric_features, categorical_features_encoded], axis=1)

# Normalize the data with min-max scaling
scaler = MinMaxScaler()
features_normalized = pd.DataFrame(scaler.fit_transform(features_processed), columns=features_processed.columns)

# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(features_normalized, target, test_size=0.2, random_state=42)

# Print information about the datasets
print("Features:")
print(features.head())
print("\nTarget Variable:")
print(target.head())
print("\nNormalized Features:")
print(features_normalized.head())
print("\nTrain and Test Shapes:")
print("X_train:", X_train.shape, "X_test:", X_test.shape, "y_train:", y_train.shape, "y_test:", y_test.shape)


Features:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income ocean_proximity  
0       322.0       126.0         8.3252        NEAR BAY  
1      2401.0      1138.0         8.3014        NEAR BAY  
2       496.0       177.0         7.2574        NEAR BAY  
3       558.0       219.0         5.6431        NEAR BAY  
4       565.0       259.0         3.8462        NEAR BAY  

Target Variable:
0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64

Normalized Features:
   longitude  latitude  housing_me

