In [None]:
import pandas as pd
import io

# Read the housing CSV from its hard-coded path, then show how many entries
# are missing in each column (one count per column, as a pandas Series).
df = pd.read_csv('/content/DATAHOME (1).CSV')
missing_values = df.isnull().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
ID            0
location      0
size(sqft)    1
Room          1
AGE(yrs)      1
Price(INR)    1
dtype: int64


In [None]:
# Normalise dtypes first: the price is cast to float, and 'Room' is parsed as a
# number with any unparseable entry turned into NaN (errors='coerce').
df['Price(INR)'] = df['Price(INR)'].astype(float)
df['Room'] = pd.to_numeric(df['Room'], errors='coerce')

# Fill the gaps in each numeric feature with that column's median.
numerical_cols = ['size(sqft)', 'Room', 'AGE(yrs)', 'Price(INR)']
for col in numerical_cols:
    # A median is only defined for numeric data; report and skip anything else.
    if not pd.api.types.is_numeric_dtype(df[col]):
        print(f"Warning: Column '{col}' is not numeric and will not be imputed with median.")
        continue
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# Show the per-column null counts again, followed by the imputed frame.
print("Missing values after imputation:")
print(df.isnull().sum())
display(df)

Missing values after imputation:
ID            0
location      0
size(sqft)    0
Room          0
AGE(yrs)      0
Price(INR)    0
dtype: int64


Unnamed: 0,ID,location,size(sqft),Room,AGE(yrs),Price(INR)
0,1,Downdown,1200.0,3.0,10.0,5000000.0
1,2,Suburb,1500.0,4.0,5.0,6500000.0
2,3,Uptown,1000.0,2.0,20.0,3800000.0
3,4,Suburb,2000.0,5.0,10.0,7200000.0
4,5,Downdown,850.0,3.0,15.0,4200000.0
5,6,Uptown,1200.0,3.0,12.0,5600000.0
6,7,Suburb,1100.0,3.0,7.0,4800000.0
7,8,Downdown,1750.0,4.0,3.0,7000000.0
8,9,Uptown,950.0,2.0,25.0,3500000.0
9,10,Suburb,1600.0,4.0,6.0,5000000.0


In [None]:
# Show rows that repeat an earlier row in every column
# (DataFrame.duplicated() marks the second and later occurrences).
duplicate_rows = df[df.duplicated()]
print("Duplicate rows found:")
display(duplicate_rows)

# Keep the first occurrence of each row. The explicit .copy() makes df_cleaned
# an independent frame: without it, when rows are actually dropped, pandas may
# treat df_cleaned as a slice of df and emit SettingWithCopyWarning on the
# column assignments performed on df_cleaned further down the notebook.
# The original index labels are kept on purpose so rows can still be matched
# back to df.
df_cleaned = df.drop_duplicates().copy()
print("\nDataFrame after removing duplicate rows:")
display(df_cleaned)

Duplicate rows found:


Unnamed: 0,ID,location,size(sqft),Room,AGE(yrs),Price(INR)



DataFrame after removing duplicate rows:


Unnamed: 0,ID,location,size(sqft),Room,AGE(yrs),Price(INR)
0,1,Downdown,1200.0,3.0,10.0,5000000.0
1,2,Suburb,1500.0,4.0,5.0,6500000.0
2,3,Uptown,1000.0,2.0,20.0,3800000.0
3,4,Suburb,2000.0,5.0,10.0,7200000.0
4,5,Downdown,850.0,3.0,15.0,4200000.0
5,6,Uptown,1200.0,3.0,12.0,5600000.0
6,7,Suburb,1100.0,3.0,7.0,4800000.0
7,8,Downdown,1750.0,4.0,3.0,7000000.0
8,9,Uptown,950.0,2.0,25.0,3500000.0
9,10,Suburb,1600.0,4.0,6.0,5000000.0


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Features to rescale with Min-Max scaling (area and age); the remaining
# columns are left as they are.
numerical_cols_to_normalize = ['size(sqft)', 'AGE(yrs)']

# Fit the scaler on the selected columns and write the scaled values back
# over those same columns of df_cleaned.
scaler = MinMaxScaler()
df_cleaned[numerical_cols_to_normalize] = scaler.fit_transform(
    df_cleaned[numerical_cols_to_normalize]
)

print("DataFrame after Min-Max scaling:")
display(df_cleaned)

DataFrame after Min-Max scaling:


Unnamed: 0,ID,location,size(sqft),Room,AGE(yrs),Price(INR)
0,1,Downdown,0.304348,3.0,0.318182,5000000.0
1,2,Suburb,0.565217,4.0,0.090909,6500000.0
2,3,Uptown,0.130435,2.0,0.772727,3800000.0
3,4,Suburb,1.0,5.0,0.318182,7200000.0
4,5,Downdown,0.0,3.0,0.545455,4200000.0
5,6,Uptown,0.304348,3.0,0.409091,5600000.0
6,7,Suburb,0.217391,3.0,0.181818,4800000.0
7,8,Downdown,0.782609,4.0,0.0,7000000.0
8,9,Uptown,0.086957,2.0,1.0,3500000.0
9,10,Suburb,0.652174,4.0,0.136364,5000000.0


In [None]:
# Price per square foot has to be computed from the real area.
# df_cleaned['size(sqft)'] has already been Min-Max scaled by this point, so
# dividing by it yields meaningless values and `inf` for the smallest house
# (scaled size 0.0). df still holds the imputed, unscaled sizes, so read them
# from there, aligned on df_cleaned's row labels.
raw_size_sqft = df.loc[df_cleaned.index, 'size(sqft)']

# A zero or negative area has no meaningful unit price; mask it to NaN so the
# division produces NaN instead of inf.
raw_size_sqft = raw_size_sqft.where(raw_size_sqft > 0)

df_cleaned['Price_per_SqFt'] = df_cleaned['Price(INR)'] / raw_size_sqft

print("\nDataFrame with new 'Price_per_SqFt' feature:")
display(df_cleaned)


DataFrame with new 'Price_per_SqFt' feature:


Unnamed: 0,ID,location,size(sqft),Room,AGE(yrs),Price(INR),Price_per_SqFt
0,1,Downdown,0.304348,3.0,0.318182,5000000.0,16428570.0
1,2,Suburb,0.565217,4.0,0.090909,6500000.0,11500000.0
2,3,Uptown,0.130435,2.0,0.772727,3800000.0,29133330.0
3,4,Suburb,1.0,5.0,0.318182,7200000.0,7200000.0
4,5,Downdown,0.0,3.0,0.545455,4200000.0,inf
5,6,Uptown,0.304348,3.0,0.409091,5600000.0,18400000.0
6,7,Suburb,0.217391,3.0,0.181818,4800000.0,22080000.0
7,8,Downdown,0.782609,4.0,0.0,7000000.0,8944444.0
8,9,Uptown,0.086957,2.0,1.0,3500000.0,40250000.0
9,10,Suburb,0.652174,4.0,0.136364,5000000.0,7666667.0


In [None]:
# Replace the categorical 'location' column with one 0/1 integer indicator
# column per distinct location value; all other columns pass through unchanged.
categorical_cols = ['location']
df_encoded = pd.get_dummies(data=df_cleaned, columns=categorical_cols, dtype=int)

print("\nDataFrame after one-hot encoding:")
display(df_encoded)


DataFrame after one-hot encoding:


Unnamed: 0,ID,size(sqft),Room,AGE(yrs),Price(INR),Price_per_SqFt,location_Downdown,location_Suburb,location_Uptown
0,1,0.304348,3.0,0.318182,5000000.0,16428570.0,1,0,0
1,2,0.565217,4.0,0.090909,6500000.0,11500000.0,0,1,0
2,3,0.130435,2.0,0.772727,3800000.0,29133330.0,0,0,1
3,4,1.0,5.0,0.318182,7200000.0,7200000.0,0,1,0
4,5,0.0,3.0,0.545455,4200000.0,inf,1,0,0
5,6,0.304348,3.0,0.409091,5600000.0,18400000.0,0,0,1
6,7,0.217391,3.0,0.181818,4800000.0,22080000.0,0,1,0
7,8,0.782609,4.0,0.0,7000000.0,8944444.0,1,0,0
8,9,0.086957,2.0,1.0,3500000.0,40250000.0,0,0,1
9,10,0.652174,4.0,0.136364,5000000.0,7666667.0,0,1,0
