**Importing Necessary Libraries**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**Loading the Dataset**

In [None]:
# Location of the housing CSV. NOTE(review): this is an absolute path
# (looks like a Colab upload directory) - adjust when running elsewhere.
DATA_PATH = '/content/Bengaluru_House_Data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (five is also head()'s default).
data.head(5)

In [None]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple.
data.shape

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts
# and dtypes. Useful here for spotting columns with missing values.
data.info()

**Data Preprocessing Steps**

In [None]:
# Columns judged irrelevant or too sparse to be useful as features.
unused_columns = ['area_type', 'availability', 'society']
data = data.drop(columns=unused_columns)

In [None]:
# Keep only rows in which every critical field is present.
required_columns = ['location', 'size', 'total_sqft', 'bath', 'price']
data = data.dropna(subset=required_columns)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the columns
# that remain after the drops above.
data.describe()

In [None]:
# Derive the BHK count from 'size': take the text before the first space
# and store it as an integer (rows with a missing 'size' were dropped earlier).
data['BHK'] = data['size'].str.split(' ').str[0].astype('int64')

In [None]:
# Convert 'total_sqft' to numeric by handling ranges or converting irregular data
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``, ``1200``) or a two-sided range
        written as ``"low - high"``.

    Returns
    -------
    float or None
        The numeric value, or the midpoint of the range. ``None`` is returned
        for anything that cannot be interpreted this way (text with units,
        malformed ranges, non-string/non-numeric objects) so the caller can
        drop those rows afterwards.
    """
    # Fast path: the value is already numeric or a plain numeric string.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can describe a range; anything else is unparseable.
    if not isinstance(x, str):
        return None

    # A valid range has exactly two sides separated by a single dash.
    tokens = x.split('-')
    if len(tokens) != 2:
        return None

    try:
        low, high = float(tokens[0]), float(tokens[1])
    except ValueError:
        # e.g. "1000 - 2000 Sq. Yards": a dash is present but a side is not
        # a number. Report it as unparseable instead of raising mid-apply.
        return None

    return (low + high) / 2

# Run the converter element-wise; entries it cannot parse come back as missing.
data['total_sqft'] = data['total_sqft'].map(convert_sqft_to_num)

In [None]:
# Discard rows whose total_sqft is missing after the numeric conversion.
data = data[data['total_sqft'].notna()]

**One-Hot Encoding**

In [None]:
# 'location' is categorical, so expand it into indicator columns.
# drop_first=True omits the first category's indicator column.
dummies = pd.get_dummies(data['location'], drop_first=True)

# Append the indicator columns, then remove the raw 'location' and 'size'
# columns, which are no longer needed.
data = pd.concat([data, dummies], axis=1).drop(columns=['location', 'size'])


In [None]:
# Target variable (y) is 'price'; the feature matrix (X) is every other column.
y = data['price']
X = data.drop(columns='price')


**Visualisation**

In [None]:
# Price distribution: histogram of 'price' with a KDE curve overlaid.
# matplotlib (plt) and seaborn (sns) are imported in the notebook's first
# cell together with the other dependencies.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['price'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of House Prices')
ax.set_xlabel('Price (Lakhs)')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# total_sqft against price, with points coloured by BHK.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='total_sqft', y='price', hue='BHK', palette='cool', ax=ax)
ax.set_title('Total Square Feet vs Price')
ax.set_xlabel('Total Square Feet')
ax.set_ylabel('Price (Lakhs)')
plt.show()

**Train-Test Split**

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Feature scaling**

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill remaining NaNs with the column mean ('median' and 'most_frequent'
# are alternative strategies).
imputer = SimpleImputer(strategy='mean')

# Learn the column means from the training split only, then apply those
# same means to both splits.
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Standardise the features. The scaling statistics are fitted on the
# training split and reused unchanged for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


**Linear Regression model**

In [None]:
# Fit a linear regression on the training split. fit() returns the
# estimator itself, so the construction and fit can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predictions: apply the fitted model to the held-out test features.
y_pred = model.predict(X_test)

**Model evaluation**

In [None]:
# Compare the test-set predictions with the true prices.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report both evaluation metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
    sep="\n",
)