In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the real-estate CSV (path is relative to the notebook's working directory)
csv_path = "./dataset/Real estate.csv"
data = pd.read_csv(csv_path)
# Show the column labels so later cells can refer to them by their exact names
print(data.columns)

In [None]:
# Data preprocessing: count the null entries in each column
missing_values = data.isna().sum()
# One print call; sep="\n" puts the heading and the counts on separate lines
print("Missing values:", missing_values, sep="\n")

In [None]:
# Inspect the dtype pandas assigned to every column
data_types = data.dtypes
# Leading "\n" in the heading leaves a blank line before it
print("\nData types:", data_types, sep="\n")

In [None]:
# Separate the target column from the predictor columns
target_column = 'Y house price of unit area'
Y = data[target_column]
# 'No' is dropped together with the target
# NOTE(review): 'No' looks like a row identifier with no predictive meaning — confirm
X = data.drop(columns=['No', target_column])

In [None]:
# Split the data: hold out 20% of the rows for testing.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model training: ordinary least squares fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, Y_train)
# Keep the fitted estimator as the cell's last expression so it is displayed
model

In [None]:
# Model evaluation: score predictions for the held-out test rows
Y_pred = model.predict(X_test)

mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
# RMSE is the square root of MSE, so it is in the same units as the target
rmse = np.sqrt(mse)

# Report the metrics in a fixed order: MSE, MAE, RMSE
for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

In [None]:
# Scatter of actual (x) against predicted (y) prices for the test rows
fig, ax = plt.subplots(figsize=(10, 6))
# Semi-transparent markers keep overlapping points distinguishable
ax.scatter(Y_test, Y_pred, alpha=0.5)
ax.set_xlabel("Actual House Prices")
ax.set_ylabel("Predicted House Prices")
ax.set_title("Actual vs Predicted House Prices")
plt.show()

In [None]:
# Feature importances
# A raw linear-regression coefficient is "change in prediction per one unit of
# the feature", so its magnitude depends on the feature's units and cannot be
# compared across columns measured on different scales. Multiplying each
# coefficient by the training-set standard deviation of its feature gives
# "change in prediction per one standard deviation", which is comparable.
feature_std = X_train.std()
# Both Series are indexed by column name, so the product aligns by label
importances = pd.Series(model.coef_, index=X.columns) * feature_std
importances_sorted = importances.abs().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
importances_sorted.plot(kind='barh', ax=ax)
# barh draws the first row at the bottom; invert so the largest bar is on top
ax.invert_yaxis()
ax.set_xlabel("Feature Importance (|coefficient| x feature std)")
ax.set_ylabel("Features")
ax.set_title("Feature Importances")
plt.show()

In [None]:
# Interpretation: tabulate every feature next to its fitted coefficient
coefficient_columns = {'feature': X.columns, 'coefficient': model.coef_}
coefficients = pd.DataFrame(coefficient_columns)
# Leading "\n" leaves a blank line before the heading; sep="\n" puts the table below it
print("\nCoefficients:", coefficients, sep="\n")