<a href="https://colab.research.google.com/github/sisiyotakele/gdsc_study_session_ML_g1/blob/main/Week_2/housePricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Ask for Housing.csv through the Colab file picker only when it is missing.
# Skipping the prompt when the file is already on disk lets "Restart & Run All"
# proceed unattended, and avoids re-uploading a file of the same name
# (NOTE(review): Colab appears to store such duplicates under a new name like
# "Housing (1).csv", leaving the loader on the older copy — confirm).
import os

if not os.path.exists("Housing.csv"):
    # google.colab is only importable inside Colab, so import it lazily.
    from google.colab import files
    uploaded = files.upload()

Saving Housing.csv to Housing.csv


# **Load data**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
# Read the uploaded CSV into a DataFrame, then preview its first five rows
# as the cell's displayed output.
df = pd.read_csv(filepath_or_buffer="Housing.csv")
df.head(n=5)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None


In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Unnamed: 0,0
price,0
area,0
bedrooms,0
bathrooms,0
stories,0
mainroad,0
guestroom,0
basement,0
hotwaterheating,0
airconditioning,0


In [None]:
# Report the frame's dimensions: the raw (rows, columns) tuple first, then
# each dimension spelled out on its own line.
shape_report = (
    f"Dataset Shape: {df.shape}",
    f"Number of rows: {df.shape[0]}, Number of columns: {df.shape[1]}",
)
print(*shape_report, sep="\n")

Dataset Shape: (545, 13)
Number of rows: 545, Number of columns: 13


In [None]:
# Summary statistics for the numeric columns. Leaving the expression as the
# last line of the cell lets the notebook render it as a formatted table
# instead of the wrapped plain-text dump that print() produces.
df.describe()

              price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000  545.000000   
mean   4.766729e+06   5150.541284    2.965138    1.286239    1.805505   
std    1.870440e+06   2170.141023    0.738064    0.502470    0.867492   
min    1.750000e+06   1650.000000    1.000000    1.000000    1.000000   
25%    3.430000e+06   3600.000000    2.000000    1.000000    1.000000   
50%    4.340000e+06   4600.000000    3.000000    1.000000    2.000000   
75%    5.740000e+06   6360.000000    3.000000    2.000000    2.000000   
max    1.330000e+07  16200.000000    6.000000    4.000000    4.000000   

          parking  
count  545.000000  
mean     0.693578  
std      0.861586  
min      0.000000  
25%      0.000000  
50%      0.000000  
75%      1.000000  
max      3.000000  


In [None]:
# Feature matrix: a one-column DataFrame (list selector keeps it 2-D).
x = df.loc[:, ['area']]
# Target vector: the price column as a Series.
y = df.loc[:, 'price']

# **Split the data into training and testing sets**
## **80% for training, 20% for testing**

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Summarise how many samples landed in each partition.
split_summary = [
    "Data Split Information:",
    f"Training set size: {x_train.shape[0]} samples",
    f"Testing set size: {x_test.shape[0]} samples",
]
print("\n".join(split_summary))


# **Create and train the Linear Regression model**

In [None]:
# fit() returns the estimator itself, so construction and training chain into
# one statement; the bare name on the last line keeps the estimator as the
# cell's displayed output.
model = LinearRegression().fit(x_train, y_train)
model


# **Display model coefficients**

In [None]:
# Print the fitted line's parameters: the intercept and the single slope
# (coef_[0] belongs to 'area', the only feature the model was trained on).
print(
    f"Intercept (bias): {model.intercept_:.2f}",
    f"Coefficient for area: {model.coef_[0]:.2f}",
    sep="\n",
)

# **Make predictions on the test set**

In [None]:
# Predict prices for the held-out test rows. This assignment was missing from
# the visible cells, so every later use of y_pred failed with a NameError on a
# fresh kernel.
y_pred = model.predict(x_test)

# Line up actual and predicted prices with the area that produced them.
# .values drops the shuffled test index so the three columns align by position.
results_df = pd.DataFrame({
    'Actual Price': y_test.values,
    'Predicted Price': y_pred,
    'Area': x_test.values.flatten()
})

# Bare last expression -> rendered as a rich table rather than printed text.
results_df.head(10)

# **Calculate evaluation metrics**
## **Measure performance**

In [None]:
# Score the test-set predictions with three error metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE, in the target's units

metric_lines = [
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
]
print("\n".join(metric_lines))


# **R-squared score (coefficient of determination)**

In [None]:
# R^2 on the held-out data. The split cell names the test features `x_test`
# (lowercase); the previous `X_test` was undefined and raised a NameError.
r_squared = model.score(x_test, y_test)
print(f"R-squared Score: {r_squared:.4f}")

# **Visualization 1: Regression Line**

In [None]:
# Actual vs. predicted prices against area, with the fitted line overlaid.
# The test features are named `x_test` (lowercase) by the split cell; the
# previous `X_test` references were undefined and raised a NameError.
plt.figure(figsize=(10, 6))
plt.scatter(x_test, y_test, color='blue', alpha=0.5, label='Actual Prices')
plt.scatter(x_test, y_pred, color='red', alpha=0.5, label='Predicted Prices')
# The predictions of a one-feature linear model all lie on one straight line,
# so joining them draws the regression line even though x_test is unsorted.
plt.plot(x_test, y_pred, color='green', linewidth=2, label='Regression Line')
plt.xlabel('Area (sq ft)')
plt.ylabel('Price')
plt.title('House Price Prediction: Area vs Price')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('regression_line.png')  # Save before show(), which ends the figure's display cycle
plt.show()

# **Visualization 2: Residuals Plot**

In [None]:
# Residual = actual minus predicted, one value per test row.
residuals = y_test - y_pred

# Scatter residuals against the predicted price; the dashed line at zero marks
# a perfect prediction.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Prices')
ax.set_ylabel('Residuals (Actual - Predicted)')
ax.set_title('Residuals Plot')
ax.grid(True, alpha=0.3)
fig.savefig('residuals_plot.png')  # write the image file before displaying
plt.show()

# **Visualization 3: Distribution of Errors**

In [None]:
# Histogram of the residuals computed in the previous cell, in 30 bins.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Prediction Errors')
ax.grid(True, alpha=0.3)
fig.savefig('error_distribution.png')  # write the image file before displaying
plt.show()


# **Make a prediction for a new house**

In [None]:
new_area = 8000  # Example: 8000 sq ft house

# The model was fitted on a DataFrame whose only column is 'area', so predict
# on a one-row frame with the same column name. Passing a bare [[8000]] list
# makes scikit-learn warn that X does not have valid feature names.
new_house = pd.DataFrame({'area': [new_area]})
new_price = model.predict(new_house)

print(f"Prediction for a house with {new_area} sq ft area:")
print(f"Predicted Price: ${new_price[0]:,.2f}")