In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# A helper function to detect and load a file in Google Colab
def load_data_from_colab():
    """
    Prompts the user to upload a CSV file and loads it as a pandas DataFrame.
    This function is designed for use in Google Colab.
    """
    try:
        from google.colab import files
        print("Please upload the Real Estate Valuation dataset CSV file.")
        uploaded = files.upload()

        # Get the filename and read the data
        filename = next(iter(uploaded))
        df = pd.read_csv(dataset.)
        return df
    except ImportError:
        print("This code is designed for Google Colab. Please run it there and upload the file.")
        print("For local execution, you can replace the function call with pd.read_csv('your_file_path.csv')")
        return None

# --- Step 1: Load the dataset ---
# The dataset has no missing values, but this step is still crucial.
# Dataset link: https://archive.ics.uci.edu/dataset/477/real+estate+valuation+data+set
df = load_data_from_colab()

if df is None:
    exit()

print("--- Data Loaded Successfully ---")
print("\n--- Initial DataFrame Info ---")
df.info()
print("\n--- First 5 rows of the raw data ---")
print(df.head())

# --- Step 2: Preprocessing ---
# This dataset is relatively clean, so we focus on splitting and scaling.
# Drop the 'No' column which is just an index.
df.drop('No', axis=1, inplace=True)

# Separate features (X) and target (y)
# The target variable 'Y house price of unit area' is the last column
X = df.iloc[:, :-1]  # All columns except the last one
y = df.iloc[:, -1]   # The last column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the numerical features. This is a crucial step for regularized models
# like Ridge and Lasso as they are sensitive to the scale of features.
print("\n--- Scaling numerical features ---")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- Step 3: Model Training ---
# We will train three different models for comparison:
# 1. Standard Linear Regression
# 2. Ridge Regression (L2 Regularization)
# 3. Lasso Regression (L1 Regularization)

print("\n--- Training Linear, Ridge, and Lasso Regression Models ---")

# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

# Ridge Regression (we can experiment with the alpha parameter)
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_scaled, y_train)

# Lasso Regression (we can experiment with the alpha parameter)
lasso_reg = Lasso(alpha=0.01)
lasso_reg.fit(X_train_scaled, y_train)


# --- Step 4: Model Evaluation ---
# Evaluate the models on the test set
y_pred_lin = lin_reg.predict(X_test_scaled)
y_pred_ridge = ridge_reg.predict(X_test_scaled)
y_pred_lasso = lasso_reg.predict(X_test_scaled)

# Calculate and print performance metrics (Mean Squared Error and R-squared)
print("\n--- Model Performance Evaluation ---")
print(f"Linear Regression R^2 Score: {r2_score(y_test, y_pred_lin):.4f}")
print(f"Ridge Regression R^2 Score: {r2_score(y_test, y_pred_ridge):.4f}")
print(f"Lasso Regression R^2 Score: {r2_score(y_test, y_pred_lasso):.4f}")

# --- Step 5: Visualization ---
# Visualize the results to understand the differences between the models.

# Plot a comparison of the coefficients
feature_names = X.columns
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression']
coefficients = [lin_reg.coef_, ridge_reg.coef_, lasso_reg.coef_]

plt.figure(figsize=(15, 8))
plt.title('Comparison of Coefficients for Different Regression Models', fontsize=16)

# Create a bar chart for coefficients
x = np.arange(len(feature_names))
width = 0.25

plt.bar(x - width, coefficients[0], width, label=models[0])
plt.bar(x, coefficients[1], width, label=models[1])
plt.bar(x + width, coefficients[2], width, label=models[2])

plt.xticks(x, feature_names, rotation=45, ha="right")
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.legend()
plt.tight_layout()
plt.show()

# Plot the predicted vs true values for Lasso regression
plt.figure(figsize=(10, 6))
plt.title('Lasso Regression: Predicted vs. True Values', fontsize=16)
sns.regplot(x=y_test, y=y_pred_lasso, ci=None, color='blue', line_kws={'color':'red'})
plt.xlabel('True Values (House Price)')
plt.ylabel('Predicted Values (House Price)')
plt.show()

# Create a single plot to show the relationship between true vs predicted values for all models.
plt.figure(figsize=(12, 8))
plt.title('Predicted vs. True Values for All Models', fontsize=16)

plt.scatter(y_test, y_pred_lin, color='blue', alpha=0.5, label='Linear Regression')
plt.scatter(y_test, y_pred_ridge, color='green', alpha=0.5, label='Ridge Regression')
plt.scatter(y_test, y_pred_lasso, color='red', alpha=0.5, label='Lasso Regression')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2) # Diagonal line
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.show()


Please upload the Real Estate Valuation dataset CSV file.


Saving dataset.xlsx to dataset (3).xlsx


NameError: name 'dataset' is not defined