In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from pygam import LinearGAM, s
from sklearn.metrics import mean_squared_error
import seaborn as sns

# Load dataset
house_df = pd.read_csv("kc_house_data.csv")  # Ensure the file is in the correct path

# Select relevant features (based on available columns in your dataset)
features = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "condition",
            "grade", "sqft_above", "sqft_basement", "yr_built"]
target = "price"  # Updated target column

# Drop rows with a missing value in any modelling column *before* fitting anything,
# so the one-predictor model below and the multi-feature models see the same rows.
house_df = house_df.dropna(subset=features + [target])

# One-predictor baseline: price as a function of living area.
# These names follow the same column naming as `features`/`target`; the previous
# 'SqFtTotLiving' / 'AdjSalePrice' labels do not match the columns this notebook
# selects from the file and would fail the lookup below.
predictors = ['sqft_living']
outcome = 'price'

simple_lm = LinearRegression()
simple_lm.fit(house_df[predictors], house_df[outcome])
# Example: Intercept ~ base, Coef ~ $ per sq ft

# Design matrix and response used by every multi-feature model further down.
X = house_df[features]
y = house_df[target]

def _fit_and_report(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, score it on the evaluation set, and print its RMSE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    label : str
        Human-readable model name used as the prefix of the printed line.
    X_fit, y_fit : training features and response.
    X_eval, y_eval : held-out features and response.

    Returns
    -------
    (y_pred, rmse) : the predictions on `X_eval` and the root of
        ``mean_squared_error(y_eval, y_pred)``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
    print(f"{label} RMSE: {rmse:.2f}")
    return y_pred, rmse


# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Simple Linear Regression
lin_reg = LinearRegression()
y_pred_lin, rmse_lin = _fit_and_report(
    lin_reg, "Linear Regression", X_train, y_train, X_test, y_test
)

# 2. Polynomial Regression (Degree 2)
poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
y_pred_poly, rmse_poly = _fit_and_report(
    poly, "Polynomial Regression (Degree 2)", X_train, y_train, X_test, y_test
)

# 3. Spline Regression using Generalized Additive Models (GAM)
# One spline term per feature column, built from the actual column count so the
# term list stays in sync if the feature selection changes.
spline_terms = s(0)
for col_idx in range(1, X_train.shape[1]):
    spline_terms = spline_terms + s(col_idx)
gam = LinearGAM(spline_terms)
y_pred_gam, rmse_gam = _fit_and_report(
    gam, "Spline Regression (GAM)", X_train, y_train, X_test, y_test
)

# Cross-validation for model evaluation
# 5-fold CV of the linear model on the training split. The scorer returns the
# *negated* MSE per fold, so the sign is flipped before averaging and taking
# the square root.
cv_scores = cross_val_score(
    lin_reg,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
rmse_cv = np.sqrt(np.mean(-cv_scores))
print(f"Cross-Validation RMSE: {rmse_cv:.2f}")

# Plot Residuals for Diagnostics
# Residuals are drawn as actual - predicted (the usual sign convention), so points
# above the dashed zero line are under-predictions and points below are
# over-predictions.
fig, ax = plt.subplots(figsize=(8, 6))
residual_series = (
    (y_pred_lin, "Linear Regression", 'b'),
    (y_pred_poly, "Polynomial Regression", 'r'),
    (y_pred_gam, "Spline Regression", 'g'),
)
for y_pred, model_label, point_color in residual_series:
    ax.scatter(y_test, y_test - y_pred, alpha=0.5, label=model_label, color=point_color)
ax.axhline(y=0, color='black', linestyle='--')
ax.set_xlabel("Actual Sale Price")
ax.set_ylabel("Residuals")
ax.legend()
ax.set_title("Residual Plot")
plt.show()


NameError: name 'house' is not defined

In [5]:
# Imports first, so this cell does not depend on names left in the kernel by an
# earlier cell before it reaches its own import lines.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression

house_df = pd.read_csv("kc_house_data.csv")  # Ensure the file is in the correct path

# Select relevant features (based on available columns in your dataset)
features = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "condition",
            "grade", "sqft_above", "sqft_basement", "yr_built"]
target = "price"  # Updated target column

# Define predictors and outcome
predictors = ['sqft_living']
outcome = 'price'

# Fit the Linear Regression model once, on the loaded dataset. The placeholder
# sample frame that used to overwrite `house_df` here has been replaced with the
# actual data, as its own comment asked.
simple_lm = LinearRegression()
simple_lm.fit(house_df[predictors], house_df[outcome])

# Predictions
house_df['PredictedPrice'] = simple_lm.predict(house_df[predictors])

# Plot
size_col = predictors[0]
# Sort by the predictor so the fitted line is drawn left to right as one stroke.
line_df = house_df.sort_values(size_col)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=house_df[size_col], y=house_df[outcome], label="Actual Data", ax=ax)
ax.plot(line_df[size_col], line_df['PredictedPrice'], color='red', label="Fitted Line")
ax.set_xlabel(f"Size ({size_col})")
ax.set_ylabel(f"Price ({outcome})")
ax.set_title("Price vs. Size Fit")
ax.legend()
ax.grid(True)
plt.show()


ModuleNotFoundError: No module named 'seaborn'

In [6]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.
