Use the dataset_regression dataset (dataset_regression.csv) for the regression cells below.

In [None]:
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Single-feature linear regression: predict SalePrice from LotArea using a CSV
# uploaded through the Colab file picker, then report test-set metrics and plot
# the fitted line over the held-out points.

# Step 1: Upload the dataset from your local desktop
uploaded = files.upload()  # Upload file through Colab interface

# Guard against a cancelled/empty upload: without this check the
# next(iter(...)) call below fails with a bare StopIteration that gives no
# hint about what went wrong.
if not uploaded:
    raise ValueError("No file was uploaded; please upload the dataset CSV and re-run this cell.")

# Step 2: Load the dataset into pandas dataframe
file_name = next(iter(uploaded))  # Only the first uploaded file is used
df = pd.read_csv(file_name)

# Step 3: Check if the 'LotArea' and 'SalePrice' columns exist in the dataset
if 'LotArea' not in df.columns or 'SalePrice' not in df.columns:
    raise ValueError("'LotArea' or 'SalePrice' column is missing from the dataset.")

# Step 4: Select relevant columns and clean data
df = df[['LotArea', 'SalePrice']]

# Drop rows with missing values in either the feature or the target
df = df.dropna(subset=['LotArea', 'SalePrice'])

# Step 5: Split data into features (X) and target (y)
# X is kept as a one-column DataFrame (double brackets) because scikit-learn
# estimators expect a 2-D feature matrix.
X = df[['LotArea']]
y = df['SalePrice']

# Step 6: Split the data into training and testing sets (80% training, 20% testing)
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 8: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 9: Evaluate the model's performance on the held-out test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Same units as SalePrice, easier to interpret than MSE
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

# Step 10: Visualize the results
plt.figure(figsize=(10,6))
plt.scatter(X_test, y_test, color='blue', label='Actual data')
# All predictions of a one-feature linear model lie on one straight line, so
# plotting them in test-set order still draws the regression line correctly.
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Regression line')
plt.title('Linear Regression: LotArea vs SalePrice')
plt.xlabel('LotArea')
plt.ylabel('SalePrice')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset (replace 'dataset_regression.csv' with the actual file path if it differs)
data = pd.read_csv('dataset_regression.csv')

# Three alternative sets of six predictor columns each; a separate linear
# regression against 'SalePrice' is fitted for every set further below.
feature_sets = [
    ['LotArea', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea', 'GarageCars'],
    ['MSSubClass', 'LotFrontage', 'BsmtFinSF1', '1stFlrSF', 'TotRmsAbvGrd', 'Fireplaces'],
    ['OverallCond', 'YearRemodAdd', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold']
]

def perform_regression(features):
    """Fit a linear regression of SalePrice on ``features`` and report its fit.

    Reads the module-level ``data`` frame, fills missing values with the
    column means, holds out 20% of the rows as a test set
    (``random_state=42``), prints the test-set mean squared error and
    R-squared, and shows an actual-vs-predicted plot.

    Args:
        features: List of column names in ``data`` to use as predictors.

    Returns:
        None. Results are printed and plotted.
    """
    # Handle missing values.
    # fillna() is used without inplace=True so new objects are returned: the
    # shared ``data`` frame is never modified by this function, and pandas has
    # no reason to emit SettingWithCopyWarning for writing into a selection.
    X = data[features]
    X = X.fillna(X.mean())
    y = data['SalePrice']
    y = y.fillna(y.mean())

    # Split the dataset (fixed seed keeps the split identical across feature sets)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluation on the held-out test set
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Features: {features}")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}\n")

    # Plot Actual vs Predicted with regression line
    plt.figure(figsize=(8, 6))
    sns.regplot(x=y_test, y=y_pred, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
    plt.xlabel('Actual SalePrice')
    plt.ylabel('Predicted SalePrice')
    plt.title(f'Actual vs Predicted for Features: {features}')
    plt.show()

# Run regression for each feature set: every call fits one model, prints its
# metrics and shows its actual-vs-predicted plot.
for feature_set in feature_sets:
    perform_regression(feature_set)