**Linear Regression (Supervised Learning - Regression)**

**Task:** Predict a continuous value (e.g., house prices).

**Dataset:** California Housing Dataset (built into scikit-learn).

In [5]:
# Install scikit-learn
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Import necessary libraries
from sklearn.datasets import fetch_california_housing  # To load the California housing dataset
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.linear_model import LinearRegression  # To create and train a Linear Regression model
from sklearn.metrics import mean_squared_error  # To evaluate the model's performance
from sklearn.metrics import r2_score  # To evaluate the model's performance
import numpy as np  # To handle numerical operations

# Step 1: Load the California Housing dataset
data = fetch_california_housing()  # Fetches the dataset, which contains house prices and related features

# Step 2: Extract features (X) and target variable (y)
X = data.data  # Features (e.g., population, median income, house age, etc.)
y = data.target  # Target variable (median house value in $100,000s)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# `test_size=0.2` means 20% of the data is used for testing, 80% for training
# `random_state=42` ensures reproducibility, so the same data split happens every time the code runs

# Step 4: Create and train a Linear Regression model
model = LinearRegression()  # Initializes the Linear Regression model
model.fit(X_train, y_train)  # Trains the model using the training data (X_train, y_train)

# Step 5: Make predictions on the test data
y_pred = model.predict(X_test)  # Uses the trained model to predict house prices for the test set

# Step 6: Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)  # Calculates the average squared difference between actual and predicted values
print("Mean Squared Error (MSE):", mse)  # Prints the error value

r2 = r2_score(y_test, y_pred) #Added r2 score.
print("R-squared (R2):", r2)

# To prevent potential overflow issues, convert predictions to a NumPy array
y_pred = np.array(y_pred)
# y_pred


Mean Squared Error (MSE): 0.5558915986952426
R-squared (R2): 0.5757877060324521


array([0.71912284, 1.76401657, 2.70965883, ..., 4.46877017, 1.18751119,
       2.00940251])

**Task:**
1. Predict Car Prices using a Custom Dataset

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.linear_model import LinearRegression  # To create and train a Linear Regression model
from sklearn.metrics import mean_squared_error  # To evaluate the model's performance
from sklearn.impute import SimpleImputer  # To handle missing values
from sklearn.metrics import r2_score  
import numpy as np

# Step 1: Load the dataset
df = pd.read_csv("D:\\CUBICLES\\car-sale-advertisements\\car_ad.csv", encoding='ISO-8859-1') #replace with your file path.
# print(df.head())
# df.info()

# Step 2: Extract features (X) and target variable (y)
X = df[['year', 'mileage', 'engV']]
y = df['price']

# Handle missing values by imputing with the mean
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Create and train a Linear Regression model
model = LinearRegression()  
model.fit(X_train, y_train)  

# Step 5: Make predictions on the test data
y_pred = model.predict(X_test) 

# Step 6: Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)  # Calculates the average squared difference between actual and predicted values
formatted_mse = "{:.5f}".format(mse % 100) #Modulus 100 to get last 2 digits before decimal.
print(f"Mean Squared Error (MSE): {formatted_mse}")

r2 = r2_score(y_test, y_pred) #Added r2 score.
print("R-squared (R2):", r2)

y_pred = np.array(y_pred)
print(y_pred)

Mean Squared Error (MSE): 68.53786
R-squared (R2): 0.14461985527651156
[11877.47512055 18549.59601949 26632.39529023 ... 15720.65725138
 19436.20200534 15336.03759624]


**Task:**

2. Salary Prediction based on Experience

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.linear_model import LinearRegression  # To create and train a Linear Regression model
from sklearn.metrics import mean_squared_error  # To evaluate the model's performance
from sklearn.impute import SimpleImputer  # To handle missing values
from sklearn.metrics import r2_score  # To calculate the R-squared score
import numpy as np

# Step 1
df = pd.read_csv("D:\\CUBICLES\\employee-data-simulation-it-industry\\employee_data.csv") #replace with your file path.
# print(df.head())

# df.info()

# Step 2
X = df[['Experience (Years)']]
y = df['Salary']

# Step 3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Step 4
model = LinearRegression()  
model.fit(X_train, y_train) 

# Step 5
y_pred = model.predict(X_test) 
# print(y_pred)

# Step 6
mse = mean_squared_error(y_test, y_pred)  
formatted_mse = "{:.5f}".format(mse % 100) #Modulus 100 to get last 2 digits before decimal.
print(f"Mean Squared Error (MSE): {formatted_mse}")

r2 = r2_score(y_test, y_pred) #Added r2 score.
print("R-squared (R2):", r2)



Mean Squared Error (MSE): 12.10690
R-squared (R2): 0.42143674081968385


**Task:**

3. House Price Prediction with Feature Engineering

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.linear_model import LinearRegression  # To create and train a Linear Regression model
from sklearn.metrics import mean_squared_error  # To evaluate the model's performance
from sklearn.impute import SimpleImputer  # To handle missing values
from sklearn.metrics import r2_score  # To calculate the R-squared score
import numpy as np

# Step 1
df = pd.read_csv("D:\\CUBICLES\\house-price-prediction-dataset\\House Price Prediction Dataset.csv") #replace with your file path.
# print(df.head())

# df.info()

# Create new features
df['PricePerRoom'] = df['Price'] / (df['Bedrooms'] + df['Bathrooms'])  # Assuming rooms = bedrooms + bathrooms
df['AgeFactor'] = (2025 - df['YearBuilt']) / 10  # Calculate age based on current year

# Step 2
X = df[['Area','Floors','YearBuilt','PricePerRoom','AgeFactor','Bedrooms','Bathrooms']]  # Features
y = df['Price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4
model = LinearRegression()  
model.fit(X_train, y_train) 

# Step 5
y_pred = model.predict(X_test) 
# print(y_pred)

# Step 6
mse = mean_squared_error(y_test, y_pred)  
formatted_mse = "{:.5f}".format(mse % 100) #Modulus 100 to get last 2 digits before decimal.
print(f"Mean Squared Error (MSE): {formatted_mse}")

r2 = r2_score(y_test, y_pred) #Added r2 score.
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 40.42844
R-squared (R2): 0.7559786670107435
