In [None]:
# Q.39 - Develop a linear regression model to predict house prices based on various features. You will use a
# dataset that contains information about houses, including features such as the number of bedrooms,
# square footage, and location. You will evaluate the model's performance using metrics such as Mean
# Absolute Error (MAE), Mean Squared Error (MSE), and R-squared. Show the performance of the same model
# when a PCA-reduced dataset is used.

# Import required libraries
import pandas as pd                      # for data handling
import numpy as np                       # for numerical operations
from sklearn.model_selection import train_test_split   # for splitting data
from sklearn.linear_model import LinearRegression       # linear regression model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # evaluation metrics
from sklearn.preprocessing import StandardScaler        # for feature scaling
from sklearn.decomposition import PCA                   # for dimensionality reduction

# Read the housing data from disk (swap in the real file name as needed)
df = pd.read_csv("house_data.csv")

# Quick look at the first rows to confirm the columns loaded as expected
print(df.head())

# Predictors: every column except the target, with categorical columns
# one-hot encoded (first level of each dropped).
X = pd.get_dummies(df.drop(columns="price"), drop_first=True)

# Target: the house price column
y = df["price"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the linear regression and fit it on the training rows in one step
# (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: MAE, MSE and R², in that order
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("Without PCA")
print("MAE:", mae)
print("MSE:", mse)
print("R2 Score:", r2)

# ------------------ PCA VERSION ------------------
# The train/test split created above for the baseline model is reused here,
# so both models are trained and scored on exactly the same rows.

# Scale features before PCA.
# The scaler is fitted on the training rows only. Fitting it on the full
# dataset would leak test-set means/variances into training and make the
# reported test metrics optimistic.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # learn mean/std from train rows
X_test_scaled = scaler.transform(X_test)         # apply the train statistics to test rows

# Apply PCA, again fitted on the training rows only.
# A float n_components keeps the smallest number of components whose
# cumulative explained variance reaches that fraction (here 95%).
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)  # learn components from train rows
X_test_pca = pca.transform(X_test_scaled)        # project test rows onto them

# Targets are unchanged by PCA; the aliases keep the *_pca names available.
y_train_pca, y_test_pca = y_train, y_test

# Full-dataset views, transformed with the train-fitted scaler and PCA.
# Not used for training or scoring below; kept so the names X_scaled and
# X_pca remain defined for any later code that reads them.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Train linear regression on the PCA-reduced training data
model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train_pca)

# Predict on the PCA-reduced test data
y_pred_pca = model_pca.predict(X_test_pca)

# Evaluate the PCA model with the same metrics as the baseline
mae_pca = mean_absolute_error(y_test_pca, y_pred_pca)  # MAE
mse_pca = mean_squared_error(y_test_pca, y_pred_pca)   # MSE
r2_pca = r2_score(y_test_pca, y_pred_pca)              # R²

print("\nWith PCA")
print("MAE:", mae_pca)
print("MSE:", mse_pca)
print("R2 Score:", r2_pca)


In [None]:
# Q.51 - Consider Salary.csv, with columns years_of_experience and salary. Write Python code to fit the
# best-fit simple linear regression with independent variable years_of_experience and dependent variable
# salary. Use the complete dataset to train the model. Plot: the fitted model along with the training data
# points, and the residual plot.

# Import required libraries
import pandas as pd                      # for data handling
import numpy as np                       # for numerical operations
import matplotlib.pyplot as plt          # for plotting
from sklearn.linear_model import LinearRegression  # linear regression model

# Read the salary data from disk
df = pd.read_csv("Salary.csv")

# Predictor is kept as a one-column DataFrame (2-D input for scikit-learn);
# the response is a plain Series.
X = df[['years_of_experience']]
y = df['salary']

# Fit the simple linear regression on the complete dataset
# (fit() returns the estimator itself).
model = LinearRegression().fit(X, y)

# Fitted salaries for the same rows the model was trained on
y_pred = model.predict(X)

# Residuals: observed salary minus fitted salary
residuals = y - y_pred

# Figure 1: observed points with the fitted regression line drawn over them
fig_fit, ax_fit = plt.subplots()
ax_fit.scatter(X, y)
ax_fit.plot(X, y_pred)
ax_fit.set_xlabel("Years of Experience")
ax_fit.set_ylabel("Salary")
ax_fit.set_title("Simple Linear Regression: Salary vs Experience")
plt.show()

# Figure 2: residuals against the predictor, with a horizontal reference at zero
fig_res, ax_res = plt.subplots()
ax_res.scatter(X, residuals)
ax_res.axhline(y=0)
ax_res.set_xlabel("Years of Experience")
ax_res.set_ylabel("Residuals")
ax_res.set_title("Residual Plot")
plt.show()
