
#### Roll no 102103430
Rimjhim Mittal
3CO16



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#### Q1. K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)

Implement a 5-fold cross-validation technique for multiple linear regression using the least square error method on the USA House Price dataset.

**Dataset link** https://drive.google.com/file/d/1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX/view

**Instructions:**

1. **Data Preparation:**
    * Load the USA House Price dataset.
    * Segment the dataset into predictor variables (all columns excluding 'Price') and the target variable ('Price').

2. **Feature Scaling:**
    * Scale the predictor variables to ensure they are on the same scale. This can be achieved using techniques such as MinMax scaling or Standard scaling.

3. **5-Fold Cross-Validation Setup:**
    * Partition the predictor variables and the target variable into five equal subsets or "folds".

4. **Model Training and Evaluation:**
    * For each iteration (total of 5):
        - Treat one fold as the validation set and the remaining four folds as the training set.
        - Compute the beta (\( \beta \)) matrix using the least square error fit method.
        - Predict the output for the validation set using the computed \( \beta \) matrix.
        - Evaluate the performance of the model using the \( R^2 \) score for the validation set.
    * Keep track of the \( \beta \) matrix that yields the highest \( R^2 \) score.

5. **Final Model Assessment:**
    * Use the optimal \( \beta \) matrix (corresponding to the highest \( R^2 \) score) from the cross-validation phase.
    * Train the regressor on 70% of the entire dataset.
    * Test the regressor's performance on the remaining 30% of the dataset.



In [None]:
def load_data(filename):
    """Read the CSV file at ``filename`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(filename)
    return frame

def normalize_data(df):
    """Min-max scale every column of ``df``.

    A fresh ``MinMaxScaler`` is fitted on the whole frame, so each column is
    rescaled using that column's own minimum and maximum.

    Args:
        df: DataFrame whose columns the scaler can process (numeric values).

    Returns:
        A new DataFrame with the same column labels and row index as ``df``
        holding the scaled values.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df)
    # Carry the caller's row labels over; without `index=` the rows would be
    # silently renumbered 0..n-1 and no longer align with the input frame.
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)



In [None]:
def train_sklearn_regression(x_train, y_train, x_test, y_test):
    """Fit sklearn's ``LinearRegression`` and score it on the held-out data.

    Args:
        x_train, y_train: Features and targets used to fit the model.
        x_test, y_test: Features and targets used for evaluation.

    Returns:
        Tuple ``(coefficients, r2, mae)``: the fitted ``coef_`` array, the R2
        score and the mean absolute error of the predictions on ``x_test``.
    """
    model = LinearRegression().fit(x_train, y_train)
    y_hat = model.predict(x_test)
    score = r2_score(y_test, y_hat)
    abs_error = mean_absolute_error(y_test, y_hat)
    return model.coef_, score, abs_error
def gradient_descent_regression(X, Y, learning_rate, iterations):
    """Fit linear-regression weights with full-batch gradient descent.

    Starting from all-zero weights, each iteration moves ``theta`` against
    ``(1/m) * X.T @ (X @ theta - Y)``.

    Args:
        X: 2-D array-like of shape (m, n). No intercept column is added here;
            callers that want a bias term must include a column of ones.
        Y: Array-like with m target values, shaped (m,) or (m, 1).
        learning_rate: Step size applied to every update.
        iterations: Number of updates to perform.

    Returns:
        ``numpy.ndarray`` of shape (n, 1) with the weights after the last
        iteration.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: with a flat (m,) target, `X @ theta - Y` would
    # broadcast to an (m, m) matrix instead of per-sample errors.
    Y = np.asarray(Y, dtype=float).reshape(-1, 1)
    m, n = X.shape
    theta = np.zeros((n, 1))
    for _ in range(iterations):
        errors = X @ theta - Y
        gradients = (1 / m) * X.T @ errors
        theta -= learning_rate * gradients
    return theta




In [None]:

def main():
    """Run the Q1 experiment: sklearn least squares versus gradient descent.

    Loads ``./USA_Housing.csv``, min-max scales every column, holds out 30% of
    the rows as a test set and then drops a further 25% of the remainder, so
    the models are fitted on 52.5% of the rows. Prints the sklearn model's
    R2 score, coefficients and MAE on the test set, followed by the weights
    found by 1000 gradient-descent steps with learning rate 0.01.
    """
    # Load and preprocess the data
    # NOTE(review): scaling happens before the split, so the test rows
    # contribute to the min/max used to scale the training rows.
    df = load_data('./USA_Housing.csv')
    df = normalize_data(df)

    df_train_validate, df_test = train_test_split(df, test_size=0.3, random_state=42)
    # The second split is kept so the training rows stay the same as before;
    # the validation part it produces is not used in this experiment.
    df_train, _ = train_test_split(df_train_validate, test_size=0.25, random_state=42)

    # Separate features and target variable
    features = ['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms',
                'Avg. Area Number of Bedrooms','Area Population']
    x_train = df_train[features]
    y_train = df_train['Price']
    x_test = df_test[features]
    y_test = df_test['Price']

    # Train the model using sklearn
    coef, r2, mae = train_sklearn_regression(x_train, y_train, x_test, y_test)
    print(f"Sklearn Regression R2 Score: {r2}")
    print(f"Sklearn Regression Coefficients: {coef}")
    print(f"Sklearn Regression MAE: {mae}")

    # Train the model using gradient descent
    # A leading column of ones gives the gradient-descent model an intercept,
    # so theta[0] is the bias and theta[1:] line up with `features`.
    X = np.c_[np.ones((x_train.shape[0], 1)), x_train]
    theta = gradient_descent_regression(X, y_train.values.reshape(-1, 1), 0.01, 1000)
    print(f"Gradient Descent Coefficients: {theta}")

# Run the Q1 experiment only when this code is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


Sklearn Regression R2 Score: 0.9147262743223825
Sklearn Regression Coefficients: [0.78972693 0.46379384 0.36821131 0.00512161 0.43001234]
Sklearn Regression MAE: 0.03307267725934341
Gradient Descent Coefficients: [[0.16299955]
 [0.1839815 ]
 [0.14779516]
 [0.11516578]
 [0.06516494]
 [0.14645295]]


#### Q2. Concept of a Validation Set for Multiple Linear Regression (Gradient Descent Optimization)


Evaluate the performance of multiple linear regression using gradient descent with various learning rates on the USA House Price dataset.

**Instructions:**

1. **Data Preparation:**
    * Load the USA House Price dataset.
    * Partition the dataset into:
        - Training set (56%)
        - Validation set (14%)
        - Test set (30%)

2. **Model Training:**
    * Consider the following learning rates: {0.001, 0.01, 0.1, 1}.
    * For each learning rate:
        - Train the multiple linear regression model using gradient descent for 1000 iterations.
        - Record the regression coefficients (\( \beta \) values) after the iterations.

3. **Model Evaluation:**
    * For each set of regression coefficients obtained from different learning rates:
        - Predict the outcomes for the validation set and compute the \( R^2 \) score.
        - Predict the outcomes for the test set and compute the \( R^2 \) score.

4. **Model Selection:**
    * Identify the regression coefficients that give the highest \( R^2 \) score on the validation set.
    * This set of coefficients represents the best model.


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from random import randrange
import warnings
# Blanket filter: every warning raised after this point is ignored.
warnings.filterwarnings('ignore')
# Split the dataset into training and testing sets
def custom_split(data, ratio):
    """Randomly split ``data`` into a training part and a remainder.

    Rows are drawn one at a time without replacement using ``randrange``, so
    seeding the ``random`` module makes the split reproducible.

    Args:
        data: DataFrame to split. It is not modified.
        ratio: Fraction of rows to place in the training part;
            ``int(ratio * len(data))`` rows are drawn.

    Returns:
        Tuple ``(train_set, remainder)``. ``train_set`` holds the drawn rows
        in draw order, ``remainder`` holds the other rows in their original
        order. Both have a fresh 0..k-1 index.

    Raises:
        ValueError: If ``ratio`` asks for more rows than ``data`` contains.
    """
    total_rows = len(data)
    desired_train_size = int(ratio * total_rows)
    if desired_train_size > total_rows:
        raise ValueError(
            f"ratio={ratio!r} requests {desired_train_size} rows "
            f"but data has only {total_rows}"
        )

    # Track row positions instead of growing a DataFrame row by row:
    # DataFrame.append no longer exists in pandas 2.x, and positional
    # selection keeps the column dtypes and works for any index labels.
    remaining = list(range(total_rows))
    chosen = []
    while len(chosen) < desired_train_size:
        chosen.append(remaining.pop(randrange(len(remaining))))

    train_set = data.iloc[chosen].reset_index(drop=True)
    remainder = data.iloc[remaining].reset_index(drop=True)
    return train_set, remainder

# Batch gradient descent for linear regression
def gradient_descent(X, Y, lr, num_iterations):
    """Return regression weights after ``num_iterations`` gradient steps.

    The weights start at zero and have shape ``(X.shape[1], 1)``. Each step
    subtracts ``lr * (1/num_samples) * X.T . (X . weights - Y)``, where
    ``num_samples`` is ``Y.shape[0]``.
    """
    n_rows = Y.shape[0]
    n_features = X.shape[1]
    weights = np.zeros((n_features, 1))

    for _ in range(num_iterations):
        residual = np.dot(X, weights) - Y
        step = lr * ((1/n_rows) * np.dot(X.T, residual))
        weights -= step

    return weights

# Load the housing data and min-max scale every column.
data = pd.read_csv('./USA_Housing.csv')
_column_min = data.min()
normalized_data = (data - _column_min) / (data.max() - _column_min)

# 70% train+validation / 30% test, then 80/20 inside the first part:
# 56% train, 14% validation, 30% test overall.
train_val_set, test_set = custom_split(normalized_data, 0.7)
train_set, val_set = custom_split(train_val_set, 0.8)

features = ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
            'Avg. Area Number of Bedrooms', 'Area Population']


def _design_matrix(frame):
    """Return the feature values of ``frame`` with a leading column of ones."""
    return np.column_stack((np.ones(len(frame)), frame[features].values))


def _target_column(frame):
    """Return the 'Price' values of ``frame`` as an (n, 1) array."""
    return frame['Price'].values.reshape(-1, 1)


X_train, y_train = _design_matrix(train_set), _target_column(train_set)
X_val, y_val = _design_matrix(val_set), _target_column(val_set)
X_test, y_test = _design_matrix(test_set), _target_column(test_set)

# Fit once per learning rate and keep the coefficients that score the
# highest R2 on the validation set.
learning_rates = [0.001, 0.01, 0.1, 1]
best_r2_val = float('-inf')
best_coefficients = None

for lr in learning_rates:
    coeffs = gradient_descent(X_train, y_train, lr, 1000)
    predictions_val = X_val.dot(coeffs)
    r2_val = r2_score(y_val, predictions_val)

    improved = r2_val > best_r2_val
    if improved:
        best_r2_val, best_coefficients = r2_val, coeffs

# Score the selected coefficients on the untouched test set.
predictions_test = X_test.dot(best_coefficients)
r2_test = r2_score(y_test, predictions_test)

print("R2 Score on Test Set:", r2_test)
print("Best R2 Score on Validation Set:", best_r2_val)
print("Best Regression Coefficients:", best_coefficients)


R2 Score on Test Set: 0.8136488565357357
Best R2 Score on Validation Set: 0.8050844247916222
Best Regression Coefficients: [[-0.21189417]
 [ 0.49591645]
 [ 0.33031092]
 [ 0.20804856]
 [ 0.02456285]
 [ 0.30067222]]


#### Q3. Pre-processing and Multiple Linear Regression


Predict car prices using linear regression, with and without dimensionality reduction through PCA, on the provided dataset.

**Dataset link** https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

**Instructions:**

1. **Data Loading and Cleaning:**
    * Load the dataset using the provided column names: ["symboling", "normalized_losses", ... , "price"]
    * Replace all "?" entries with NaN values for consistency.

2. **Data Imputation:**
    * For the 'price' column, remove any rows containing NaN values.
    * For all other columns, replace NaN values using central tendency imputation methods (like mean or median).

3. **Data Encoding:**
    * For the "num_doors" and "num_cylinders" columns, convert textual number representations to their numeric equivalents (e.g., "two" to 2).
    * Apply dummy encoding for columns "body_style" and "drive_wheels".
    * Use label encoding for "make", "aspiration", "engine_location", and "fuel_type" columns.
    * For the "fuel_system" column, assign a value of 1 if the entry contains the string "pfi", otherwise assign 0.
    * For the "engine_type" column, assign a value of 1 if the entry contains the string "ohc", otherwise assign 0.

4. **Feature Selection and Scaling:**
    * Segregate the dataset into predictor variables (all columns excluding 'price') and the target variable ('price').
    * Normalize the predictor variables to ensure all are on a similar scale.

5. **Linear Regression - Training and Evaluation:**
    * Partition the dataset: use 70% for training and 30% for testing.
    * Train a linear regression model using the training set and evaluate its performance on the test set.

6. **Dimensionality Reduction and Model Retraining:**
    * Implement PCA (Principal Component Analysis) to reduce the dimensionality of the feature set.
    * Retrain the linear regression model using 70% of the transformed data and test its performance on the remaining 30%.
    * Compare the performance of the model trained with PCA-transformed data to the one trained on the original data to determine if there's an improvement in prediction accuracy.



In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error

# Lookup table from spelled-out numbers to their integer values
number_mapping = dict(
    one=1, two=2, three=3, four=4, five=5,
    six=6, seven=7, eight=8, nine=9, twelve=12,
)


def word_to_num(word):
    """Return the integer for a spelled-out number, or ``word`` itself if it is not in the table."""
    try:
        return number_mapping[word]
    except KeyError:
        return word

def load_data(filename='./imports-85.data'):
    """Load the automobile data set and drop rows without a price.

    Args:
        filename: Path of the comma-separated data file. Defaults to
            ``./imports-85.data``.

    Returns:
        DataFrame with the 26 named columns, ``"?"`` entries read as NaN and
        every row whose ``price`` is missing removed. The surviving rows keep
        their original row labels.
    """
    column_names = [
        "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
        "num_doors", "body_style", "drive_wheels", "engine_location",
        "wheel_base", "length", "width", "height", "curb_weight",
        "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore",
        "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
        "highway_mpg", "price",
    ]
    # The UCI imports-85.data file has no header line. With read_csv's default
    # header handling the first car would be consumed as column labels and
    # lost, so supply the names explicitly and declare that there is no header.
    df = pd.read_csv(filename, header=None, names=column_names, na_values="?")

    return df.dropna(subset=['price'])

def encode_non_numeric_data(df):
    """Turn the textual columns of the car data into numeric columns.

    Steps, in order:
      * ``num_doors`` / ``num_cylinders``: spelled-out numbers are mapped
        through ``word_to_num``.
      * ``body_style`` / ``drive_wheels``: replaced by dummy (one-hot) columns.
      * ``make``, ``aspiration``, ``engine_location``, ``fuel_type``: label
        encoded, each column with its own ``LabelEncoder``.
      * ``fuel_system`` -> 1 if the text contains ``"pfi"``, else 0.
      * ``engine_type`` -> 1 if the text contains ``"ohc"``, else 0.

    Args:
        df: DataFrame containing the columns named above. It is not modified.

    Returns:
        A new DataFrame with the encodings applied.
    """
    # Work on a copy so the column assignments below do not leak into the
    # frame the caller passed in.
    df = df.copy()

    # Convert word numbers to actual numbers
    for col in ('num_doors', 'num_cylinders'):
        df[col] = df[col].apply(word_to_num)

    # One-hot encode
    df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'])

    # Label encode
    for col in ["make", "aspiration", "engine_location", "fuel_type"]:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Binary encode based on presence of specific substrings. The isinstance
    # guard maps non-text cells (e.g. NaN) to 0 instead of raising TypeError.
    df['fuel_system'] = df['fuel_system'].apply(
        lambda x: 1 if isinstance(x, str) and 'pfi' in x else 0)
    df['engine_type'] = df['engine_type'].apply(
        lambda x: 1 if isinstance(x, str) and 'ohc' in x else 0)

    return df

def impute_data(df):
    """Fill missing entries with each column's most frequent value.

    Returns a new DataFrame built from the imputer's output with the column
    labels of ``df``; the row index of ``df`` is not passed on to it.
    """
    mode_imputer = SimpleImputer(strategy="most_frequent")
    filled_values = mode_imputer.fit_transform(df)
    return pd.DataFrame(filled_values, columns=df.columns)

def train_and_evaluate(df):
    """Fit a linear regression on ``df`` and return its test-set MAE.

    Every column except ``price`` is a predictor. Predictors and target are
    min-max scaled over all rows, then split 70/30 with ``random_state=0``.
    The returned mean absolute error is therefore measured on the scaled
    price, not on the raw price.
    """
    predictors = df.drop("price", axis=1)
    target = df[["price"]]

    # Independent scalers for predictors and target.
    predictors_scaled = MinMaxScaler().fit_transform(predictors)
    target_scaled = MinMaxScaler().fit_transform(target)

    X_train, X_test, Y_train, Y_test = train_test_split(
        predictors_scaled, target_scaled, train_size=0.7, random_state=0)

    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

    return mean_absolute_error(Y_test, regressor.predict(X_test))

def main():
    """Run the Q3 experiment: linear regression with and without PCA.

    Loads, imputes and encodes the car data, prints the MAE of a model trained
    on all predictors, then projects the predictors onto 20 principal
    components and prints the MAE of a model trained on those instead.
    """
    cars = encode_non_numeric_data(impute_data(load_data()))

    # Evaluate model with original data
    error_original = train_and_evaluate(cars)
    print(f"MAE with original data: {error_original}")

    # Apply PCA
    n_components = 20
    component_names = [f"PC{i}" for i in range(1, n_components + 1)]
    projected = PCA(n_components=n_components).fit_transform(cars.drop("price", axis=1))
    cars_pca = pd.DataFrame(projected, columns=component_names)
    # `.values` attaches the prices by position rather than by index label.
    cars_pca["price"] = cars["price"].values

    # Evaluate model with PCA data
    error_pca = train_and_evaluate(cars_pca)
    print(f"MAE with PCA data: {error_pca}")

# Run the Q3 experiment only when this code is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


MAE with original data: 0.07057604698276639
MAE with PCA data: 0.07352461241675472
