Name: Saleh Abdallah

In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [31]:
# Load the dataset: Load the training dataset ‘train.csv‘ and display the first 10 rows to get an overview of the features and target variable.
df = pd.read_csv('train.csv')

In [16]:
# Handling Missing Values:
# Inspect the dataset for missing values.
df.isnull().sum()[df.isnull().sum() > 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [17]:
# Impute missing values in the numerical columns using the median.
numerical = df.select_dtypes(include='number').columns
for i in numerical:
    df[i].fillna(df[i].median(), inplace=True)

# For categorical columns, impute missing values with the most frequent category.
categorical = df.select_dtypes(include='object').columns
for i in categorical:
    df[i].fillna(df[i].mode()[0], inplace=True)

print('Missing values is:', df.isnull().sum().sum())

Missing values is: 0


In [18]:
# Feature Scaling:
# Identify the numerical features in the dataset.
# Scale these numerical features using Min-Max scaling.
scaler = MinMaxScaler()
for i in numerical:
    df[i] = scaler.fit_transform(df[[i]])

In [19]:
# Encoding Categorical Features:
# Identify the categorical features in the dataset.
# Apply One-Hot Encoding to these categorical features.
df = pd.get_dummies(df, columns=categorical, drop_first=True)

In [None]:
# Feature Engineering:
# Create a new feature called ‘TotalSF‘ that combines ‘TotalBsmtSF‘, ‘1stFlrSF‘, and ‘2ndFlrSF‘ to represent the total square footage of the house.
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
# Create another feature called ‘Age‘ by subtracting the year the house was built (‘YearBuilt‘) from the year of sale (‘YrSold‘).
df['Age'] = df['YrSold'] - df['YearBuilt']

In [25]:
# Data Leakage Prevention:
# Split the dataset into training and testing sets (80% training, 20% testing).
X = df.drop(['Id', 'SalePrice'], axis=1)
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Explain how you would ensure that no data leakage occurs when handling missing values, scaling, or encoding.
###
# Handling missing values, scaling, or encoding should be applied after data splitting to avoid data leakage through exposing the test data to the model.
###

In [30]:
# Model Training:
# Train a simple linear regression model using the preprocessed features.
lr = LinearRegression()
lr.fit(X, y)
# Report the training and test performance using mean squared error (MSE).
lr_pred = lr.predict(X_test)
# Evaluate the model’s performance on the test set and comment on the results.
mse = round(mean_squared_error(y_test, lr_pred),5)
print('Mean Squared Error (MSE):', mse)
###
# A considerably low MSE impliews that the model predictions are very close to the actual values. However, since the data preprocessing took place prior to data splitting this can lead to inaccurate evaluation results.
###

Mean Squared Error (MSE): 0.00097
