# Import library and Load data

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import numpy as np


file_path = "../data/train.csv"
dataset = pd.read_csv(file_path)
print("Datasets shape {}".format(dataset.shape))

# display data entries
print(dataset.head(10))

Datasets shape (1460, 81)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   
5   6          50       RL         85.0    14115   Pave   NaN      IR1   
6   7          20       RL         75.0    10084   Pave   NaN      Reg   
7   8          60       RL          NaN    10382   Pave   NaN      IR1   
8   9          50       RM         51.0     6120   Pave   NaN      Reg   
9  10         190       RL         50.0     7420   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1        

# Feature selection

In [2]:
#Continuous features
continuous_features = ['LotArea', 'YearBuilt', '1stFlrSF', 'GrLivArea']

#categorical features
categorical_features = ['Neighborhood', 'HouseStyle', 'OverallQual', 'OverallCond']

#Targeted value
targeted = 'SalePrice'


# Total features extracted from dataset
features = dataset[continuous_features + categorical_features]
target = dataset [targeted] 
print(features, target)

      LotArea  YearBuilt  1stFlrSF  GrLivArea Neighborhood HouseStyle  \
0        8450       2003       856       1710      CollgCr     2Story   
1        9600       1976      1262       1262      Veenker     1Story   
2       11250       2001       920       1786      CollgCr     2Story   
3        9550       1915       961       1717      Crawfor     2Story   
4       14260       2000      1145       2198      NoRidge     2Story   
...       ...        ...       ...        ...          ...        ...   
1455     7917       1999       953       1647      Gilbert     2Story   
1456    13175       1978      2073       2073       NWAmes     1Story   
1457     9042       1941      1188       2340      Crawfor     2Story   
1458     9717       1950      1078       1078        NAmes     1Story   
1459     9937       1965      1256       1256      Edwards     1Story   

      OverallQual  OverallCond  
0               7            5  
1               6            8  
2               7       

# Feature processing

In [3]:
# Initializing the scalers and encoders
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Scaling of continuous features
X_continuous_scaled = scaler.fit_transform(features[continuous_features])

# Encode of categorical features
X_categorical_encoded = encoder.fit_transform(features[categorical_features])

# Combine the preprocessed features
X_preprocessed = np.hstack((X_continuous_scaled, X_categorical_encoded))

# Model training

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, target, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Model evaluation with the competition metric

In [5]:
# Define the function for Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value   
#and the logarithm of the observed sales price
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_error(y_test, y_pred))
    return round(rmsle, precision)

# Calculate the RMSLE for training and testing sets
rmsle_train = compute_rmsle(y_train, y_pred_train)
rmsle_test = compute_rmsle(y_test, y_pred_test)

print(f'Training RMSLE: {rmsle_train}')
print(f'Testing RMSLE: {rmsle_test}')

Training RMSLE: 31129.0
Testing RMSLE: 33021.17


# Predicting and preprocessing for new data 
## For extra learning purpose

In [6]:
def preprocess_new_data(new_data, scaler, encoder):
    # Scale continuous features
    X_continuous_new = new_data[continuous_features]
    X_continuous_new_scaled = scaler.transform(X_continuous_new)
    
    # Encode categorical features
    X_categorical_new = new_data[categorical_features]
    X_categorical_new_encoded = encoder.transform(X_categorical_new)
    
    # Combine the preprocessed features
    X_new_preprocessed = np.hstack((X_continuous_new_scaled, X_categorical_new_encoded))
    
    return X_new_preprocessed

# Prediction
## For extra learning purpose

In [7]:
# Taking test.csv data to predict the SalePrice of first top 20
test_data = pd.read_csv('../data/test.csv')
test_data_top20 = test_data.head(20)

# Preprocess the test data
X_new_preprocessed_top20 = preprocess_new_data(test_data_top20, scaler, encoder)

# Predict SalePrice for the top 20 rows
y_pred_new_top20 = model.predict(X_new_preprocessed_top20)


print(f'Predicted SalePrices of top 20 rows : {y_pred_new_top20}')

Predicted SalePrices of top 20 rows : [125643.57605073 160259.27521063 167255.01649982 185958.02930398
 255893.74782837 175591.11688235 182344.47785411 163784.50833663
 185864.72808295 114162.95178562 179671.00847072  91659.48121201
  89357.43433576 150378.49511674 147591.71206877 371914.37847459
 278998.79561543 350168.25881926 293846.36213854 409498.43120888]
