# Import library and Load data

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import numpy as np

file_path = "../data/train.csv"
dataset = pd.read_csv(file_path)

# Feature selection

In [2]:
#Continuous features
kitchen_quality_column = 'KitchenQual'
continuous_features = ['LotArea', 'YearBuilt', '1stFlrSF', 'GrLivArea']

#categorical features
kitchen_quality_column = 'KitchenQual'
categorical_features = ['Neighborhood', 'HouseStyle', 'OverallQual', 'OverallCond',kitchen_quality_column]

#Targeted value
targeted = 'SalePrice'

#check for one hot or cardinal encoding
features_for_one_hot_encoding = ['Neighborhood', 'HouseStyle', 'OverallQual', 'OverallCond']
features_for_ordinal_encoding = [kitchen_quality_column]

# Total features extracted from dataset
features = dataset[continuous_features + categorical_features]
target = dataset [targeted]

# Feature processing

In [39]:

# Applying One hot encoder
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoder.fit(features[features_for_one_hot_encoding])
one_hot_encoder = encoder.transform(features[features_for_one_hot_encoding])

# Convert numpy arrays to DataFrames
one_hot_encoded_df = pd.DataFrame(one_hot_encoder, columns=encoder.get_feature_names_out(['Neighborhood', 'HouseStyle', 'OverallQual', 'OverallCond']))

# for cardinal encoder
kitchen_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1}
kitchen_quality_encoded = features[kitchen_quality_column].apply(lambda x: kitchen_quality_dict[x])
kitchen_quality_encoded.head()

kitchen_quality_encoded.apply(lambda x: x * 2).head(15)
features[kitchen_quality_column].apply(lambda x: x.lower()).head(15)

# Scaling
scaler = StandardScaler()
scaler.fit(features[continuous_features])
scaled_data = scaler.transform(features[continuous_features])

# Combine the preprocessed features
scaled_df = pd.DataFrame(data = scaled_data, columns = continuous_features)

X = pd.concat([one_hot_encoded_df, kitchen_quality_encoded, scaled_df], axis=1)


# Model building

## Model training

In [27]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.25, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

## Model evaluation

In [28]:
# Define the function for Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value   
#and the logarithm of the observed sales price
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_error(y_test, y_pred))
    return round(rmsle, precision)

# Calculate the RMSLE for training and testing sets
rmsle_train = compute_rmsle(y_train, y_pred_train)
rmsle_test = compute_rmsle(y_test, y_pred_test)

print(f'Training RMSLE: {rmsle_train}')
print(f'Testing RMSLE: {rmsle_test}')

Training RMSLE: 31469.27
Testing RMSLE: 30707.51


In [29]:
X_train

Unnamed: 0,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,...,OverallCond_5,OverallCond_6,OverallCond_7,OverallCond_8,OverallCond_9,KitchenQual,LotArea,YearBuilt,1stFlrSF,GrLivArea
1023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3,-0.735111,1.117235,0.883345,-0.021823
810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,3,-0.037766,0.090492,0.378759,-0.393039
1384,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2,-0.146006,-1.068734,-1.202278,-0.490127
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2,0.182922,-0.373198,0.671160,-0.177924
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2,-0.076853,-0.439440,0.722912,-0.139851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3,-0.120249,1.150356,0.391697,-0.383521
1130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,-0.271885,-1.433062,0.427923,0.886229
1294,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,2,-0.235003,-0.538802,-0.772733,-1.240174
860,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,3,-0.288121,-1.764269,-0.648527,-0.170310


In [None]:
X_test

In [30]:
y_pred = model.predict(X_test)
y_pred[:5]

array([146518.94931781, 329489.02310858, 105673.40712581, 168191.39365358,
       323842.17924014])

In [31]:
y_pred

array([146518.94931781, 329489.02310858, 105673.40712581, 168191.39365358,
       323842.17924014,  87075.34647248, 247645.9409465 , 137485.96972361,
        86355.777178  , 132222.45556876, 145925.253611  , 127552.70998524,
        82142.77622115, 238526.32271373, 174747.25751783, 140890.54619203,
       201040.15944041, 139830.46410478, 100020.67660571, 205357.02331127,
       156087.78327287, 238834.74925763, 177131.86533206, 126101.83468873,
       199669.3409697 , 165430.73888241, 197351.4411345 , 119003.13974819,
       169248.25401443, 206227.85581371, 115092.56723641, 253532.1098362 ,
       295018.5800515 , 127774.73404964, 259893.94012198, 147528.49811519,
       149898.93888892, 216193.19042803, 342889.45044507, 102008.52706626,
       125699.00210138, 211973.2263634 , 119262.12041776, 312811.61654373,
       113834.48599487, 118513.45823822, 112446.59752521, 130614.68109354,
       376614.56805482, 136701.93687131, 126839.66860083, 177681.5878008 ,
       108441.43795114, 3

In [42]:
X.to_parquet('../data/processed_df.parquet', index=False, engine='pyarrow')

In [5]:
expected_processed_df = pd.read_parquet('../data/processed_df.parquet')
expected_processed_df

Unnamed: 0,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,...,OverallCond_5,OverallCond_6,OverallCond_7,OverallCond_8,OverallCond_9,KitchenQual,LotArea,YearBuilt,1stFlrSF,GrLivArea
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3,-0.207142,1.050994,-0.793434,0.370333
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,2,-0.091886,0.156734,0.257140,-0.482512
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3,0.073480,0.984752,-0.627826,0.515013
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3,-0.096897,-1.863632,-0.521734,0.383659
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,3,0.375148,0.951632,-0.045611,1.299326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2,-0.260560,0.918511,-0.542435,0.250402
1456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2,0.266407,0.222975,2.355701,1.061367
1457,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3,-0.147810,-1.002492,0.065656,1.569647
1458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,3,-0.080160,-0.704406,-0.218982,-0.832788


In [46]:
pd.testing.assert_frame_equal(X, expected_processed_df)

# Dataset Loading and Splitting

In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
file_path = "data/train.csv"
dataset = pd.read_csv(file_path)

# Split the data into train and test sets
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)


# Model Building
## Model Training

In [ ]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# Define preprocessing for numerical and categorical features
numeric_features = ["feature1", "feature2", "feature3"]  # Replace with actual numeric feature names
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ["feature4", "feature5"]  # Replace with actual categorical feature names
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create preprocessing and training pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LinearRegression())])

# Train the model
X_train = train_set.drop('target', axis=1)  # Replace 'target' with actual target column name
y_train = train_set['target']
model.fit(X_train, y_train)


## Model Evaluation

In [ ]:
from sklearn.metrics import mean_squared_error

# Preprocess the test set and make predictions
X_test = test_set.drop('target', axis=1)  # Replace 'target' with actual target column name
y_test = test_set['target']
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


# Model Inference

In [ ]:
# Load inference data
inference_file_path = "data/test.csv"
inference_data = pd.read_csv(inference_file_path)

# Preprocess inference data and make predictions
inference_predictions = model.predict(inference_data)

# Save predictions to a CSV file
inference_data['predictions'] = inference_predictions
inference_data.to_csv("data/inference_predictions.csv", index=False)
