In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [10]:
# Load datasets
train_data = pd.read_csv('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan/data/train.csv')
test_data = pd.read_csv('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan/data/test.csv')

In [11]:
train_data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [12]:
test_data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [13]:
# Feature Selection
continuous_features = ['LotArea', 'GrLivArea']  # Adjust as needed
categorical_features = ['MSZoning', 'Street']   # Adjust as needed


In [14]:

numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])


In [15]:
# Data Setup: Splitting into train and test sets
X = train_data[continuous_features + categorical_features]
y = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [16]:
# Model Training
model = RandomForestRegressor()
model.fit(X_train_preprocessed, y_train)

In [18]:
# Predictions
y_pred = model.predict(X_test_preprocessed)


In [19]:
y_pred

array([133283.10714286, 301675.65      , 102722.        , 168354.        ,
       212721.15      ,  80540.        , 292684.16      , 153895.        ,
        81440.        , 150795.14      , 176775.64      , 125932.5       ,
       126914.        , 205198.12      , 189336.18      , 112276.5       ,
       177069.        , 151546.37      ,  99885.61571429, 178115.        ,
       210052.77      , 203959.36      , 175567.4       , 109400.6       ,
       182032.2       , 135470.84666667, 156651.34      , 134080.5       ,
       194531.        , 184468.8       , 167273.        , 236850.74      ,
       257713.95      , 123701.        , 200177.        , 129269.        ,
       180421.53      , 200775.92      , 325767.23      , 116192.        ,
       110064.75      , 259847.92      , 120162.        , 281444.66      ,
       132760.5       , 136102.9       , 137282.        , 125023.8       ,
       265090.72      , 192150.        , 127433.68333333, 197780.41      ,
       116032.04      , 4

In [20]:
# Model Evaluation
def compute_rmse(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmse = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmse, precision)

In [21]:
rmse_score = compute_rmse(np.log(y_test), np.log(y_pred))
print("Root-Mean-Squared-Error (RMSE) Score:", rmse_score)

Root-Mean-Squared-Error (RMSE) Score: 0.02


In [22]:
actual_features = test_data[continuous_features + categorical_features]
prepared_actual_set = preprocessor.transform(actual_features)


In [23]:
actual_prediction_raw = model.predict(prepared_actual_set)
submission_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': actual_prediction_raw})