<a href="https://colab.research.google.com/github/inspire007/KaggleCompetitions/blob/main/house-prices-advanced-regression-techniques/house_prices_advanced_regression_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Load the labelled training data from the working directory.
data = pd.read_csv('train.csv')

# Positional split: the last column is the target, the first column is left
# out of the features (presumably a row identifier - confirm against the CSV),
# and everything in between is used as a feature.
y = data.iloc[:, -1]
X = data.iloc[:, 1:-1]

# Hold out a quarter of the rows for model comparison; the fixed seed keeps
# the split reproducible between runs.
X_train_o, X_test_o, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Columns are routed to a sub-pipeline by dtype rather than by name.
num_cols = selector(dtype_include='number')
cat_cols = selector(dtype_include='object')

# Numeric columns: fill gaps with the column mean, then standardise.
num_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored instead of raising.
cat_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Any column matched by neither selector is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_p', num_pipe, num_cols),
        ('cat_p', cat_pipe, cat_cols),
    ],
    remainder='passthrough',
)

# Fit on the training split only, then apply the fitted transform to the
# hold-out split so no hold-out statistics leak into preprocessing.
X_train = preprocessor.fit_transform(X_train_o)
X_test = preprocessor.transform(X_test_o)

# Candidate regressors compared on the hold-out split. Seeds are fixed on the
# tree-based models so repeated runs give the same scores.
models = {
    'RandomForest': RandomForestRegressor(n_estimators=500, random_state=42),
    'DecTree': DecisionTreeRegressor(random_state=42),
    'SVR': SVR(kernel='rbf'),
    'linearR': LinearRegression(),
}

# R^2 has no lower bound (a model can score below zero), so the running best
# starts at -inf rather than 0. Seeding with 0 would leave nothing selected
# whenever every candidate scores <= 0.
max_acc = float('-inf')
selected_model = None

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} model r2 score: {r2}\n')
    # Strict comparison: on a tie the earlier candidate is kept.
    if r2 > max_acc:
        max_acc = r2
        selected_model = model

# Fail here with a clear message instead of later on an attribute lookup.
# This is only reachable if `models` is empty or every score is NaN.
if selected_model is None:
    raise RuntimeError('No candidate model produced a comparable r2 score.')

print('Selected model: ', selected_model, '\nMax accuracy: ', max_acc)

# Refit the preprocessing and the chosen model on every labelled row before
# predicting. NOTE: `X` is rebound here from the raw feature frame to its
# transformed representation.
X = preprocessor.fit_transform(X)
selected_model.fit(X, y)

# NOTE: `data` is rebound from the training frame to the unlabelled test set.
data = pd.read_csv('test.csv')

# The first column is kept aside as the identifier written to the output;
# the remaining columns go through the fitted preprocessor.
pid = data.iloc[:, 0].values
X_test_actual = preprocessor.transform(data.iloc[:, 1:])

y_pred_actual = selected_model.predict(X_test_actual)

# Two-column CSV: integer id and prediction to three decimals. `comments=''`
# stops numpy from prefixing the header line with '# '.
np.savetxt(
    'housing_output.csv',
    np.column_stack((pid, y_pred_actual)),
    fmt=['%d', '%.3f'],
    delimiter=',',
    header='Id,SalePrice',
    comments='',
)


RandomForest model r2 score: 0.8956128641464159

DecTree model r2 score: 0.7777274383941502

SVR model r2 score: -0.02520067056392561

linearR model r2 score: 0.8876326710509317

Selected model:  RandomForestRegressor(n_estimators=500, random_state=42) 
Max accuracy:  0.8956128641464159
