## GET LIBS + DATA

In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pandas as pd
import numpy as np
import copy

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning, module="numpy")

In [2]:
df = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [3]:
train = df.copy()
test = df_test.copy()

train_objs_num = len(train)
dataset = pd.concat(objs=[train, test], axis=0)
dataset = pd.get_dummies(dataset)
train = copy.copy(dataset[:train_objs_num])
test = copy.copy(dataset[train_objs_num:])

df = train
df_test = test

##
## PREPROCESS PIPELINE

1. **ftiakse 2 diaforetika pipelines**
2. **apomonwse numerical kai categorical cols**
3. **ftiakse 1 pipeline pou enwnei ta num_pipe + cat_pipe me condition na ta kanei apply stis sthles pou prepei**
4. **apply pipeline se df**

In [4]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [5]:
numerical_columns = df.select_dtypes(include=['float64','int64']).columns
categorical_columns = df.select_dtypes(include=['object','category']).columns

# because SalePrice is the target
numerical_columns = numerical_columns.drop('SalePrice', errors='ignore')

In [6]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],
    remainder = 'passthrough'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [7]:
X = df.drop('SalePrice', axis=1)

# AN DEN KANW DROP TO SALEPRICE APO NUMERICAL_COLUMNS ERROR
X_preprocessed = pipeline.fit_transform(X)
y = np.log(df['SalePrice'])

##
## MODELLING

In [9]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

### chat gpt can create this block
1. im using these models
2. create parameter grid
3. train the grids + produce best results and best parameters

In [11]:
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 100],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100,200,500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10]
    }
}

cv = KFold(n_splits=3, shuffle=True, random_state=42)

### train and tune models

In [12]:
grids = {}

for model_name, model in models.items():
    
    print(f'Training and Tuning {model_name}.')
    
    grids[model_name] = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=cv,
        scoring='neg_mean_squared_error'
    )
    grids[model_name].fit(X_train, y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    
    print(f'Best parameters for {model_name} are {best_params}')
    print(f'Best score for {model_name} is {best_score}')
    print()

Training and Tuning LinearRegression.
Best parameters for LinearRegression are {}
Best score for LinearRegression is 6522806961.724952

Training and Tuning RandomForest.
Best parameters for RandomForest are {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best score for RandomForest is 0.15316547438561767

Training and Tuning XGBoost.
Best parameters for XGBoost are {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best score for XGBoost is 0.13691483027194776




**neural network model**
1. build it
2. print it
3. evaluate

In [13]:
from sklearn.neural_network import MLPRegressor

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

mlp1 = MLPRegressor(
    random_state=42,
    max_iter=10000,
    n_iter_no_change=3,
    learning_rate_init=0.001
)

param_grid = {
    'hidden_layer_sizes': [(10,), (10, 10), (10, 10, 10), (25)],
    'activation': ['relu','tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant','invscaling', 'adaptive']
}

mlp2 = GridSearchCV(
    mlp1,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=1,
    verbose=1
)

mlp2.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [14]:
print('Best parameters found for Neural Network Model are:\n', mlp2.best_params_)

best_score = np.sqrt(-1 * mlp2.best_score_)
print('Best score for Neural Network Model is: ', best_score)

Best parameters found for Neural Network Model are:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 25, 'learning_rate': 'constant', 'solver': 'adam'}
Best score for Neural Network Model is:  0.24765541460877707


**tsekare ta modela performance sto y_test**

In [15]:
from sklearn.metrics import mean_squared_error

print('neural network:\t\t', mean_squared_error(mlp2.predict(X_test), y_test))
print('linear regression:\t', mean_squared_error(grids['LinearRegression'].predict(X_test), y_test))
print('random forest:\t\t', mean_squared_error(grids['RandomForest'].predict(X_test), y_test))
print('XGBooster:\t\t', mean_squared_error(grids['XGBoost'].predict(X_test), y_test))

neural network:		 0.040837112641202704
linear regression:	 4.767835526919088e+17
random forest:		 0.021689077548682243
XGBooster:		 0.017681532847905777


##
##
##
##
## FEATURE ENGINEERING
### FTIAKSE PREPROCESS PIPELINE KSANA ALLA ME FEATURE ENGINEERING MESA

In [16]:
from sklearn.preprocessing import FunctionTransformer

def custom_features(df):
    
    df_out = df.copy()
    df_out['PropertyAge'] = df_out['YrSold'] - df_out['YearBuilt']
    df_out['TotalSF'] = df_out['TotalBsmtSF'] + df_out['1stFlrSF'] + df_out['2ndFlrSF']
    df_out['TotalBath'] = df_out['FullBath'] + 0.5 * df_out['HalfBath'] + df_out['BsmtFullBath'] + 0.5 * df_out['BsmtHalfBath']
    df_out['HasRemodeled'] = (df_out['YearRemodAdd'] != df_out['YearBuilt']).astype(object)
    df_out['Has2ndFloor'] = (df_out['2ndFlrSF'] > 0).astype(object)
    df_out['HasGarage'] = (df_out['GarageArea'] > 0).astype(object)
    df_out['YrSold_cat'] = df_out['YrSold'].astype(object)
    df_out['MoSold_cat'] = df_out['MoSold'].astype(object)
    df_out['YearBuilt_cat'] = df_out['YearBuilt'].astype(object)
    df_out['MSSubClass_cat'] = df_out['MSSubClass'].astype(object)
    
    return df_out

feature_engineering_transformer = FunctionTransformer(custom_features)

1. **exw hdh categorical_columns kai numerical_columns kane se aftes append tis extra**
2. **ksanaftiaxnw to preprocessor me to ColumnTransformer etsi wste na exei mesa oles tis sthles**
3. **vazw sto pipeline kai to FunctionTransformer pou orizei tis sthles. prwta afto sto pipeline giati ekei ftiaxnontai oi nees sthles prwth fora kai meta to preprocessor**
4. **kanw apply pipeline se X**

In [17]:
extra_categoricals = pd.Index(['HasRemodeled', 'Has2ndFloor', 'HasGarage', 'YrSold_cat',
                             'MoSold_cat', 'YearBuilt_cat', 'MSSubClass_cat'])
extra_numericals = pd.Index(['PropertyAge', 'TotalSF', 'TotalBath'])

numerical_columns = df.select_dtypes(include=['float64','int64']).columns.append(extra_numericals)
categorical_columns = df.select_dtypes(include=['object','category']).columns.append(extra_categoricals)
numerical_columns = numerical_columns.drop('SalePrice', errors='ignore')

In [18]:
preprocessor2 = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],
    remainder = 'passthrough'
)

In [19]:
pipeline2 = Pipeline(steps=[
    ('fe', feature_engineering_transformer),
    ('preprocessor2', preprocessor2)
])

In [20]:
X2 = df.drop('SalePrice', axis=1)
y2 = df['SalePrice']
y2 = np.log(y2)

oi extra sthles tha prostethoun mesw tou pipeline ston X2_preprocessed

ara ston X2 den xreiazetai na tis prosthesw

In [21]:
X2_preprocessed = pipeline2.fit_transform(X2)

In [22]:
X2_preprocessed.shape

(1460, 442)

## MODELLING 2.0

In [23]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_preprocessed, y2, test_size=0.2, random_state=42)

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 100],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100,200,500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10]
    }
}

cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [24]:
grids2 = {}

for model_name, model in models.items():
    
    print(f'Training and Tuning {model_name}.')
    
    grids2[model_name] = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=cv,
        scoring='neg_mean_squared_error'
    )
    grids2[model_name].fit(X2_train, y2_train)
    best_params2 = grids2[model_name].best_params_
    best_score2 = np.sqrt(-1 * grids2[model_name].best_score_)
    
    print(f'Best parameters for {model_name} are {best_params2}')
    print(f'Best score for {model_name} is {best_score2}')
    print()

Training and Tuning LinearRegression.
Best parameters for LinearRegression are {}
Best score for LinearRegression is 753882840.8963152

Training and Tuning RandomForest.
Best parameters for RandomForest are {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best score for RandomForest is 0.14825511131098737

Training and Tuning XGBoost.
Best parameters for XGBoost are {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best score for XGBoost is 0.13577082939560242



## EVALUATE 2.0

In [25]:
from sklearn.metrics import mean_squared_error

#print('neural network:\t\t', mean_squared_error(mlp2.predict(X2_test), y2_test))
print('linear regression:\t', mean_squared_error(grids2['LinearRegression'].predict(X2_test), y2_test))
print('random forest:\t\t', mean_squared_error(grids2['RandomForest'].predict(X2_test), y2_test))
print('XGBooster:\t\t', mean_squared_error(grids2['XGBoost'].predict(X2_test), y2_test))

linear regression:	 1.87142927643744e+16
random forest:		 0.021569842324804966
XGBooster:		 0.018420174398776336


In [26]:
df_test_preprocessed = pipeline2.transform(df_test)

In [27]:
#xqboost submission
y_xgboost = grids2['XGBoost'].predict(df_test_preprocessed)
# giati htan log prin to kse-logarw
y_xgboost = np.exp(y_xgboost)

df_xgboost_out = df_test[['Id']].copy()
df_xgboost_out ['SalePrice'] = y_xgboost

In [28]:
#Submit
df_xgboost_out.to_csv('../data/submission.csv',index=False)