# Introduction

We are going to use a neural network to predict the value of a house. Because I want the model to train quickly, I first apply SVD (singular value decomposition). This step is not necessary, but because we then use fewer variables, the epochs run very fast.
+ we saw this formula in: Useful_python_functions_ML >>(6) Features relations>>(2) Singular value decomposition
+ this is the link: https://github.com/robertofuentesr/Useful_python_functions_ML/tree/main/(6)%20Features%20relations 

In [1]:
#pip install tensorflow

In [2]:
#pip install --upgrade keras

In [3]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
# new library, we haven't used this one before in this repo
from sklearn.preprocessing import TargetEncoder

# Import Keras and other new libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.decomposition import TruncatedSVD
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [4]:
# Read the data
# This data you can find here: https://www.kaggle.com/c/home-data-for-ml-course/data

# Load the training set, using the 'Id' column as the row index.
X_full = pd.read_csv('train.csv', index_col='Id')

# SalePrice is the target: a row without it cannot be used, so discard it.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])

# Separate the target from the features; X_full keeps its SalePrice column.
y = X_full['SalePrice']
X = X_full.drop(columns=['SalePrice'])



In [5]:
# we saw this formula in: Useful_python_functions_ML >>(6) Features relations>>(2) Singular value decomposition
# this is the link: https://github.com/robertofuentesr/Useful_python_functions_ML/tree/main/(6)%20Features%20relations 
def transforming_svd(X, y):
    """Split the data, preprocess it, and project it onto SVD components.

    The rows are split 80/20 into a training and a test part
    (``random_state=0``). A preprocessing + ``TruncatedSVD`` pipeline is
    fitted on the training part only and then applied to both parts.

    Args:
        X: Feature DataFrame, one row per sample.
        y: Target values aligned with the rows of ``X``.

    Returns:
        A tuple ``(X_svd, X_test_svd, y, y_test)``: the SVD-transformed
        training features, the SVD-transformed test features, the training
        targets and the test targets.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=0)

    # Classify columns with pandas' dtype helper rather than comparing the
    # dtype name with 'object': text columns are not always reported as
    # 'object' (e.g. 'category', 'string' or 'str' dtypes), and such columns
    # must go to the categorical branch, not to the scaler.
    is_numeric = {col: pd.api.types.is_numeric_dtype(X_train[col])
                  for col in X_train.columns}
    numerical_col = [col for col, numeric in is_numeric.items() if numeric]
    categorical_col = [col for col, numeric in is_numeric.items() if not numeric]

    # For SVD it is best to scale all numerical values. Scaling runs before
    # imputation, so the KNN imputer works on the standardized values.
    numerical_transformer = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("imputer", KNNImputer(n_neighbors=3)),
    ])

    # Fill missing categories with the most frequent value, then one-hot
    # encode; categories unseen during fitting are ignored at transform time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical_transformer", numerical_transformer, numerical_col),
            ("categorical_transformer", categorical_transformer, categorical_col),
        ],
        remainder='passthrough')

    # Create the components: as many as there are original feature columns.
    svd = TruncatedSVD(n_components=len(X_train.columns), n_iter=7, random_state=42)

    # Bundle preprocessing and the SVD projection in a single pipeline.
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', svd)])

    # Fit on the training rows only; the test rows are merely transformed.
    X_svd = pipe.fit_transform(X_train)
    X_test_svd = pipe.transform(X_test)

    return X_svd, X_test_svd, y_train, y_test

In [6]:
X_svd,X_test_svd,y, y_test = transforming_svd(X,y)

In [7]:
# Convert to dataframe
def convert_svd_df(X_svd):
    """Wrap an SVD output matrix in a DataFrame with columns svd1, svd2, ...

    Args:
        X_svd: Two-dimensional array with one column per component.

    Returns:
        A DataFrame holding the same values, whose columns are named
        "svd<k>" with k counting up from 1.
    """
    n_components = X_svd.shape[1]
    labels = ["svd" + str(k) for k in range(1, n_components + 1)]
    return pd.DataFrame(X_svd, columns=labels)

X_svd = convert_svd_df(X_svd)
X_test_svd = convert_svd_df(X_test_svd)

In [8]:
# Keep only the leading components. This number was found in a previous
# notebook: it captures about 80% of the variance of the data and works fairly well.
number_components = 31
X_svd = X_svd.iloc[:, :number_components]
X_test_svd = X_test_svd.iloc[:, :number_components]

In [9]:
X_svd.head()

Unnamed: 0,svd1,svd2,svd3,svd4,svd5,svd6,svd7,svd8,svd9,svd10,...,svd22,svd23,svd24,svd25,svd26,svd27,svd28,svd29,svd30,svd31
0,5.178012,4.994241,-0.477955,1.007098,3.695736,-0.394786,-0.457406,-3.141178,-0.100906,0.95418,...,0.903267,1.687794,-0.357427,0.087701,-0.013855,0.82191,0.108754,-1.27787,0.408873,0.07715
1,5.44095,-3.762135,-0.679413,-0.660386,1.928928,-0.073712,-0.85714,-0.27654,-0.444606,0.104564,...,0.13399,0.185997,0.887046,-0.002589,-0.076377,-0.068738,0.409405,-0.056461,-0.265974,0.294951
2,5.454435,-2.945801,-1.658974,0.741066,-0.372962,-0.759533,1.816474,-0.083871,-0.946973,0.359699,...,0.739886,0.313619,0.227438,0.390536,0.260251,-0.258716,0.227238,1.311141,0.231653,0.557195
3,5.47487,4.305689,-2.535842,0.919317,-0.082051,0.526221,0.585428,-0.366155,-0.351103,-0.144899,...,-0.464379,-0.759385,0.17699,-0.212306,-0.796845,-0.633266,0.286464,0.424999,0.203175,0.447677
4,5.654782,4.019248,-0.950836,-0.123134,2.930008,-1.082044,-0.661603,1.635362,0.072457,-0.45623,...,-1.981239,-0.192027,-2.23786,0.636678,-0.268383,0.225547,-0.913481,0.632468,0.470152,0.389773


In [10]:
# Stop early if the training loss goes 60 epochs without improving.
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=60)

# The network takes one input per SVD component and has two hidden layers
# (200 and 300 nodes) followed by a single output node.
model = Sequential([
    # First hidden layer: 200 nodes with ReLU; input_shape declares the inputs.
    Dense(200, input_shape=(X_svd.shape[1],), kernel_initializer='normal', activation='relu'),
    # NOTE(review): no activation is passed to this layer — confirm that a
    # layer without a non-linearity is intended here.
    Dense(300, kernel_initializer='normal'),
    # Output layer: one node.
    Dense(1, kernel_initializer='normal'),
])

# Compile model: mean absolute error as the loss, Adam as the optimizer.
model.compile(loss='mean_absolute_error', optimizer='adam')

# Train silently on the SVD-transformed training data.
history = model.fit(
    X_svd,
    y,
    epochs=500,
    batch_size=256,
    callbacks=[callback],
    verbose=0,
)

# Evaluate the model on the test set, then show how many epochs were run.
print(model.evaluate(X_test_svd, y_test))
print(len(history.history['loss']))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 18903.5176 
20778.15234375
500


In [11]:
model.summary()

In [12]:
model.evaluate(X_test_svd, y_test)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 948us/step - loss: 18903.5176


20778.15234375

We didn't improve our predictions. Maybe a neural network is not the best model to fit on such a small dataset.