<center><h1 style="color:#CC0099">House-prices-modeling</h1></center>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import sys
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

<h1>Model Building: <span style="color:#6666CC">Model Training</span></h1>

### 1. Dataset loading and splitting into train and test

In [2]:
# List the files in the data directory, one level above the notebook (shell command).
!ls ../data

test.csv  train.csv


In [3]:
# Load the training data through a relative path; do not use an absolute path.
df_master = pd.read_csv('../data/train.csv')
# Work on a copy so df_master keeps the data exactly as it was read.
df = df_master.copy()
# Name of the target column.
label_col = 'SalePrice'

In [4]:
# Hold out 33% of the rows for evaluation; random_state pins the shuffle so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.33, random_state=42)

### 2. Preprocessing and feature engineering of the train set

In [5]:
#df_master.info()

In [6]:
# Predictors retained for modelling; the target column is appended for the preview.
useful_features = [
    'Foundation',
    'KitchenQual',
    'TotRmsAbvGrd',
    'WoodDeckSF',
    'YrSold',
    '1stFlrSF',
]
df = df[[*useful_features, label_col]]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
0,PConc,Gd,8,0,2008,856,208500
1,CBlock,TA,6,298,2007,1262,181500
2,PConc,Gd,6,0,2008,920,223500
3,BrkTil,Gd,7,0,2006,961,140000
4,PConc,Gd,9,192,2008,1145,250000


- Continuous Features: TotRmsAbvGrd, WoodDeckSF, YrSold, 1stFlrSF, SalePrice

- Categorical Features: Foundation, KitchenQual

In [7]:
# Print the path of the Python interpreter the kernel is running on.
print(sys.executable)

/Users/paulayagoesparza/anaconda3/envs/ml/bin/python


- Feature processing

In [8]:
# Summarize dtypes and non-null counts of the reduced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1460 non-null   object
 1   KitchenQual   1460 non-null   object
 2   TotRmsAbvGrd  1460 non-null   int64 
 3   WoodDeckSF    1460 non-null   int64 
 4   YrSold        1460 non-null   int64 
 5   1stFlrSF      1460 non-null   int64 
 6   SalePrice     1460 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 80.0+ KB


In [9]:
# Check for duplicates 
#help(train_df[useful_features].duplicated)

In [10]:
# Count training rows whose feature values repeat an earlier row (the first occurrence is not counted).
train_df[useful_features].duplicated(keep='first').sum()

15

In [11]:
# Replace the shuffled index left by the split with a fresh 0..n-1 index.
# The scaled and encoded frames built later get a default index, so train_df
# must be positional too for the column-wise concat and y_train to line up.
train_df = train_df.reset_index(drop=True)
train_df.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
973,1096,20,RL,78.0,9317,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2007,WD,Normal,176432
974,1131,50,RL,65.0,7804,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,12,2009,WD,Normal,135000
975,1295,20,RL,60.0,8172,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Normal,115000
976,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,6,2007,WD,Normal,189950
977,1127,120,RL,53.0,3684,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2009,WD,Normal,174000


In [12]:
#df_tmp = df[useful_features+[label_col]]
#df_tmp[df_tmp.duplicated(keep=False)]
#df_tmp = df[useful_features+[label_col]]
#df_tmp[df_tmp[useful_features].duplicated(keep=False)]
#df_tmp2 = df_tmp[df_tmp[useful_features].duplicated(keep=False)]
#df_tmp2[(df_tmp2['Foundation'] == 'PConc') & (df_tmp2['KitchenQual'] == 'Gd') & (df_tmp2['TotRmsAbvGrd'] == 6)]

- Scaling continuous features

In [13]:
# Continuous features are listed by hand; the commented line is the dtype-based alternative.
#continuous_columns = train_df[useful_features].select_dtypes(include='number').columns
continuous_columns = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

In [14]:
# Fit the scaler on the training split only; fit() returns the fitted scaler itself.
scaler = StandardScaler().fit(train_df[continuous_columns])

# Apply the fitted scaler to the training split and restore the column names.
scaled_train_columns = scaler.transform(train_df[continuous_columns])
scaled_train_df = pd.DataFrame(scaled_train_columns, columns=continuous_columns)

- Encoding categorical variables

In [15]:
# Categorical features are listed by hand; the commented line is the dtype-based alternative.
#categorical_columns = train_df[useful_features].select_dtypes(include='object').columns
categorical_columns = ['Foundation', 'KitchenQual']

In [16]:
# One-hot encode the categorical features.
# handle_unknown='ignore' makes transform() encode a category that was absent
# from the training split as all zeros instead of raising an error, so the
# hold-out split and new data can be encoded even if they contain rare levels.
encoder = OneHotEncoder(handle_unknown='ignore')

# fit on the train dataset
encoder.fit(train_df[categorical_columns])

# transform on the train dataset (sparse result is densified for the DataFrame)
encoded_train_columns = encoder.transform(train_df[categorical_columns])
encoded_train_df = pd.DataFrame(
    data=encoded_train_columns.toarray(),
    columns=encoder.get_feature_names_out(categorical_columns),
)

In [17]:
# Target vector first, then the design matrix: scaled continuous columns
# followed by the one-hot columns, joined side by side.
y_train = train_df[label_col]
X_train = pd.concat([scaled_train_df, encoded_train_df], axis='columns')

### 3. Model training
Using Linear Regression

In [18]:
# Linear regression with scikit-learn's default settings.
model = LinearRegression()

In [19]:
# Fit the regression on the preprocessed training features and the sale prices.
model.fit(X_train, y_train)

<h1>Model Building: <span style="color:#6666CC">Model Evaluation</span></h1>

### 1. Preprocessing and feature engineering of the test set

In [20]:
# Replace the shuffled index left by the split with a fresh 0..n-1 index so the
# scaled/encoded frames built from test_df line up with it row by row.
test_df = test_df.reset_index(drop=True)

# Number of hold-out rows whose features repeat an earlier row. Kept as the
# cell's last expression so the count is actually displayed.
test_df[useful_features].duplicated(keep='first').sum()

#### Scaling continuous features

In [21]:
# Scale the hold-out continuous features with the scaler fitted on the training split.
scaled_test_columns = scaler.transform(test_df[continuous_columns])
scaled_test_df = pd.DataFrame(scaled_test_columns, columns=continuous_columns)

####  Encoding categorical variables

In [22]:
# Encode the hold-out categorical features with the encoder fitted on the training split.
encoded_test_columns = encoder.transform(test_df[categorical_columns])
encoded_test_df = pd.DataFrame(
    encoded_test_columns.toarray(),
    columns=encoder.get_feature_names_out(categorical_columns),
)

In [23]:
# Target vector first, then the design matrix in the same column order as X_train.
y_test = test_df[label_col]
X_test = pd.concat([scaled_test_df, encoded_test_df], axis='columns')

### 2. Model predictions on the test set 

In [24]:
# Predict sale prices for the hold-out split.
y_pred = model.predict(X_test)

In [25]:
# Floor the predictions at zero: negative values become 0, everything else is unchanged.
y_pred = np.clip(y_pred, 0, None)

### 3. Model evaluation

In [26]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: true target values.
        y_pred: predicted target values.
        precision: number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    mean_sq_log_err = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(mean_sq_log_err), precision)

In [27]:
# RMSLE of the clipped predictions on the hold-out split.
compute_rmsle(y_test, y_pred)

0.22

<h1 style="color:#6666CC">Model inference</h1>

### 1. Reading data from a given file (test.csv file in your case)

In [28]:
# Load the data to run inference on, via a relative path.
test_data = pd.read_csv('../data/test.csv')

### 2. Preprocessing and feature engineering of this data

In [29]:
# Keep only the model's input features (this overwrites the frame loaded from test.csv).
test_data = test_data[useful_features]
# Inspect dtypes and non-null counts to spot missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1459 non-null   object
 1   KitchenQual   1458 non-null   object
 2   TotRmsAbvGrd  1459 non-null   int64 
 3   WoodDeckSF    1459 non-null   int64 
 4   YrSold        1459 non-null   int64 
 5   1stFlrSF      1459 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 68.5+ KB


In [30]:
# Impute missing values with statistics learned from the training split, not
# from the data being scored: the scaler, the encoder and the model were all
# fitted on train_df, so the fill values must come from the same distribution
# and must not change with whatever batch is passed in at inference time.

# Categorical columns -> most frequent training value.
fill_values = {col: train_df[col].mode().iloc[0] for col in categorical_columns}
# Continuous columns -> training mean.
fill_values.update({col: train_df[col].mean() for col in continuous_columns})

# A dict passed to fillna applies one fill value per column and returns a new
# frame; columns without missing values are left untouched.
test_data = test_data.fillna(value=fill_values)

In [31]:
# Fresh 0..n-1 index so the frames assembled below align row by row.
test_data = test_data.reset_index(drop=True)

# Continuous features: apply the scaler fitted on the training split.
scaled_test_data_columns = scaler.transform(test_data[continuous_columns])
scaled_test_data = pd.DataFrame(scaled_test_data_columns, columns=continuous_columns)

# Categorical features: apply the encoder fitted on the training split.
encoded_test_data_columns = encoder.transform(test_data[categorical_columns])
encoded_test_data = pd.DataFrame(
    encoded_test_data_columns.toarray(),
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Final design matrix: scaled columns followed by the one-hot columns.
test_dataset = pd.concat([scaled_test_data, encoded_test_data], axis='columns')

### 3. Predicting the house prices of this data

In [36]:
# Predict sale prices for the preprocessed inference data.
predictions = model.predict(test_dataset)

Object persistence: the encoder, the scaler and the model.
All the objects used in the transformation have to be stored locally.
Model directory on GitHub!

In [None]:
# .py file
# inside a folder

def build_model(dataframe):
    """Placeholder for the model-building routine; not implemented yet.

    The function currently does no work and returns ``None``.

    Args:
        dataframe: input data the model is meant to be built from (unused so far).
    """
    # Categorical features come first.
    # Define them by hand: some numeric columns are really categorical.
    df_categorical_features = None  # TODO: select the hand-picked categorical columns
    pass


# always activate the environment!!


folder notebooks = driver.ipynb wilflow pipiline-extraction modeling




In [None]:
# sys is already imported in the first cell; importing it again is harmless.
import sys
# Display the list of directories Python searches for modules.
sys.path

# sys.path is Python's module search path, so Python won't find files that are one directory above the notebook

In [None]:
# Add the parent directory to the module search path so modules one level above
# the notebook can be imported. Re-running this cell appends the entry again.
sys.path.append('..')

In [None]:
# Display the search path again to check that '..' is now listed.
sys.path

When there is an error in a function, to avoid having to restart the kernel:

In [None]:
# Load IPython's autoreload extension and use mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# TODO: improve this with the code shared on Teams