In [16]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'home-data-for-ml-course:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F10211%2F111096%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240524%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240524T224016Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D80604e0715dad3f2b61807c658f8277d1a7b68fd00598cfc45437f5dcd75c1c29209760a67f6b3141b35594b358499f8efe1869442776f6686064c31309ee91203538cf65aa298e6bf42add573df527b22b9f9d29477b8661dd0351040164522942b16e5ca3807e92cd9759429fca12f0e0851d7696ad56b6225f2181b79384028308dff647a0ada2ca6c35137d08fe511693fcd7ffc7297d1f4293ea1a4becb196098d78290362fd40edcf7b433af626387ced9e314ffb2fe3a8c0df0e0eac89a0d73e5b95b3e78e142635c658fae3d6c7e9d37ec051a91cd7a5b3162c5f073c1ce028a436435dfa5549a63e7a95871493adf423b505ef42ef07c5d236acc0f,mobile-price-classification:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F11167%2F15520%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240524%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240524T224017Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2372f916db77f7be54decdc4a9a1f56df75c63e1f569fed01d85692dcbd9fcf618a15a4181a4bcb3ecf769ee1497b689240b8449a414cf5427bda4e572948e6722a64f4669513fba1bffd4f090ae98fda98be1ca3ef78b6df36167a3f1055bd990d24d3bc5e149252ca437bda2db935c9cbaf253ef3a85e6696be1262a04a45538fd389126dcf3bf67e830814dbf8907e618f3d3d92c1698ca7fe9f2f1a4b2a92867c5c5f4515ed669660109c5aca0c792071cb88d634e329041b6469b1f5dd85fd3bec7f750e413f60897e31d869c41b62198ff17eb7292fc54db4267b451f32671c970d00506416007aee18e2a1d2731b02fe479a090cf2322400cf30c0c2f,melbourne-housing-snapshot:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2709%2F38454%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240524%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240524T224017Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D143eca6fc43e85667a777e87dbc6ebfcfe1ba8083091793f56b7144d1b65e33f2d02669c06fe58772fded6e8d539146dba93dd01627ac195d24b7f71abe32d1ca5098ae2653613488c5b400c586cddb38e8083f86b42ca54090a750ee4bd11685d21a203a75f2f25ede4cc0a9625794e9c179cbb96ef73e200811017147d0f94cfbdea1ec976a28aaa7441d01ea103e85cc08389b873f805634a9323e8e1d314aa367785de9c81ad0388a9e7e52b78de41ffacc7bc197fef95a41926bbff5529c813f7080822f6d8775dee97c681f3bd73634b456d7a3e5c4cc50208703c7386c5dc087c5b4b3b5ea50bbfdb4195654601415503141e56122818480577762844'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

Downloading home-data-for-ml-course, 395336 bytes compressed
Downloaded and uncompressed: home-data-for-ml-course
Downloading mobile-price-classification, 72340 bytes compressed
Downloaded and uncompressed: mobile-price-classification
Downloading melbourne-housing-snapshot, 461423 bytes compressed
Downloaded and uncompressed: melbourne-housing-snapshot
Data source import complete.


**EXPLORACIÓN BASICA DE DATOS**

Paso 1: cargar datos
Lea el archivo de datos de Iowa en un Pandas DataFrame llamado home_data

In [21]:
import pandas as pd
iowa_file_path = '../input/home-data-for-ml-course/train.csv'

home_data = pd.read_csv(iowa_file_path)

Paso 2: revisar los datos
Utilice el comando que aprendió para ver estadísticas resumidas de los datos. Luego complete las variables para responder las siguientes preguntas

In [22]:
# Print summary statistics in next lin
datos_home = home_data.describe()
print(datos_home)

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [23]:
# What is the average lot size (rounded to nearest integer)?
avg_lot_size = 10517

# As of today, how old is the newest home (current year - the date in which it was built)
newest_home_age = 14

**PRIMER MODELO DE MACHINE LEARNING**

Ejercicios Paso 1: Especificar el objetivo de predicción Seleccione la variable objetivo, que corresponde al precio de venta. Guarde esto en una nueva variable llamada y. Deberá imprimir una lista de las columnas para encontrar el nombre de la columna que necesita.

In [24]:
# print the list of columns in the dataset to find the name of the prediction target
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [25]:
y = home_data.SalePrice

Paso 2: crea X

Ahora creará un DataFrame llamado X que contiene las funciones predictivas.

Como solo desea algunas columnas de los datos originales, primero creará una lista con los nombres de las columnas que desea en X.

Utilizará solo las siguientes columnas en la lista (puede copiar y pegar la lista completa para ahorrar algo de escritura, aunque aún necesitará agregar comillas):

Área del lote

Año de construcción

1er piso SF

2do piso SF

Baño Completo

DormitorioAbvGr

TotRmsAbvGrd

Una vez que haya creado esa lista de características, úsela para crear el DataFrame que usará para ajustarse al modelo.

In [26]:
# Create the list of features below
feature_names = ["LotArea","YearBuilt","1stFlrSF","2ndFlrSF","FullBath","BedroomAbvGr","TotRmsAbvGrd"]

# Select data corresponding to features in feature_names
X = home_data[feature_names]

REVIEW DATA

In [27]:
# Review data
# print description or statistics from X
print(X)

# print the top few lines
print(X.head())

      LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0        8450       2003       856       854         2             3   
1        9600       1976      1262         0         2             3   
2       11250       2001       920       866         2             3   
3        9550       1915       961       756         1             3   
4       14260       2000      1145      1053         2             4   
...       ...        ...       ...       ...       ...           ...   
1455     7917       1999       953       694         2             3   
1456    13175       1978      2073         0         2             3   
1457     9042       1941      1188      1152         2             4   
1458     9717       1950      1078         0         1             2   
1459     9937       1965      1256         0         1             3   

      TotRmsAbvGrd  
0                8  
1                6  
2                6  
3                7  
4                9  
...      

Paso 3: especificar y ajustar el modelo
Cree un DecisionTreeRegressor y guárdelo iowa_model. Asegúrese de haber realizado la importación relevante desde sklearn para ejecutar este comando.

Luego, ajuste el modelo que acaba de crear utilizando los datos en X e y que guardó anteriormente.

In [28]:
from sklearn.tree import DecisionTreeRegressor
#specify the model.
#For model reproducibility, set a numeric value for random_state when specifying the model
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit the model
iowa_model.fit(X,y)

Paso 4: hacer predicciones
Haga predicciones con el comando de predicción del modelo utilizando X como datos. Guarde los resultados en una variable llamada predicciones

In [29]:
predictions = iowa_model.predict(X)
print(predictions)

[208500. 181500. 223500. ... 266500. 142125. 147500.]


**VALIDACIÓN DEL MODELO**

In [30]:
print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


Paso 1: divide tus datos
Utilice la función train_test_split para dividir sus datos.

Déle el argumento random_state=1 para que las funciones de verificación sepan qué esperar al verificar su código.

Recuerde, sus funciones se cargan en DataFrame X y su objetivo se carga en y.

In [31]:
# Import the train_test_split function and uncomment
from sklearn.model_selection import train_test_split

# fill in and uncomment
train_X, val_X, train_y, val_y = train_test_split(X, y , random_state=1)

Paso 2: especificar y ajustar el modelo
Cree un modelo DecisionTreeRegressor y ajústelo a los datos relevantes. Establezca random_state en 1 nuevamente al crear el modelo.

In [32]:
# Specify the model
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit iowa_model with the training data.
iowa_model.fit(train_X, train_y)

Paso 3: hacer predicciones con datos de validación

In [35]:
# Predict with all validation observations
val_predictions = iowa_model.predict(val_X)
print(val_predictions)

[186500. 184000. 130000.  92000. 164500. 220000. 335000. 144152. 215000.
 262000. 180000. 121000. 175900. 210000. 248900. 131000. 100000. 149350.
 235000. 156000. 149900. 265979. 193500. 377500. 100000. 162900. 145000.
 180000. 582933. 146000. 140000.  91500. 112500. 113000. 145000. 312500.
 110000. 132000. 305000. 128000. 162900. 115000. 110000. 124000. 215200.
 180000.  79000. 192000. 282922. 235000. 132000. 325000.  80000. 237000.
 208300. 100000. 120500. 162000. 153000. 187000. 185750. 335000. 129000.
 124900. 185750. 133700. 127000. 230000. 146800. 157900. 136000. 153575.
 335000. 177500. 143000. 202500. 168500. 105000. 305900. 192000. 190000.
 140200. 134900. 128950. 213000. 108959. 149500. 190000. 175900. 160000.
 250580. 157000. 120500. 147500. 118000. 117000. 110000. 130000. 148500.
 148000. 190000. 130500. 127000. 120500. 135000. 168000. 176432. 128000.
 147000. 260000. 132000. 129500. 171000. 181134. 227875. 189000. 282922.
  94750. 185000. 194000. 159000. 279500. 290000. 13

Paso 4: Calcule el error absoluto medio en los datos de validación

In [34]:
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(val_y, val_predictions)

# uncomment following line to see the validation_mae
print(val_mae)

29652.931506849316


**Desajuste y Sobreajuste**

In [36]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

Paso 1: compare diferentes tamaños de árboles
Escriba un bucle que pruebe los siguientes valores para max_leaf_nodes a partir de un conjunto de valores posibles.

Llame a la función get_mae en cada valor de max_leaf_nodes. Almacene la salida de alguna manera que le permita seleccionar el valor de max_leaf_nodes que proporcione el modelo más preciso para sus datos.

In [38]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes

scores = {leaf_node: get_mae(leaf_node, train_X, val_X, train_y, val_y) for leaf_node in candidate_max_leaf_nodes}


# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size =  min(scores, key=scores.get)
print(best_tree_size)

100


Paso 2: Ajustar el modelo usando todos los datos
Ya sabes cuál es el mejor tamaño de árbol. Si fuera a implementar este modelo en la práctica, lo haría aún más preciso utilizando todos los datos y manteniendo ese tamaño de árbol. Es decir, no es necesario que muestre los datos de validación ahora que ha tomado todas las decisiones de modelado.

In [39]:
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes = best_tree_size,random_state=1)

# fit the final model and uncomment the next two lines
final_model.fit(X, y)

**RANDOM FORESTS**

Paso1: Usar Random Forest

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X,train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
melb_preds = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y,melb_preds)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 21857.15912981083
