In [24]:
# Nombre: Alejandro Velazco

import pandas as pd
from sklearn.impute import SimpleImputer

# Datos para Clasificación (magic04.data)
filename_classification = './magic04/magic04.data'
column_names = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
data_classification = pd.read_csv(filename_classification, names=column_names)
print(data_classification.head())

# Datos para Regresión (housing.csv)
filename_regression = './housing/housing.csv'
data_regression = pd.read_csv(filename_regression)
imputer = SimpleImputer(strategy="median")
data_numeric = data_regression.drop("ocean_proximity", axis=1)  # Eliminamos columnas no numéricas para la imputación
imputed_data = imputer.fit_transform(data_numeric)
data_imputed = pd.DataFrame(imputed_data, columns=data_numeric.columns)

# Agregar de nuevo la columna ocean_proximity
data_imputed["ocean_proximity"] = data_regression["ocean_proximity"]

# One-hot encoding para la columna ocean_proximity
data_regression = pd.get_dummies(data_imputed, columns=['ocean_proximity'])
print(data_regression.head())

    fLength    fWidth   fSize   fConc  fConc1     fAsym  fM3Long  fM3Trans  \
0   28.7967   16.0021  2.6449  0.3918  0.1982   27.7004  22.0110   -8.2027   
1   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238   -9.9574   
2  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580  -45.2160   
3   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633   -7.1513   
4   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525   21.8393   

    fAlpha     fDist class  
0  40.0920   81.8828     g  
1   6.3609  205.2610     g  
2  76.9600  256.7880     g  
3  10.4490  116.7370     g  
4   4.6480  356.4620     g  
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0  

In [25]:
import time, random, numpy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor



seed = random.seed(time.time())
# Función para simplificar la evaluación de modelos
def evaluate_model(model, X_train, X_test, y_train, y_test, classification=True):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    if classification:
        return accuracy_score(y_test, y_pred)
    else:
        return numpy.sqrt(mean_squared_error(y_test, y_pred))

In [26]:
# Clasificación
X_classification = data_classification[data_classification.columns[:-1]]
Y_classification = data_classification['class']
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_classification, Y_classification, test_size=0.7, random_state=seed)

scaler_class = StandardScaler()
X_train_class = scaler_class.fit_transform(X_train_class)
X_test_class = scaler_class.transform(X_test_class)

# Modelos de clasificación
models_classification = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "Bagging_KNeighborsClassifier": BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=100, max_samples=0.1, random_state=seed),
    "AdaBoost_DecisionTreeClassifier": AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=50)
}

for name, model in models_classification.items():
    acc = evaluate_model(model, X_train_class, X_test_class, y_train_class, y_test_class)
    print(f"{name}: Accuracy = {acc:.4f}")

KNeighborsClassifier: Accuracy = 0.8296
Bagging_KNeighborsClassifier: Accuracy = 0.8149
AdaBoost_DecisionTreeClassifier: Accuracy = 0.8016


In [27]:
X_regression = data_regression.drop('median_house_value', axis=1)
Y_regression = data_regression['median_house_value']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, Y_regression, test_size=0.3, random_state=seed)

scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

# Modelos de regresión
models_regression = {
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=seed),
    "Bagging_DecisionTreeRegressor": BaggingRegressor(estimator=DecisionTreeRegressor(random_state=seed), n_estimators=100, max_samples=0.1, random_state=seed),
    "AdaBoost_DecisionTreeRegressor": AdaBoostRegressor(estimator=DecisionTreeRegressor(random_state=seed), n_estimators=100)
}

for name, model in models_regression.items():
    rmse = evaluate_model(model, X_train_reg, X_test_reg, y_train_reg, y_test_reg, classification=False)
    print(f"{name}: RMSE = {rmse:.4f}")

DecisionTreeRegressor: RMSE = 68498.0946
Bagging_DecisionTreeRegressor: RMSE = 54433.6772
AdaBoost_DecisionTreeRegressor: RMSE = 47341.5859
