In [None]:
!pip install sweetviz
!pip install h2o

Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=de3e0d27a88b4fad208059bfba9ba06add4788b3f21d80cb18ef0f7576c3992f
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Data Preprocessing**

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/gdrive/MyDrive/CMPE-255/Assignment 5/Data Set/Task_1.csv')


In [None]:
# Display the first few rows of the dataset
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Data Imputation

# Fill numerical columns with their median
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with their mode
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Check if there are any remaining missing values
missing_values_after_imputation = df.isnull().sum().sum()

missing_values_after_imputation


0

**Anomaly elimnations**


In [None]:
# Anomaly Detection using IQR

def detect_outliers(dataframe, column):
    """Detect outliers using IQR."""
    Q1 = dataframe[column].quantile(0.25)
    Q3 = dataframe[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    return dataframe[(dataframe[column] < lower_bound) | (dataframe[column] > upper_bound)]

# Detecting outliers in all numerical columns
outliers_detected = {}
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    outliers = detect_outliers(df, col)
    if not outliers.empty:
        outliers_detected[col] = len(outliers)

outliers_detected

{'MSSubClass': 103,
 'LotFrontage': 106,
 'LotArea': 69,
 'OverallQual': 2,
 'OverallCond': 125,
 'YearBuilt': 7,
 'MasVnrArea': 98,
 'BsmtFinSF1': 7,
 'BsmtFinSF2': 167,
 'BsmtUnfSF': 29,
 'TotalBsmtSF': 61,
 '1stFlrSF': 20,
 '2ndFlrSF': 2,
 'LowQualFinSF': 26,
 'GrLivArea': 31,
 'BsmtFullBath': 1,
 'BsmtHalfBath': 82,
 'BedroomAbvGr': 35,
 'KitchenAbvGr': 68,
 'TotRmsAbvGrd': 30,
 'Fireplaces': 5,
 'GarageYrBlt': 1,
 'GarageCars': 5,
 'GarageArea': 21,
 'WoodDeckSF': 32,
 'OpenPorchSF': 77,
 'EnclosedPorch': 208,
 '3SsnPorch': 24,
 'ScreenPorch': 116,
 'PoolArea': 7,
 'MiscVal': 52,
 'SalePrice': 61}

In [None]:
# Capping outliers using IQR boundaries

def cap_outliers(dataframe, column):
    """Cap outliers using IQR boundaries."""
    Q1 = dataframe[column].quantile(0.25)
    Q3 = dataframe[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    dataframe[column] = dataframe[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))
    return dataframe

# Capping outliers in detected columns
for col in outliers_detected.keys():
    df = cap_outliers(df, col)

# Re-checking for outliers after capping
outliers_after_capping = {}
for col in outliers_detected.keys():
    outliers = detect_outliers(df, col)
    if not outliers.empty:
        outliers_after_capping[col] = len(outliers)

outliers_after_capping

{}

**Feature Processing**

In [None]:
# Feature Processing using One-Hot Encoding
df_encoded = pd.get_dummies(df, drop_first=True)

# Displaying the shape after encoding and the first few rows of the encoded dataset
encoded_shape = df_encoded.shape
df_encoded_head = df_encoded.head()

encoded_shape, df_encoded_head


((1460, 247),
    Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
 0   1        60.0         65.0   8450.0          7.0          5.0     2003.0   
 1   2        20.0         80.0   9600.0          6.0          7.5     1976.0   
 2   3        60.0         68.0  11250.0          7.0          5.0     2001.0   
 3   4        70.0         60.0   9550.0          7.0          5.0     1915.0   
 4   5        60.0         84.0  14260.0          8.0          5.0     2000.0   
 
    YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_ConLI  SaleType_ConLw  \
 0          2003       196.0       706.0  ...               0               0   
 1          1976         0.0       978.0  ...               0               0   
 2          2002       162.0       486.0  ...               0               0   
 3          1970         0.0       216.0  ...               0               0   
 4          2000       350.0       655.0  ...               0               0   
 
    SaleTy

**Feature Selection**

In [None]:
# Feature Selection based on correlation with SalePrice

# Calculate correlation with SalePrice
correlation = df_encoded.corr()['SalePrice'].sort_values(ascending=False)

# Select features with a correlation higher than 0.5 (in absolute value) with SalePrice
selected_features = correlation[correlation.abs() > 0.5].index.tolist()

# Excluding the target variable itself
selected_features.remove('SalePrice')

selected_features


['OverallQual',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'FullBath',
 'YearBuilt',
 'YearRemodAdd',
 'TotRmsAbvGrd',
 'Foundation_PConc',
 'ExterQual_Gd',
 'BsmtQual_TA',
 'KitchenQual_TA',
 'GarageFinish_Unf',
 'ExterQual_TA']

Used the sweetviz autoEDA to Perform the EDA
**bold text**

In [None]:
import sweetviz as sv

report = sv.analyze(df)
report.show_html('/content/gdrive/MyDrive/CMPE-255/Assignment 5/Data Set/report.html')

                                             |          | [  0%]   00:00 -> (? left)

Report /content/gdrive/MyDrive/CMPE-255/Assignment 5/Data Set/report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


**Used the H2O Auto ML to build various ml models**

In [None]:
import h2o

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpg10_c0if
  JVM stdout: /tmp/tmpg10_c0if/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpg10_c0if/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,13 days
H2O_cluster_name:,H2O_from_python_unknownUser_xmqzey
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
from h2o.automl import H2OAutoML

# Convert DataFrame to H2O Frame
hf = h2o.H2OFrame(df)

# Split the data
train, valid, test = hf.split_frame([0.7, 0.15], seed=1234)

# Define X and y
y = 'SalePrice'
X = hf.columns
X.remove(y)

# Run AutoML
aml = H2OAutoML(max_runtime_secs=600, seed=1)
aml.train(x=X, y=y, training_frame=train, validation_frame=valid)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
07:06:59.426: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
07:06:59.592: _train param, Dropping bad and constant columns: [KitchenAbvGr, Utilities, BsmtFinSF2, LowQualFinSF, PoolArea, BsmtHalfBath, EnclosedPorch, MiscVal, 3SsnPorch, ScreenPorch]

██
07:07:07.492: _train param, Dropping bad and constant columns: [KitchenAbvGr, Utilities, BsmtFinSF2, LowQualFinSF, PoolArea, BsmtHalfBath, EnclosedPorch, MiscVal, 3SsnPorch, ScreenPorch]


07:07:08.955: _train param, Dropping bad and constant columns: [KitchenAbvGr, Utilities, BsmtFinSF2, LowQualFinSF, PoolArea, BsmtHalfBath, EnclosedPorch, MiscVal, 3SsnPorch, ScreenPorch]

██
07:07:15.819: _train param, Dropping unused

key,value
Stacking strategy,cross_validation
Number of base models (used / total),4/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# DRF base models (used / total),1/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,12571.226,1161.7228,12978.1875,14099.259,12338.495,12555.225,10884.964
mean_residual_deviance,353651008.0,90256928.0,407885152.0,469725248.0,319037024.0,339497696.0,232109920.0
mse,353651008.0,90256928.0,407885152.0,469725248.0,319037024.0,339497696.0,232109920.0
null_deviance,943752410000.0,114484822000.0,943055310000.0,1032283030000.0,1016415780000.0,748548850000.0,978459100000.0
r2,0.9213133,0.0211604,0.9091111,0.9069649,0.9346886,0.9037082,0.9520937
residual_deviance,72995783000.0,18723414000.0,85655880000.0,95354225000.0,66359702000.0,70955016000.0,46654091000.0
rmse,18678.307,2442.2935,20196.166,21673.146,17861.607,18425.463,15235.154
rmsle,0.1121501,0.0202117,0.1366672,0.1139088,0.1083538,0.1204396,0.0813809


**Leaderboard and Model Selection**

In [21]:
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

model_id                                                   rmse          mse      mae     rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_6_AutoML_1_20231030_70659  18878.3  3.56389e+08  12603.5  0.113722               3.56389e+08
StackedEnsemble_BestOfFamily_4_AutoML_1_20231030_70659  18878.4  3.56394e+08  12638.3  0.113441               3.56394e+08
StackedEnsemble_AllModels_6_AutoML_1_20231030_70659     18928.1  3.58275e+08  12441.9  0.114649               3.58275e+08
StackedEnsemble_AllModels_3_AutoML_1_20231030_70659     18969.2  3.5983e+08   12606.6  0.115977               3.5983e+08
StackedEnsemble_AllModels_4_AutoML_1_20231030_70659     19008.8  3.61334e+08  12495.6  0.115116               3.61334e+08
StackedEnsemble_AllModels_5_AutoML_1_20231030_70659     19302.5  3.72585e+08  13153.3  0.119503               3.72585e+08
StackedEnsemble_AllModels_2_AutoML_1_20231030_70659     19607.9  3.84469e+08  13134.3  0.118765               3.84469e+08
StackedEnsemble_BestOfFam

In [22]:
preds = aml.leader.predict(test)


stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
