In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

### Data Collection

In [4]:
df_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')
df_train = df_train[df_train['GarageYrBlt'] != -1]

In [5]:
y = df_train.pop('SalePrice')
X = df_train.copy()

### Data Preprocessing

In [7]:
# cols = ['BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'] 

In [8]:
# X = X[cols]

In [9]:
# X.info()

In [10]:
# X.iloc[:, :] = X.iloc[:, :].fillna(5.0)

In [11]:
X = X.drop(['GarageCars'], axis=1).fillna(0)

In [12]:
"""
def find_outliers_iqr(data, threshold=1.5):
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr
    res = np.where((data < lower_bound) | (data > upper_bound))[0]
    print(f"number of outliers detected:{len(res)}")
    return res
"""

'\ndef find_outliers_iqr(data, threshold=1.5):\n    q1 = np.percentile(data, 25)\n    q3 = np.percentile(data, 75)\n    iqr = q3 - q1\n    lower_bound = q1 - threshold * iqr\n    upper_bound = q3 + threshold * iqr\n    res = np.where((data < lower_bound) | (data > upper_bound))[0]\n    print(f"number of outliers detected:{len(res)}")\n    return res\n'

In [13]:
models = {
    "LinearRegression": LinearRegression(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
}

In [14]:
def train_and_evaluate(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    return mae

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, mod in models.items():
    mae = train_and_evaluate(x_train, y_train, x_test, y_test, mod)
    print(f"{name} MAE: {mae}\n")


"""
for col in X.columns:
    outlier_indices = find_outliers_iqr(x_train[col])
    outlier_indices = pd.Series(outlier_indices)
    valid_outlier_indices = outlier_indices[outlier_indices.isin(x_train.index)]

    x_train_filtered = x_train.drop(index=valid_outlier_indices).reset_index(drop=True)
    y_train_filtered = y_train.drop(index=valid_outlier_indices).reset_index(drop=True)
    mae_filtered = train_and_evaluate(x_train_filtered, y_train_filtered, x_test, y_test)
    print(f"MAE after removing {col} outliers: {mae_filtered}")

print()
outlier_indices = []
for col in ['GarageYrBlt', 'WoodDeckSF']:
    outlier_indices = outlier_indices + find_outliers_iqr(x_train[col]).tolist()
    
indices = list(set(outlier_indices))
indices = pd.Series(indices)
valid_outlier_indices = indices[indices.isin(x_train.index)]

x_train_filtered = x_train.drop(index=valid_outlier_indices).reset_index(drop=True)
y_train_filtered = y_train.drop(index=valid_outlier_indices).reset_index(drop=True)
mae_filtered = train_and_evaluate(x_train_filtered, y_train_filtered, x_test, y_test)
print(f"MAE after removing {col} outliers: {mae_filtered}")
"""

LinearRegression MAE: 33888.35203616898

AdaBoostRegressor MAE: 34931.924708624894

DecisionTreeRegressor MAE: 39394.10859728507

RandomForestRegressor MAE: 26250.87211085973



'\nfor col in X.columns:\n    outlier_indices = find_outliers_iqr(x_train[col])\n    outlier_indices = pd.Series(outlier_indices)\n    valid_outlier_indices = outlier_indices[outlier_indices.isin(x_train.index)]\n\n    x_train_filtered = x_train.drop(index=valid_outlier_indices).reset_index(drop=True)\n    y_train_filtered = y_train.drop(index=valid_outlier_indices).reset_index(drop=True)\n    mae_filtered = train_and_evaluate(x_train_filtered, y_train_filtered, x_test, y_test)\n    print(f"MAE after removing {col} outliers: {mae_filtered}")\n\nprint()\noutlier_indices = []\nfor col in [\'GarageYrBlt\', \'WoodDeckSF\']:\n    outlier_indices = outlier_indices + find_outliers_iqr(x_train[col]).tolist()\n    \nindices = list(set(outlier_indices))\nindices = pd.Series(indices)\nvalid_outlier_indices = indices[indices.isin(x_train.index)]\n\nx_train_filtered = x_train.drop(index=valid_outlier_indices).reset_index(drop=True)\ny_train_filtered = y_train.drop(index=valid_outlier_indices).reset

### Model Building and Evaluation

In [16]:
"""
outlier_indices = []
for col in ['GarageYrBlt', 'WoodDeckSF']:
    outlier_indices = outlier_indices + find_outliers_iqr(X[col]).tolist()
    
indices = list(set(outlier_indices))
indices = pd.Series(indices)
valid_outlier_indices = indices[indices.isin(X.index)]

X_train_filtered = X.drop(index=valid_outlier_indices).reset_index(drop=True)
y_train_filtered = y.drop(index=valid_outlier_indices).reset_index(drop=True)
"""

"\noutlier_indices = []\nfor col in ['GarageYrBlt', 'WoodDeckSF']:\n    outlier_indices = outlier_indices + find_outliers_iqr(X[col]).tolist()\n    \nindices = list(set(outlier_indices))\nindices = pd.Series(indices)\nvalid_outlier_indices = indices[indices.isin(X.index)]\n\nX_train_filtered = X.drop(index=valid_outlier_indices).reset_index(drop=True)\ny_train_filtered = y.drop(index=valid_outlier_indices).reset_index(drop=True)\n"

In [17]:
model = RandomForestRegressor()

model.fit(X, y)

In [18]:
X_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')

# X_test = X_test[cols]
X_test = X_test.drop(['GarageCars'], axis=1).fillna(0)

In [19]:
# X_test.info()

In [20]:
# X_test[X_test.isnull().any(axis=1)]

In [21]:
# X_test.iloc[:, :] = X_test.iloc[:, :].fillna(1.0)

### Generating Submission File

In [23]:
y_pred = model.predict(X_test)

In [24]:
submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': y_pred
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,278842.01
1,1105,203905.98
2,413,163904.0
3,522,192257.92
4,1036,178058.51
