In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Data Modeling
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Data Visualization
from scipy.stats import iqr
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Strings that read_csv should treat as missing. Pandas' built-in defaults are
# switched off (keep_default_na=False), so only the tokens listed here become NaN.
na_tokens = [" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NULL", "NaN", "n/a", "nan", "null","NA"]

# Columns whose missing entries are filled back with the literal string 'NA'
# after loading (presumably because 'NA' is a genuine category there, e.g.
# "no basement" -- TODO confirm against the data description).
na_as_category_columns = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
    'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
default_NA = dict.fromkeys(na_as_category_columns, 'NA')

train_data = pd.read_csv(
    'train.csv',
    keep_default_na=False,
    na_values=na_tokens,
).fillna(default_NA)

In [3]:
# Report every column that still contains missing values, with its count.
null_counts = train_data.isnull().sum()
# Kept as a one-column frame under the original name for any later use.
nulls = null_counts.to_frame()
# Filter the counts directly instead of walking the frame row by row.
for column_name, missing_count in null_counts[null_counts > 0].items():
    print(column_name, missing_count)

LotFrontage 259
MasVnrType 8
MasVnrArea 8
Electrical 1
GarageYrBlt 81


In [4]:
from eda_imputer_selection import eda_feature_exp

# Predictor columns used to model the MSZoning target.
mszoning_predictors = ['Neighborhood','LotArea','LandContour','BldgType','HouseStyle','OverallQual']

# Working frame: the predictors followed by the target column.
data = train_data.loc[:, mszoning_predictors + ['MSZoning']]

# eda_feature_exp is a project helper; judging by the cell output it one-hot
# encodes categoricals (dropping the last level) and integer-codes the target
# -- TODO confirm against eda_imputer_selection.
feature_exp = eda_feature_exp(data.loc[:, mszoning_predictors].columns, 'MSZoning')
data = feature_exp.fit_transform(data)

set()
Dropped last category to avoid dummy trap: Neighborhood_Veenker
Dropped last category to avoid dummy trap: BldgType_Twnhs
Dropped last category to avoid dummy trap: HouseStyle_SLvl
{'RP': 0, 'FV': 1, 'I': 2, 'RM': 3, 'A': 4, 'RH': 5, 'RL': 6, 'C (all)': 7}


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,KFold

# Shuffled, seeded 10-fold splitter that stratifies on the target.
kf = StratifiedKFold(random_state=4, shuffle=True, n_splits=10)

# One accumulator slot per predictor column (every column except MSZoning).
predictor_count = int((data.columns != 'MSZoning').sum())
importances = np.zeros(predictor_count)

In [6]:
# Estimate feature importances for predicting MSZoning by averaging the
# importances of a random forest fitted on the training part of each fold.
X = data.loc[:, data.columns != 'MSZoning']
y = data.loc[:, 'MSZoning']

# Start the accumulator from zero inside this cell so that re-running the cell
# does not stack new importances on top of a previously averaged result.
importances = np.zeros(X.shape[1])

for train_index, test_index in kf.split(X, y):
    # kf.split yields positional indices, so both X and y are sliced with
    # .iloc; y[train_index] would do a label lookup and is only correct when
    # the index happens to be 0..n-1. The held-out part (test_index) is not
    # needed for importance estimation.
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]

    # Seeded so the averaged importances are reproducible between runs.
    model = RandomForestClassifier(class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)

    # Accumulate feature importances
    importances += model.feature_importances_

# Average feature importances across folds
importances /= kf.get_n_splits()

# Tabulate the averaged importances, most important feature first.
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feature_importances)

                 Feature  Importance
21  Neighborhood_Somerst    0.177044
35               LotArea    0.160740
37           OverallQual    0.118988
9    Neighborhood_IDOTRR    0.113180
17  Neighborhood_OldTown    0.071856
20  Neighborhood_SawyerW    0.042072
36           LandContour    0.037290
18    Neighborhood_SWISU    0.036934
24         BldgType_1Fam    0.026428
12    Neighborhood_NAmes    0.025647
31     HouseStyle_2Story    0.022205
7   Neighborhood_Edwards    0.016079
29     HouseStyle_1.5Fin    0.015863
28     HouseStyle_1Story    0.015468
3   Neighborhood_BrkSide    0.012693
25       BldgType_2fmCon    0.011844
27       BldgType_TwnhsE    0.011197
33     HouseStyle_2.5Unf    0.011174
26       BldgType_Duplex    0.010950
5   Neighborhood_CollgCr    0.008807
6   Neighborhood_Crawfor    0.008056
34     HouseStyle_SFoyer    0.005008
30     HouseStyle_1.5Unf    0.004903
2    Neighborhood_BrDale    0.004658
10  Neighborhood_MeadowV    0.004232
8   Neighborhood_Gilbert    0.003786
1

In [7]:
from feature_selection import get_feature_importance,get_best_features,calculate_vif

# Project helper; judging by the cell output it returns a frame with
# 'Feature', 'Importance' and 'Cumulative Importance' columns sorted by
# importance -- TODO confirm against feature_selection.
feature_importances = get_feature_importance(X, importances)
feature_importances.to_csv('best_features_MSZoning.csv')

# 0.8 is presumably a cumulative-importance cut-off -- TODO confirm.
top_features = get_best_features(feature_importances, 0.8)
print(top_features)

# Keep the printed label in step with the number of rows actually shown.
n_shown = 15
print(f"Top {n_shown} important features:")
print(feature_importances.head(n_shown))

['Neighborhood_Somerst', 'LotArea', 'OverallQual', 'Neighborhood_IDOTRR', 'Neighborhood_OldTown', 'Neighborhood_SawyerW', 'LandContour', 'Neighborhood_SWISU', 'BldgType_1Fam', 'Neighborhood_NAmes']
Top 10 important features:
                 Feature  Importance  Cumulative Importance
21  Neighborhood_Somerst    0.177044               0.177044
35               LotArea    0.160740               0.337784
37           OverallQual    0.118988               0.456772
9    Neighborhood_IDOTRR    0.113180               0.569952
17  Neighborhood_OldTown    0.071856               0.641807
20  Neighborhood_SawyerW    0.042072               0.683880
36           LandContour    0.037290               0.721170
18    Neighborhood_SWISU    0.036934               0.758104
24         BldgType_1Fam    0.026428               0.784532
12    Neighborhood_NAmes    0.025647               0.810179
31     HouseStyle_2Story    0.022205               0.832384
7   Neighborhood_Edwards    0.016079               0.84

In [8]:
# Compute variance inflation factors for the shortlisted MSZoning predictors
# and save them for reference.
vif_data = calculate_vif(data.loc[:, top_features])
vif_data.to_csv('vif_MSZoning.csv')

# Retain only the features whose VIF is below 10.
low_vif_rows = vif_data['VIF'] < 10
best_features = vif_data.loc[low_vif_rows, 'Feature'].values
best_features

array(['Neighborhood_SWISU', 'Neighborhood_IDOTRR',
       'Neighborhood_SawyerW', 'Neighborhood_OldTown',
       'Neighborhood_Somerst', 'Neighborhood_NAmes', 'LandContour',
       'LotArea', 'BldgType_1Fam', 'OverallQual'], dtype=object)

In [9]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,f1_score,make_scorer,mean_absolute_error,mean_squared_error,r2_score

# 10-fold CV of the MSZoning classifier on ALL encoded predictors.
clf = RandomForestClassifier(class_weight='balanced',random_state=42)
f1_macro_scorer = make_scorer(f1_score, average='macro')
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Evaluate all three metrics in one pass: each fold's model is fitted once and
# scored three times, instead of being refitted once per metric.
mszoning_cv_all = cross_validate(
    clf,
    data.loc[:, data.columns != 'MSZoning'],
    data.loc[:, 'MSZoning'],
    cv=10,
    scoring={
        'f1_macro': f1_macro_scorer,
        'f1_weighted': f1_weighted_scorer,
        'accuracy': 'accuracy',
    },
)
f1_macro_scores = mszoning_cv_all['test_f1_macro']
f1_weighted_scores = mszoning_cv_all['test_f1_weighted']
accuracy_scores = mszoning_cv_all['test_accuracy']

# Average each metric over the folds.
avg_f1_macro = f1_macro_scores.mean()
avg_f1_weighted = f1_weighted_scores.mean()
avg_accuracy_weighted = accuracy_scores.mean()

print(f"Average Macro F1 Score (10-fold CV): {avg_f1_macro:.4f}")
print(f"Average Weighted F1 Score (10-fold CV): {avg_f1_weighted:.4f}")
print(f"Accuracy (10-fold CV): {avg_accuracy_weighted:.4f}")

Average Macro F1 Score (10-fold CV): 0.6520
Average Weighted F1 Score (10-fold CV): 0.9470
Accuracy (10-fold CV): 0.9527


In [10]:
from sklearn.model_selection import cross_validate

# 10-fold CV of the MSZoning classifier restricted to the VIF-filtered
# feature subset, for comparison with the all-features run.
clf = RandomForestClassifier(class_weight='balanced',random_state=42)
f1_macro_scorer = make_scorer(f1_score, average='macro')
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Evaluate all three metrics in one pass: each fold's model is fitted once and
# scored three times, instead of being refitted once per metric.
mszoning_cv_best = cross_validate(
    clf,
    data.loc[:, best_features],
    data.loc[:, 'MSZoning'],
    cv=10,
    scoring={
        'f1_macro': f1_macro_scorer,
        'f1_weighted': f1_weighted_scorer,
        'accuracy': 'accuracy',
    },
)
f1_macro_scores = mszoning_cv_best['test_f1_macro']
f1_weighted_scores = mszoning_cv_best['test_f1_weighted']
accuracy_scores = mszoning_cv_best['test_accuracy']

# Average each metric over the folds.
avg_f1_macro = f1_macro_scores.mean()
avg_f1_weighted = f1_weighted_scores.mean()
avg_accuracy_weighted = accuracy_scores.mean()

print(f"Average Macro F1 Score (10-fold CV): {avg_f1_macro:.4f}")
print(f"Average Weighted F1 Score (10-fold CV): {avg_f1_weighted:.4f}")
print(f"Accuracy (10-fold CV): {avg_accuracy_weighted:.4f}")

Average Macro F1 Score (10-fold CV): 0.7102
Average Weighted F1 Score (10-fold CV): 0.9369
Accuracy (10-fold CV): 0.9411


In [11]:
# Hold out a stratified 20% validation split of the selected features.
# The split is seeded so the report below is reproducible between runs.
X_train,X_valid,y_train,y_valid=train_test_split(
    data.loc[:,best_features],
    data.loc[:,'MSZoning'],
    stratify=data.loc[:,'MSZoning'],
    test_size=0.2,
    random_state=42,
)

rf_classifier = RandomForestClassifier(class_weight='balanced',random_state=42)
rf_classifier.fit(X_train,y_train)
# Make predictions
y_pred = rf_classifier.predict(X_valid)

# Evaluate the model
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print detailed classification report
print(classification_report(y_valid, y_pred))
# Support-weighted F1 across the zoning classes.
f1 = f1_score(y_valid, y_pred, average='weighted')

print(f"F1 Score (weighted): {f1:.2f}")

Accuracy: 0.95
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        13
           3       0.85      0.91      0.88        44
           5       0.00      0.00      0.00         3
           6       0.97      0.97      0.97       230
           7       1.00      0.50      0.67         2

    accuracy                           0.95       292
   macro avg       0.76      0.68      0.70       292
weighted avg       0.94      0.95      0.95       292

F1 Score (weighted): 0.95


In [12]:
# Predictor columns used to model the LotFrontage target.
lotfrontage_predictors = ['LotArea','Neighborhood','LotShape','LotConfig','LandContour','BldgType','HouseStyle','MSZoning']

# Keep only complete rows (predictors + target) and renumber them from zero.
data = (
    train_data.loc[:, lotfrontage_predictors + ['LotFrontage']]
    .dropna()
    .reset_index(drop=True)
)

# Encode the predictors with the project helper, then model the target on a
# log1p scale.
feature_exp = eda_feature_exp(data.loc[:, lotfrontage_predictors].columns, 'LotFrontage')
data = feature_exp.fit_transform(data)
data['LotFrontage'] = np.log1p(data['LotFrontage'])

MSZoning_A
MSZoning_I
MSZoning_RP
{'MSZoning'}
Dropped last category to avoid dummy trap: Neighborhood_Veenker
Dropped last category to avoid dummy trap: LotConfig_FR3
Dropped last category to avoid dummy trap: BldgType_Twnhs
Dropped last category to avoid dummy trap: HouseStyle_SLvl
Dropped last category to avoid dummy trap: MSZoning_RM


In [13]:
from sklearn.ensemble import RandomForestRegressor

# Estimate feature importances for predicting (log1p) LotFrontage by averaging
# the importances of a random forest fitted on the training part of each fold.
kf_2 = KFold(n_splits=10, shuffle=True, random_state=4)

X = data.loc[:, data.columns != 'LotFrontage']
y = data.loc[:, 'LotFrontage']

# One accumulator slot per predictor, created here so the cell is re-runnable.
importances = np.zeros(X.shape[1])

for train_index, test_index in kf_2.split(X, y):
    # kf_2.split yields positional indices, so both X and y are sliced with
    # .iloc; y[train_index] would do a label lookup instead. The held-out part
    # (test_index) is not needed for importance estimation.
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]

    # Seeded so the averaged importances are reproducible between runs.
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Accumulate feature importances
    importances += model.feature_importances_

# Average feature importances across folds
importances /= kf_2.get_n_splits()

# Tabulate the averaged importances, most important feature first.
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feature_importances)

                 Feature  Importance
43               LotArea    0.730124
44              LotShape    0.034472
6      LotConfig_CulDSac    0.031936
5       LotConfig_Corner    0.025045
32         BldgType_1Fam    0.012857
23  Neighborhood_NridgHt    0.012004
25  Neighborhood_OldTown    0.011667
45           LandContour    0.010577
35       BldgType_TwnhsE    0.010288
20    Neighborhood_NAmes    0.008680
39     HouseStyle_2Story    0.008244
15  Neighborhood_Edwards    0.007851
37     HouseStyle_1.5Fin    0.007844
36     HouseStyle_1Story    0.007339
4       LotConfig_Inside    0.007211
16  Neighborhood_Gilbert    0.006278
21  Neighborhood_NoRidge    0.006084
11  Neighborhood_BrkSide    0.005553
3            MSZoning_RL    0.004737
13  Neighborhood_CollgCr    0.004596
14  Neighborhood_Crawfor    0.004123
27   Neighborhood_Sawyer    0.003620
31   Neighborhood_Timber    0.003588
26    Neighborhood_SWISU    0.003515
28  Neighborhood_SawyerW    0.003384
17   Neighborhood_IDOTRR    0.003157
1

In [14]:
# Project helper; judging by the cell output it returns a frame with
# 'Feature', 'Importance' and 'Cumulative Importance' columns sorted by
# importance -- TODO confirm against feature_selection.
feature_importances = get_feature_importance(X, importances)
feature_importances.to_csv('best_features_LotFrontage.csv')

# 0.9 is presumably a cumulative-importance cut-off -- TODO confirm.
top_features = get_best_features(feature_importances, 0.9)
print(top_features)

# Keep the printed label in step with the number of rows actually shown.
n_shown = 15
print(f"Top {n_shown} important features:")
print(feature_importances.head(n_shown))

['LotArea', 'LotShape', 'LotConfig_CulDSac', 'LotConfig_Corner', 'BldgType_1Fam', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'LandContour', 'BldgType_TwnhsE', 'Neighborhood_NAmes', 'HouseStyle_2Story', 'Neighborhood_Edwards']
Top 10 important features:
                 Feature  Importance  Cumulative Importance
43               LotArea    0.730124               0.730124
44              LotShape    0.034472               0.764596
6      LotConfig_CulDSac    0.031936               0.796532
5       LotConfig_Corner    0.025045               0.821577
32         BldgType_1Fam    0.012857               0.834434
23  Neighborhood_NridgHt    0.012004               0.846438
25  Neighborhood_OldTown    0.011667               0.858105
45           LandContour    0.010577               0.868681
35       BldgType_TwnhsE    0.010288               0.878969
20    Neighborhood_NAmes    0.008680               0.887650
39     HouseStyle_2Story    0.008244               0.895893
15  Neighborhood_Edwar

In [15]:
# Compute variance inflation factors for the shortlisted LotFrontage
# predictors and save them for reference.
vif_data = calculate_vif(data.loc[:, top_features])
vif_data.to_csv('vif_LotFrontage.csv')

# Retain only the features whose VIF is below 10.
low_vif_rows = vif_data['VIF'] < 10
best_features = vif_data.loc[low_vif_rows, 'Feature'].values
best_features

array(['BldgType_TwnhsE', 'Neighborhood_Edwards', 'Neighborhood_NridgHt',
       'Neighborhood_OldTown', 'LotConfig_CulDSac', 'LandContour',
       'LotConfig_Corner', 'Neighborhood_NAmes', 'HouseStyle_2Story',
       'LotShape', 'LotArea', 'BldgType_1Fam'], dtype=object)

In [16]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Both arguments are converted to NumPy arrays. The result is the mean of
    ``|y_true - y_pred| / |y_true|`` expressed as a percentage. Entries of
    ``y_true`` equal to zero are not special-cased.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

In [17]:
from sklearn.model_selection import cross_validate

# 10-fold CV of the LotFrontage regressor on ALL encoded predictors.
# The target was log1p-transformed earlier, so the errors are on that scale.
# The forest is seeded so the reported metrics are reproducible between runs.
rf_regressor_cv = RandomForestRegressor(random_state=42)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Score every metric on the same fitted fold models so MAE, MSE, R^2 and MAPE
# describe one set of forests, rather than four independently refitted ones.
lotfrontage_cv_all = cross_validate(
    rf_regressor_cv,
    data.loc[:, data.columns != 'LotFrontage'],
    data.loc[:, 'LotFrontage'],
    cv=10,
    scoring={'mae': mae_scorer, 'mse': mse_scorer, 'r2': 'r2', 'mape': mape_scorer},
)
mae_scores = lotfrontage_cv_all['test_mae']
cv_mse_scores = lotfrontage_cv_all['test_mse']
cv_r2_scores = lotfrontage_cv_all['test_r2']
cv_mape_scores = lotfrontage_cv_all['test_mape']

# Scorers built with greater_is_better=False return negated errors; flip the
# sign back before reporting.
cv_mae_scores = -mae_scores
mse_scores = -cv_mse_scores
rmse_scores = np.sqrt(mse_scores)
mape_scores = -cv_mape_scores

print(f"Average MAE: {cv_mae_scores.mean():.2f}")
print(f"Average MSE: {mse_scores.mean():.2f}")
print(f"Average RMSE: {rmse_scores.mean():.2f}")
print(f"Average R^2: {cv_r2_scores.mean():.2f}")
print(f"Average MAPE: {mape_scores.mean():.2f}%")

Average MAE: 0.12
Average MSE: 0.04
Average RMSE: 0.19
Average R^2: 0.71
Average MAPE: 2.77%


In [18]:
from sklearn.model_selection import cross_validate

# 10-fold CV of the LotFrontage regressor restricted to the VIF-filtered
# feature subset, for comparison with the all-features run.
# The target was log1p-transformed earlier, so the errors are on that scale.
# The forest is seeded so the reported metrics are reproducible between runs.
rf_regressor_cv = RandomForestRegressor(random_state=42)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Score every metric on the same fitted fold models so MAE, MSE, R^2 and MAPE
# describe one set of forests, rather than four independently refitted ones.
lotfrontage_cv_best = cross_validate(
    rf_regressor_cv,
    data.loc[:, best_features],
    data.loc[:, 'LotFrontage'],
    cv=10,
    scoring={'mae': mae_scorer, 'mse': mse_scorer, 'r2': 'r2', 'mape': mape_scorer},
)
mae_scores = lotfrontage_cv_best['test_mae']
cv_mse_scores = lotfrontage_cv_best['test_mse']
cv_r2_scores = lotfrontage_cv_best['test_r2']
cv_mape_scores = lotfrontage_cv_best['test_mape']

# Scorers built with greater_is_better=False return negated errors; flip the
# sign back before reporting.
cv_mae_scores = -mae_scores
mse_scores = -cv_mse_scores
rmse_scores = np.sqrt(mse_scores)
mape_scores = -cv_mape_scores

print(f"Average MAE: {cv_mae_scores.mean():.2f}")
print(f"Average MSE: {mse_scores.mean():.2f}")
print(f"Average RMSE: {rmse_scores.mean():.2f}")
print(f"Average R^2: {cv_r2_scores.mean():.2f}")
print(f"Average MAPE: {mape_scores.mean():.2f}%")

Average MAE: 0.12
Average MSE: 0.04
Average RMSE: 0.20
Average R^2: 0.67
Average MAPE: 2.93%
