In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Data Modeling
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Data Visualization
from scipy.stats import iqr
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Tokens parsed as missing. pandas' built-in defaults are switched off
# (keep_default_na=False) so that only this explicit list applies.
NA_TOKENS = [" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
             "1.#IND", "1.#QNAN", "<NA>", "N/A", "NULL", "NaN", "n/a", "nan", "null", "NA"]
train_data = pd.read_csv('train.csv', keep_default_na=False, na_values=NA_TOKENS)

# For these columns the missing marker is written back as the literal string 'NA'
# (presumably 'NA' is a genuine "feature not present" category here -- confirm
# against the data description).
default_NA = dict.fromkeys(
    ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
     'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
     'GarageType', 'GarageFinish',
     'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'],
    'NA',
)
train_data = train_data.fillna(default_NA)

In [3]:
# Per-column count of missing values; print only the columns that still have gaps.
nulls = train_data.isnull().sum().to_frame()
for column_name, missing_count in nulls[0].items():
    if missing_count > 0:
        print(column_name, missing_count)

LotFrontage 259
MasVnrType 8
MasVnrArea 8
Electrical 1
GarageYrBlt 81


In [4]:
from eda_imputer_selection import eda_feature_exp

# Predictors used to model MSZoning; the target column is carried along in the same frame.
zoning_predictors = ['Neighborhood', 'LotArea', 'LandContour', 'BldgType', 'HouseStyle', 'OverallQual']
data = train_data.loc[:, zoning_predictors + ['MSZoning']]

# eda_feature_exp is a project helper; it receives the predictor column Index and
# the target name, and fit_transform returns the encoded frame.
feature_exp = eda_feature_exp(data[zoning_predictors].columns, 'MSZoning')
data = feature_exp.fit_transform(data)

set()
Dropped last category to avoid dummy trap: Neighborhood_Veenker
Dropped last category to avoid dummy trap: BldgType_Twnhs
Dropped last category to avoid dummy trap: HouseStyle_SLvl
{'RP': 0, 'RH': 1, 'RM': 2, 'RL': 3, 'C (all)': 4, 'A': 5, 'FV': 6, 'I': 7}


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, f1_score,
    make_scorer, mean_absolute_error, mean_squared_error,
)

# Stratified 80/20 hold-out split with MSZoning as the label and every other column as input.
zoning_labels = data.loc[:, 'MSZoning']
zoning_inputs = data.loc[:, data.columns != 'MSZoning']
X_train, X_valid, y_train, y_valid = train_test_split(
    zoning_inputs,
    zoning_labels,
    stratify=zoning_labels,
    test_size=0.2,
    random_state=42,
)

# 500-tree forest with balanced class weights; the full parameter set is printed for the record.
rf_classifier = RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=42)
rf_classifier.fit(X_train, y_train)
print(rf_classifier.get_params())

# Predict the validation fold.
y_pred = rf_classifier.predict(X_valid)

# Overall accuracy.
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1.
print(classification_report(y_valid, y_pred))

# Support-weighted F1 across the classes.
f1 = f1_score(y_valid, y_pred, average='weighted')
print(f"F1 Score (weighted): {f1:.2f}")

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Accuracy: 0.95
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         3
           2       0.90      0.86      0.88        44
           3       0.95      0.99      0.97       230
           4       1.00      0.50      0.67         2
           6       1.00      0.85      0.92        13

    accuracy                           0.95       292
   macro avg       0.77      0.64      0.69       292
weighted avg       0.94      0.95      0.94       292

F1 Score (weighted): 0.94


In [6]:
# Confusion matrix for the validation fold (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_valid, y_pred=y_pred)
cm

array([[  0,   0,   3,   0,   0],
       [  0,  38,   6,   0,   0],
       [  0,   3, 227,   0,   0],
       [  0,   1,   0,   1,   0],
       [  0,   0,   2,   0,  11]], dtype=int64)

In [7]:
from feature_selection import get_feature_importance,get_best_features,calculate_vif

# Rank the encoded predictors by the fitted forest's feature importances and
# persist the ranking for later inspection.
feature_importances = get_feature_importance(X_train, rf_classifier.feature_importances_)
feature_importances.to_csv('best_features.csv')

# Presumably keeps the features up to a cumulative importance of 0.8 -- confirm
# against feature_selection.get_best_features.
top_features = get_best_features(feature_importances, 0.8)
print(top_features)

# Heading and row count come from one value so they cannot drift apart
# (the heading used to say "Top 10" while 15 rows were printed).
n_rows_shown = 15
print(f"Top {n_rows_shown} important features:")
print(feature_importances.head(n_rows_shown))

['Neighborhood_Somerst', 'LotArea', 'OverallQual', 'Neighborhood_IDOTRR', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'LandContour', 'Neighborhood_SawyerW', 'Neighborhood_NAmes', 'HouseStyle_2Story']
Top 10 important features:
                 Feature  Importance  Cumulative Importance
21  Neighborhood_Somerst    0.171548               0.171548
35               LotArea    0.169157               0.340705
37           OverallQual    0.121390               0.462095
9    Neighborhood_IDOTRR    0.109340               0.571435
17  Neighborhood_OldTown    0.072961               0.644396
18    Neighborhood_SWISU    0.037916               0.682312
36           LandContour    0.034509               0.716820
20  Neighborhood_SawyerW    0.029953               0.746774
12    Neighborhood_NAmes    0.028101               0.774875
31     HouseStyle_2Story    0.027138               0.802013
24         BldgType_1Fam    0.025034               0.827047
29     HouseStyle_1.5Fin    0.017759               

In [8]:
# Variance inflation factors for the importance-selected features, computed by the
# project's calculate_vif helper and saved to disk for review.
# (A hard-coded `best_features` list used to be assigned here; it was never read
# before the next cell rebuilt `best_features` from the VIF table, so it was removed
# together with the commented-out statsmodels implementation.)
vif_data=calculate_vif(data.loc[:,top_features])
vif_data.to_csv('vif.csv')

In [9]:
# Keep only the features whose VIF is below 10.
low_vif_mask = vif_data['VIF'] < 10
best_features = vif_data.loc[low_vif_mask, 'Feature'].values
best_features

array(['Neighborhood_SWISU', 'Neighborhood_IDOTRR',
       'Neighborhood_SawyerW', 'Neighborhood_OldTown',
       'Neighborhood_Somerst', 'Neighborhood_NAmes', 'LandContour',
       'HouseStyle_2Story', 'LotArea', 'OverallQual'], dtype=object)

In [10]:
from sklearn.model_selection import cross_val_score, cross_validate

# Baseline: 10-fold cross-validation of a class-weighted random forest on every
# encoded predictor (all columns except the MSZoning target).
clf = RandomForestClassifier(class_weight='balanced',random_state=42)
f1_macro_scorer = make_scorer(f1_score, average='macro')
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# One cross_validate pass scores all three metrics on the same fitted models.
# Three separate cross_val_score calls refit the identical forest on identical
# folds (30 fits); the fold scores are unchanged because both the splitter and
# the estimator seed are deterministic.
cv_results = cross_validate(
    clf,
    data.loc[:, data.columns != 'MSZoning'],
    data.loc[:, 'MSZoning'],
    cv=10,
    scoring={'f1_macro': f1_macro_scorer, 'f1_weighted': f1_weighted_scorer, 'accuracy': 'accuracy'},
)
f1_macro_scores = cv_results['test_f1_macro']
f1_weighted_scores = cv_results['test_f1_weighted']
accuracy_scores = cv_results['test_accuracy']

# Average each metric over the 10 folds.
avg_f1_macro = f1_macro_scores.mean()
avg_f1_weighted = f1_weighted_scores.mean()
avg_accuracy_weighted = accuracy_scores.mean()

print(f"Average Macro F1 Score (10-fold CV): {avg_f1_macro:.4f}")
print(f"Average Weighted F1 Score (10-fold CV): {avg_f1_weighted:.4f}")
print(f"Accuracy (10-fold CV): {avg_accuracy_weighted:.4f}")

Average Macro F1 Score (10-fold CV): 0.6515
Average Weighted F1 Score (10-fold CV): 0.9463
Accuracy (10-fold CV): 0.9521


In [11]:
from sklearn.model_selection import cross_val_score, cross_validate

# 10-fold cross-validation of a class-weighted random forest restricted to the
# VIF-filtered `best_features` columns.
clf = RandomForestClassifier(class_weight='balanced',random_state=42)
f1_macro_scorer = make_scorer(f1_score, average='macro')
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# One cross_validate pass scores all three metrics on the same fitted models.
# Three separate cross_val_score calls refit the identical forest on identical
# folds (30 fits); the fold scores are unchanged because both the splitter and
# the estimator seed are deterministic.
cv_results = cross_validate(
    clf,
    data.loc[:, best_features],
    data.loc[:, 'MSZoning'],
    cv=10,
    scoring={'f1_macro': f1_macro_scorer, 'f1_weighted': f1_weighted_scorer, 'accuracy': 'accuracy'},
)
f1_macro_scores = cv_results['test_f1_macro']
f1_weighted_scores = cv_results['test_f1_weighted']
accuracy_scores = cv_results['test_accuracy']

# Average each metric over the 10 folds.
avg_f1_macro = f1_macro_scores.mean()
avg_f1_weighted = f1_weighted_scores.mean()
avg_accuracy_weighted = accuracy_scores.mean()

print(f"Average Macro F1 Score (10-fold CV): {avg_f1_macro:.4f}")
print(f"Average Weighted F1 Score (10-fold CV): {avg_f1_weighted:.4f}")
print(f"Accuracy (10-fold CV): {avg_accuracy_weighted:.4f}")

Average Macro F1 Score (10-fold CV): 0.6614
Average Weighted F1 Score (10-fold CV): 0.9356
Accuracy (10-fold CV): 0.9404


In [12]:
from sklearn.model_selection import cross_val_score, cross_validate

# Hand-picked feature subset, written once instead of being repeated in every call.
manual_features = ['Neighborhood_Somerst','LotArea','Neighborhood_IDOTRR','Neighborhood_OldTown',
                   'Neighborhood_SWISU','LandContour','Neighborhood_SawyerW','HouseStyle_2Story',
                   'BldgType_1Fam']

# 10-fold cross-validation of a class-weighted random forest on that subset.
clf = RandomForestClassifier(class_weight='balanced',random_state=42)
f1_macro_scorer = make_scorer(f1_score, average='macro')
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# One cross_validate pass scores all three metrics on the same fitted models.
# Three separate cross_val_score calls refit the identical forest on identical
# folds (30 fits); the fold scores are unchanged because both the splitter and
# the estimator seed are deterministic.
cv_results = cross_validate(
    clf,
    data.loc[:, manual_features],
    data.loc[:, 'MSZoning'],
    cv=10,
    scoring={'f1_macro': f1_macro_scorer, 'f1_weighted': f1_weighted_scorer, 'accuracy': 'accuracy'},
)
f1_macro_scores = cv_results['test_f1_macro']
f1_weighted_scores = cv_results['test_f1_weighted']
accuracy_scores = cv_results['test_accuracy']

# Average each metric over the 10 folds.
avg_f1_macro = f1_macro_scores.mean()
avg_f1_weighted = f1_weighted_scores.mean()
avg_accuracy_weighted = accuracy_scores.mean()

print(f"Average Macro F1 Score (10-fold CV): {avg_f1_macro:.4f}")
print(f"Average Weighted F1 Score (10-fold CV): {avg_f1_weighted:.4f}")
print(f"Accuracy (10-fold CV): {avg_accuracy_weighted:.4f}")

Average Macro F1 Score (10-fold CV): 0.6402
Average Weighted F1 Score (10-fold CV): 0.9320
Accuracy (10-fold CV): 0.9342


In [13]:
# Hand-picked feature subset evaluated on a stratified 80/20 hold-out split.
manual_features = ['Neighborhood_Somerst','LotArea','Neighborhood_IDOTRR','Neighborhood_OldTown',
                   'Neighborhood_SWISU','LandContour','Neighborhood_SawyerW','HouseStyle_2Story',
                   'BldgType_1Fam']

# random_state pins the split so the metrics below are reproducible on re-run
# and consistent with the other seeded splits in this notebook.
X_train,X_valid,y_train,y_valid=train_test_split(
    data.loc[:,manual_features],
    data.loc[:,'MSZoning'],
    stratify=data.loc[:,'MSZoning'],
    test_size=0.2,
    random_state=42,
)

rf_classifier = RandomForestClassifier(class_weight='balanced',random_state=42)
rf_classifier.fit(X_train,y_train)
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,f1_score,make_scorer,mean_absolute_error,mean_squared_error
# Make predictions
y_pred = rf_classifier.predict(X_valid)

# Evaluate the model
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print detailed classification report
print(classification_report(y_valid, y_pred))
# Support-weighted F1 across the MSZoning classes
f1 = f1_score(y_valid, y_pred, average='weighted')

print(f"F1 Score (weighted): {f1:.2f}")

Accuracy: 0.95
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         3
           2       0.90      0.80      0.84        44
           3       0.95      0.99      0.97       230
           4       0.50      0.50      0.50         2
           6       1.00      1.00      1.00        13

    accuracy                           0.95       292
   macro avg       0.67      0.66      0.66       292
weighted avg       0.93      0.95      0.94       292

F1 Score (weighted): 0.94


In [14]:
# VIF-filtered `best_features` evaluated on a stratified 80/20 hold-out split.
# random_state pins the split so the metrics below are reproducible on re-run
# and consistent with the other seeded splits in this notebook.
X_train,X_valid,y_train,y_valid=train_test_split(
    data.loc[:,best_features],
    data.loc[:,'MSZoning'],
    stratify=data.loc[:,'MSZoning'],
    test_size=0.2,
    random_state=42,
)

rf_classifier = RandomForestClassifier(class_weight='balanced',random_state=42)
rf_classifier.fit(X_train,y_train)
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,f1_score,make_scorer,mean_absolute_error,mean_squared_error
# Make predictions
y_pred = rf_classifier.predict(X_valid)

# Evaluate the model
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print detailed classification report
print(classification_report(y_valid, y_pred))
# Support-weighted F1 across the MSZoning classes
f1 = f1_score(y_valid, y_pred, average='weighted')

print(f"F1 Score (weighted): {f1:.2f}")

Accuracy: 0.93
              precision    recall  f1-score   support

           1       1.00      0.33      0.50         3
           2       0.83      0.89      0.86        44
           3       0.97      0.96      0.97       230
           4       0.00      0.00      0.00         2
           6       0.80      0.92      0.86        13

    accuracy                           0.93       292
   macro avg       0.72      0.62      0.64       292
weighted avg       0.94      0.93      0.93       292

F1 Score (weighted): 0.93


In [15]:
# Predictors for LotFrontage; rows with any missing value in these columns or in
# the target are dropped and the index is renumbered.
frontage_predictors = ['LotArea', 'Neighborhood', 'LotShape', 'LotConfig', 'BldgType']
data = (
    train_data.loc[:, frontage_predictors + ['LotFrontage']]
    .dropna()
    .reset_index(drop=True)
)

# Encode the predictors with the project helper, then move the target to the log1p scale.
feature_exp = eda_feature_exp(data[frontage_predictors].columns, 'LotFrontage')
data = feature_exp.fit_transform(data)
data['LotFrontage'] = np.log1p(data['LotFrontage'])

set()
Dropped last category to avoid dummy trap: Neighborhood_Veenker
Dropped last category to avoid dummy trap: LotConfig_FR3
Dropped last category to avoid dummy trap: BldgType_Twnhs


In [16]:
# Number of rows left for the LotFrontage model.
data.shape[0]

1201

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# 80/20 hold-out split with LotFrontage as the target and every other column as input.
X_train,X_valid,y_train,y_valid=train_test_split(data.loc[:,data.columns!='LotFrontage'],data.loc[:,'LotFrontage'],test_size=0.2,random_state=42)
rf_regressor=RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train,y_train)
y_pred = rf_regressor.predict(X_valid)

# NOTE: LotFrontage was log1p-transformed before the split, so every metric below
# is expressed on the log1p scale, not in the original units.
mae = mean_absolute_error(y_valid, y_pred)
print(f"MAE: {mae:.2f}")
mse = mean_squared_error(y_valid, y_pred)
print(f"MSE: {mse:.2f}")
# RMSE is taken from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.2f}")
r2 = r2_score(y_valid, y_pred)
print(f"R²: {r2:.2f}")
mape = (abs((y_valid - y_pred) / y_valid)).mean() * 100
print(f"MAPE: {mape:.2f}%")

MAE: 0.12
MSE: 0.04
RMSE: 0.19
R²: 0.69
MAPE: 2.94%


In [18]:
# Rank the predictors by the fitted regressor's importances, then pick the top
# features with the project helper at a 0.9 threshold (presumably a cumulative-
# importance cut-off -- confirm against feature_selection.get_best_features).
regressor_importances = rf_regressor.feature_importances_
feature_importances = get_feature_importance(X_train, regressor_importances)
top_features = get_best_features(feature_importances, 0.9)
print(top_features)
feature_importances

['LotArea', 'LotShape', 'LotConfig_CulDSac', 'LotConfig_Corner', 'BldgType_1Fam', 'Neighborhood_OldTown', 'Neighborhood_NridgHt', 'Neighborhood_Edwards', 'LotConfig_Inside']


Unnamed: 0,Feature,Importance,Cumulative Importance
32,LotArea,0.749348,0.749348
33,LotShape,0.03798,0.787328
2,LotConfig_CulDSac,0.031455,0.818783
1,LotConfig_Corner,0.026014,0.844797
28,BldgType_1Fam,0.019256,0.864053
21,Neighborhood_OldTown,0.014222,0.878275
19,Neighborhood_NridgHt,0.011245,0.88952
11,Neighborhood_Edwards,0.009679,0.899198
0,LotConfig_Inside,0.00871,0.907908
31,BldgType_TwnhsE,0.008373,0.916281


In [19]:
# VIF table for the features selected for the LotFrontage model, saved to disk.
top_feature_frame = data.loc[:, top_features]
vif_data = calculate_vif(top_feature_frame)
vif_data.to_csv('vif_2.csv')