## Importing Libraries and Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
import os
import warnings
import numpy as np 
import pandas as pd 
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

print(os.listdir("/kaggle/input"))
%matplotlib inline

['house-prices-advanced-regression-techniques']


In [3]:
train  = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test   = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
sample = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

## Data Analysis

In [4]:
#can be used if we import pandas_profiling as pp
#pp.ProfileReport(train)

In [5]:
#Function for viewing data with max 1000 columns and rows
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [6]:
#Seprate taget value "Id" "Saleprice(more importantly)" from the features
y     = train[['Id','SalePrice']]
#After storing the target value "SalePrice" in y we can now drop it from the feature set 
train = train.drop('SalePrice',axis=1)

In [7]:
#creates a list with both Training and Testing Data sets -> all_dfs
all_dfs = [train,test]
# "d.concat(all_dfs)" merges the train and test datasets into a single DataFrame called all_df. It stacks them vertically by default since they share the same columns.
# ".reset_index(drop=True)" resets the index of the new concatenated DataFrame and drops the old index. 
# Without this, the concatenated DataFrame would retain the indices from both train and test
all_df = pd.concat(all_dfs).reset_index(drop=True);

In [8]:
display_all((all_df.isnull().sum()/all_df.shape[0])*100)

Id                0.000000
MSSubClass        0.000000
MSZoning          0.137033
LotFrontage      16.649538
LotArea           0.000000
Street            0.000000
Alley            93.216855
LotShape          0.000000
LandContour       0.000000
Utilities         0.068517
LotConfig         0.000000
LandSlope         0.000000
Neighborhood      0.000000
Condition1        0.000000
Condition2        0.000000
BldgType          0.000000
HouseStyle        0.000000
OverallQual       0.000000
OverallCond       0.000000
YearBuilt         0.000000
YearRemodAdd      0.000000
RoofStyle         0.000000
RoofMatl          0.000000
Exterior1st       0.034258
Exterior2nd       0.034258
MasVnrType       60.500171
MasVnrArea        0.787941
ExterQual         0.000000
ExterCond         0.000000
Foundation        0.000000
BsmtQual          2.774923
BsmtCond          2.809181
BsmtExposure      2.809181
BsmtFinType1      2.706406
BsmtFinSF1        0.034258
BsmtFinType2      2.740665
BsmtFinSF2        0.034258
B

## Handling Data

In [9]:
# Drop all the features with very high number in null rows 
all_df.drop(['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'],axis=1,inplace=True)

### Importance of Filling Missing Values:
* **Model Performance**: Many machine learning algorithms do not handle missing values well, and filling them in can help improve model accuracy.
* **Data Integrity**: Ensuring that all necessary fields are filled helps maintain the integrity of the dataset, allowing for a more comprehensive analysis.
* **Bias Mitigation**: Using medians for numerical values can help reduce bias in your model, particularly in the presence of outliers.

In [10]:
# with all feature with few null views, we manually filled the missing data
all_df['LotFrontage'].fillna(value=all_df['LotFrontage'].median(),inplace=True)
all_df['MasVnrType'].fillna(value='None',inplace=True)
all_df['MasVnrArea'].fillna(0,inplace=True)
all_df['BsmtCond'].fillna(value='TA',inplace=True)
all_df['BsmtExposure'].fillna(value='No',inplace=True)
all_df['Electrical'].fillna(value='SBrkr',inplace=True)
all_df['BsmtFinType2'].fillna(value='Unf',inplace=True)
all_df['GarageType'].fillna(value='Attchd',inplace=True)
all_df['GarageYrBlt'].fillna(value=all_df['GarageYrBlt'].median(),inplace=True)
all_df['GarageFinish'].fillna(value='Unf',inplace=True)
all_df['GarageQual'].fillna(value='TA',inplace=True)
all_df['GarageCond'].fillna(value='TA',inplace=True)
all_df['BsmtFinType1'].fillna(value='NO',inplace=True)
all_df['BsmtQual'].fillna(value='No',inplace=True)
all_df['BsmtFullBath'].fillna(value=all_df['BsmtFullBath'].median(),inplace=True)
all_df['BsmtFinSF1'].fillna(value=all_df['BsmtFinSF1'].median(),inplace=True)
all_df['BsmtFinSF2'].fillna(value=0,inplace=True)
all_df['BsmtUnfSF'].fillna(value=0,inplace=True)
all_df['TotalBsmtSF'].fillna(value=all_df['TotalBsmtSF'].median(),inplace=True)
all_df['BsmtHalfBath'].fillna(value=0,inplace=True)
all_df['GarageCars'].fillna(value=all_df['GarageCars'].median(),inplace=True)
all_df['GarageArea'].fillna(value=all_df['GarageArea'].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df['LotFrontage'].fillna(value=all_df['LotFrontage'].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df['MasVnrType'].fillna(value='None',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi

In [11]:
#Using labelencoder from Ssklearn to convert "categorical features" to to a numerical format
#This step is essential before feeding the data into Machine Learning models since they work better with numerical input
labelencoder=LabelEncoder()

all_df['MSZoning']      = labelencoder.fit_transform(all_df['MSZoning'].astype(str))
all_df['Exterior1st']   = labelencoder.fit_transform(all_df['Exterior1st'].astype(str))
all_df['Exterior2nd']   = labelencoder.fit_transform(all_df['Exterior2nd'].astype(str))
all_df['KitchenQual']   = labelencoder.fit_transform(all_df['KitchenQual'].astype(str))
all_df['Functional']    = labelencoder.fit_transform(all_df['Functional'].astype(str))
all_df['SaleType']      = labelencoder.fit_transform(all_df['SaleType'].astype(str))
all_df['Street']        = labelencoder.fit_transform(all_df['Street'])   
all_df['LotShape']      = labelencoder.fit_transform(all_df['LotShape'])   
all_df['LandContour']   = labelencoder.fit_transform(all_df['LandContour'])   
all_df['LotConfig']     = labelencoder.fit_transform(all_df['LotConfig'])   
all_df['LandSlope']     = labelencoder.fit_transform(all_df['LandSlope'])   
all_df['Neighborhood']  = labelencoder.fit_transform(all_df['Neighborhood'])   
all_df['Condition1']    = labelencoder.fit_transform(all_df['Condition1'])   
all_df['Condition2']    = labelencoder.fit_transform(all_df['Condition2'])   
all_df['BldgType']      = labelencoder.fit_transform(all_df['BldgType'])   
all_df['HouseStyle']    = labelencoder.fit_transform(all_df['HouseStyle'])   
all_df['RoofStyle']     = labelencoder.fit_transform(all_df['RoofStyle'])   
all_df['RoofMatl']      = labelencoder.fit_transform(all_df['RoofMatl'])    
all_df['MasVnrType']    = labelencoder.fit_transform(all_df['MasVnrType'])   
all_df['ExterQual']     = labelencoder.fit_transform(all_df['ExterQual'])  
all_df['ExterCond']     = labelencoder.fit_transform(all_df['ExterCond'])   
all_df['Foundation']    = labelencoder.fit_transform(all_df['Foundation'])   
all_df['BsmtQual']      = labelencoder.fit_transform(all_df['BsmtQual'])   
all_df['BsmtCond']      = labelencoder.fit_transform(all_df['BsmtCond'])   
all_df['BsmtExposure']  = labelencoder.fit_transform(all_df['BsmtExposure'])   
all_df['BsmtFinType1']  = labelencoder.fit_transform(all_df['BsmtFinType1'])   
all_df['BsmtFinType2']  = labelencoder.fit_transform(all_df['BsmtFinType2'])   
all_df['Heating']       = labelencoder.fit_transform(all_df['Heating'])   
all_df['HeatingQC']     = labelencoder.fit_transform(all_df['HeatingQC'])   
all_df['CentralAir']    = labelencoder.fit_transform(all_df['CentralAir'])   
all_df['Electrical']    = labelencoder.fit_transform(all_df['Electrical'])    
all_df['GarageType']    = labelencoder.fit_transform(all_df['GarageType'])  
all_df['GarageFinish']  = labelencoder.fit_transform(all_df['GarageFinish'])   
all_df['GarageQual']    = labelencoder.fit_transform(all_df['GarageQual'])  
all_df['GarageCond']    = labelencoder.fit_transform(all_df['GarageCond'])   
all_df['PavedDrive']    = labelencoder.fit_transform(all_df['PavedDrive'])  
all_df['SaleCondition'] = labelencoder.fit_transform(all_df['SaleCondition'])  

### **Feature scaling:** 
is a preprocessing step in machine learning that adjusts the range of values for features (input variables) in your dataset. The goal is to ensure that the features contribute equally to the model training process by bringing them to a comparable scale. This is important because many machine learning algorithms (e.g., gradient-based methods like logistic regression, neural networks, and support vector machines) are sensitive to the scale of input data.

**Here we are using Standardization:**
It transforms the data such that its mean is 0 and its standard deviation is 1.

In [12]:
#scales features by removing the mean
Scaler = StandardScaler()
# ".fit_transform()" calculates the mean and standard deviation for scaling and then applies the transformation to the entire dataset (all_df)
all_scaled   = pd.DataFrame(Scaler.fit_transform(all_df))
#spliting th data into 50/50 :1460 -> firt 1460, 1460:2920 -> second 1460.
#as the training set consists of the first 1460 rows (train_scaled), while the test set is the next 1460 rows (test_scaled)
#train_scaled: This represents the feature set after feature scaling, which contains all the predictors (i.e., features)
train_scaled = pd.DataFrame(all_scaled[:1460])
test_scaled  = pd.DataFrame(all_scaled[1460:2920])

In [13]:
# X is the feature set
# "train_test_split" is a Sklearn function to splits datasets into training and testing sets
# "y['SalePrice']" is the target variable it was previously separated into the y DataFrame.
# "test_size=0.1" -> 10% of the data will be used as a test set, the other 90% for training
# X_train: 90% of the features, used for training your model.
# X_test: 10% of the features, used for testing your model's performance.
# y_train: 90% of the target values corresponding to X_train.
# y_test: 10% of the target values corresponding to X_test.
# x -> input (Features), y -> output ("SalePrice")

X = train_scaled
X_train, X_test, y_train, y_test = train_test_split(X, y['SalePrice'], test_size=0.1, random_state=42)

## XGBoost

In [14]:
from xgboost import XGBRegressor
XGB = XGBRegressor(
    max_depth=2,       #maximum depth tree (Limits the depth of each decision tree to prevent overfitting)
    learning_rate=0.1, #step size to update the weight after each boosting step (Controls how much the model adjusts during each boosting step)
    n_estimators=1000, #number of trees or "boosting rounds"
    reg_alpha=0.001,   #regulization term on weights
    reg_lambda=0.000001,
    n_jobs=-1,         #uses all availible CPU 
    min_child_weight=3 #minimum sum of instance weights required in a child note
)                      #^^(A higher value makes the model more conservative by preventing the tree from growing too complex )
XGB.fit(X_train,y_train)

## LGBM
Using both LGBM and XGB in your modeling pipeline can be beneficial for comparison purposes, but it is not necessary. However here we are going to use the prediction from both models and combine them 

In [15]:
from lightgbm import LGBMRegressor
#initialize model
#n_estimators=1000 specifies that the model will build 1000 boosting iterations or "trees"
LGBM = LGBMRegressor(n_estimators = 1000)
LGBM.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3497
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 71
[LightGBM] [Info] Start training from score 180704.734399


## Test Scores

In [16]:
#Test score for both models 
print ("Training score:",XGB.score(X_train,y_train), "Test Score:",XGB.score(X_test,y_test))
print ("Training score:",LGBM.score(X_train,y_train),"Test Score:",LGBM.score(X_test,y_test))

Training score: 0.9871212797046796 Test Score: 0.920257100589606
Training score: 0.9999557727264998 Test Score: 0.9101390980189892


## Submission

In [17]:
#generate the predictions using ".predict" for the "test_scaled" for both models and store it to merge later
y_pred_xgb  = pd.DataFrame( XGB.predict(test_scaled))
y_pred_lgbm = pd.DataFrame(LGBM.predict(test_scaled))
#create empty dataset set to store the result (with the same format as the sample_submission)
y_pred=pd.DataFrame()
#store the "SalePrice" prediction from both XGB and LGBM 
#the SalePrice was combines by dividng each value from each model half and then added together
# so if for Id 1 the SalePrice prediction from XGB was 98 and LGBM was 102 that would be 100 ((98/2)+(102/2))=100 
y_pred['SalePrice'] = 0.5 * y_pred_xgb[0] + 0.5 * y_pred_lgbm[0]
#store the Id column in the dataset
y_pred['Id'] = test['Id']

In [18]:
#save dataset into a csv file for submission
y_pred.to_csv('submission.csv',index=False)