

<div align="center">
  <h1>House Price Prediction</h1>
</div>

In [5]:
import pandas as p

# Raise both display limits to 100 so wide or long frames are not truncated when shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    p.set_option(display_option, 100)

# Read the training and test CSV files (paths are relative to the working directory).
train_df, test_df = (
    p.read_csv(csv_path)
    for csv_path in ('housetrain_data.csv', 'housetest_data.csv')
)

print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)


Training data shape: (1600, 10)
Test data shape: (400, 10)


In [6]:
# Short working names for the loaded frames. These are plain rebinds: no copy is
# made, so htrain/htest refer to the same objects as train_df/test_df.
htrain, htest = train_df, test_df

In [22]:
# NOTE: this tuple is evaluated but never shown -- only the last expression of a
# cell is echoed, and it is not the last expression here.
(htrain.shape, htest.shape)

# List every column name so the target column can be identified by eye.
column_names = list(htrain.columns)
print("Columns in training data:")
print(column_names)

# Preview the first five rows.
htrain.head(5)


Columns in training data:
['Id', 'Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Location', 'Condition', 'Garage', 'Price']


Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,969,4483,4,4,3,1933,Urban,Excellent,No,434325
1,241,1062,3,3,1,1970,Downtown,Good,No,614772
2,820,1422,3,4,1,1993,Urban,Good,Yes,922811
3,693,2658,2,3,1,1972,Rural,Poor,Yes,794314
4,421,3286,2,4,1,1981,Rural,Excellent,Yes,796988


In [8]:
# Names of the training columns that contain at least one missing value,
# kept in the frame's column order.
has_null = htrain.isnull().any()
null_value_cols = has_null[has_null].index.tolist()

In [9]:
# Number of missing values in each column listed in null_value_cols.
htrain[null_value_cols].isna().sum()

Series([], dtype: float64)

In [10]:
# Drop these five columns from the training frame when present; errors='ignore'
# skips any name the frame does not have.
# NOTE(review): none of these names appear in the column list printed for this
# dataset, so this looks like a no-op here -- confirm whether it is still needed.
unwanted_train_cols = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
htrain = htrain.drop(columns=unwanted_train_cols, errors='ignore')

In [11]:
# Drop the same five columns from the test frame when present; errors='ignore'
# skips any name the frame does not have.
# NOTE(review): presumably a no-op for this dataset as well -- confirm.
unwanted_test_cols = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
htest = htest.drop(columns=unwanted_test_cols, errors='ignore')

In [12]:
# Re-check the missing-value counts of the flagged columns after the column drops.
htrain[null_value_cols].isna().sum()

Series([], dtype: float64)

In [13]:
# dtypes of the null-flagged columns. Evaluated but not displayed, because it is
# not the last expression of the cell.
htrain[null_value_cols].dtypes
# Remove 'LotFrontage' if the frame has it; errors='ignore' skips it otherwise.
htrain = htrain.drop(columns=['LotFrontage'], errors='ignore')

In [14]:
# Keep only the rows that have no missing value in any column, then show the
# shape of the result.
fresh_housing_train = htrain.dropna(axis=0, how='any')
fresh_housing_train.shape

(1600, 10)

In [15]:
# Partition the training columns by type.
# Object-dtype columns are split at 10 distinct values into a low-cardinality
# group (< 10) and a high-cardinality group (>= 10).
train_object_cols = [col for col in fresh_housing_train.columns
                     if fresh_housing_train[col].dtype == object]
low_cardinality_col = [col for col in train_object_cols
                       if fresh_housing_train[col].nunique() < 10]
high_cardinality_col = [col for col in train_object_cols
                        if fresh_housing_train[col].nunique() >= 10]

# Numeric columns. select_dtypes(include='number') matches every integer/float
# width. The earlier `dtype in [int, float]` test compared against NumPy's
# default int/float dtypes only, so e.g. int32 or float32 columns (and int64
# columns on platforms whose default int is 32-bit) were silently left out.
num_col = fresh_housing_train.select_dtypes(include='number').columns.tolist()

In [16]:
# Reassemble the training frame with its columns grouped in this order:
# low-cardinality object columns, high-cardinality object columns, numeric columns.
column_groups = (low_cardinality_col, high_cardinality_col, num_col)
new_housing_train = p.concat(
    [fresh_housing_train[group] for group in column_groups],
    axis=1,
)

In [17]:
# Drop 'Id' from the training frame (presumably a row identifier rather than a
# feature -- confirm). errors='ignore' skips names that are absent.
new_housing_train = new_housing_train.drop(columns=['Id'], errors='ignore')

# Drop 'LotFrontage' and 'Id' from the test frame under the same rule.
test_only_drops = ['LotFrontage', 'Id']
htest = htest.drop(columns=test_only_drops, errors='ignore')

In [18]:
# NOTE: this cell rebinds null_value_cols, low_cardinality_col,
# high_cardinality_col and num_col, which until now described the training
# frame; from here on they describe the test frame.

# Test columns that contain at least one missing value.
null_value_cols = [col for col in htest.columns if htest[col].isnull().any()]
# Test rows with any missing value, restricted to those columns. Stored in a
# variable for inspection: as a bare mid-cell expression it was never displayed.
test_rows_with_nulls = htest.loc[htest.isnull().any(axis=1), null_value_cols]

# Keep only the test rows that have no missing value in any column.
fresh_housing_test = htest.dropna()

# Object-dtype columns, split at 10 distinct values into low/high cardinality.
test_object_cols = [col for col in fresh_housing_test.columns
                    if fresh_housing_test[col].dtype == object]
low_cardinality_col = [col for col in test_object_cols
                       if fresh_housing_test[col].nunique() < 10]
high_cardinality_col = [col for col in test_object_cols
                        if fresh_housing_test[col].nunique() >= 10]

# Numeric columns. select_dtypes(include='number') matches every integer/float
# width. The earlier `dtype in [int, float]` test compared against NumPy's
# default int/float dtypes only, so e.g. int32 or float32 columns (and int64
# columns on platforms whose default int is 32-bit) were silently left out.
num_col = fresh_housing_test.select_dtypes(include='number').columns.tolist()

# Reassemble with columns grouped as: low-cardinality, high-cardinality, numeric.
new_housing_test = p.concat([fresh_housing_test[low_cardinality_col],
                             fresh_housing_test[high_cardinality_col],
                             fresh_housing_test[num_col]], axis=1)

In [19]:
# Snapshot both prepared frames so their current values stay available after
# the originals are modified.
copy_of_housing_train, copy_of_housing_test = (
    frame.copy() for frame in (new_housing_train, new_housing_test)
)

In [20]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode as integers.
cat_col = low_cardinality_col + high_cardinality_col

# One encoder per column, fitted on the training data ONLY, and its mapping
# reused for the test frame. Fitting a fresh encoder on the test frame (as was
# done before) can assign a different integer to the same category whenever
# the two frames do not contain exactly the same set of values, which makes
# the encoded test features inconsistent with what the model was trained on.
label_encoders = {}
for col in cat_col:
    le = LabelEncoder()
    new_housing_train[col] = le.fit_transform(new_housing_train[col])
    label_encoders[col] = le  # kept so the mapping can be inspected or reused

    # Apply the training mapping to the test frame. Categories that never
    # occurred in training have no code; they are marked with -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    new_housing_test[col] = (
        new_housing_test[col].map(code_by_label).fillna(-1).astype(int)
    )

In [23]:
from sklearn.model_selection import train_test_split

# The target is taken to be whichever column comes last in the training frame.
target_col = new_housing_train.columns.tolist()[-1]
print(f"Target column: {target_col}")

# Features are every column except the target.
X = new_housing_train.drop(columns=[target_col])
y = new_housing_train[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=766
)
(X_train.shape, X_test.shape)


Target column: Price


((1280, 8), (320, 8))

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Both estimators use randomness while fitting. Without a seed, every run
# produces different models and therefore different error figures, so
# recorded results cannot be reproduced. 766 is the same seed already used for
# the train/validation split.
MODEL_SEED = 766
dtr = DecisionTreeRegressor(random_state=MODEL_SEED)
rfr = RandomForestRegressor(random_state=MODEL_SEED)

# Fit each model on the training split and predict on the held-out split.
dtr.fit(X_train, y_train)
dtr_prediction = dtr.predict(X_test)
rfr.fit(X_train, y_train)
rfr_prediction = rfr.predict(X_test)

In [25]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Decision tree: error on the held-out split.
dtr_error, dtr_error_percentage = (
    metric(y_test, dtr_prediction)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)
# Displayed as (mean absolute error, mean absolute percentage error in percent).
(dtr_error, dtr_error_percentage * 100)

(339658.828125, 107.823965012127)

In [26]:
# Random forest: error on the held-out split.
rfr_error = mean_absolute_error(y_true=y_test, y_pred=rfr_prediction)
rfr_error_percentage = mean_absolute_percentage_error(
    y_true=y_test, y_pred=rfr_prediction
)
# Displayed as (mean absolute error, mean absolute percentage error in percent).
(rfr_error, rfr_error_percentage * 100)

(265588.15499999997, 95.04680120983198)