In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler ,StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
train = pd.read_csv('.\\datasets\\train.csv')
test = pd.read_csv('.\\datasets\\test.csv')

In [3]:
train = train.drop(['Id','YearRemodAdd','GarageYrBlt','MoSold','YrSold'], axis=1)
test = test.drop(['YearRemodAdd','GarageYrBlt','MoSold','YrSold'], axis=1)
ids = test.pop('Id')

In [4]:
train['YearBuilt'] = 2023 - train['YearBuilt']
test['YearBuilt'] = 2023 - test['YearBuilt'] # now it becomes age of the house

In [5]:
numeric_features = train.select_dtypes(include=[np.number]).columns
categorical_features = train.select_dtypes(include=[object]).columns
numeric_features_1 = test.select_dtypes(include=[np.number]).columns
categorical_features_1 = test.select_dtypes(include=[object]).columns


In [6]:
ab = train.drop('SalePrice',axis=1)
numeric_features_scale = ab.select_dtypes(include=[np.number]).columns

# fill NaN values or drop them

In [7]:
i = 0
for col in categorical_features:
    a = train[col].value_counts()
    b = train[col].isnull().sum()
    if b > 0:
        train[col] = train[col].fillna("No")
        print(f'column:{col}: {a},\n null_sum = {b}, index :{i}')
    
    i = i+1

column:Alley: Alley
Grvl    50
Pave    41
Name: count, dtype: int64,
 null_sum = 1369, index :2
column:MasVnrType: MasVnrType
BrkFace    445
Stone      128
BrkCmn      15
Name: count, dtype: int64,
 null_sum = 872, index :17
column:BsmtQual: BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64,
 null_sum = 37, index :21
column:BsmtCond: BsmtCond
TA    1311
Gd      65
Fa      45
Po       2
Name: count, dtype: int64,
 null_sum = 37, index :22
column:BsmtExposure: BsmtExposure
No    953
Av    221
Gd    134
Mn    114
Name: count, dtype: int64,
 null_sum = 38, index :23
column:BsmtFinType1: BsmtFinType1
Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: count, dtype: int64,
 null_sum = 37, index :24
column:BsmtFinType2: BsmtFinType2
Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: count, dtype: int64,
 null_sum = 38, index :25
column:Electrical: Electrical
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix     

In [8]:
i = 0
for col in categorical_features_1:
    a = test[col].value_counts()
    b = test[col].isnull().sum()
    if b > 0:
        test[col] = test[col].fillna("No")
        print(f'column:{col}: {a},\n null_sum = {b}, index :{i}')
    
    i = i+1

column:MSZoning: MSZoning
RL         1114
RM          242
FV           74
C (all)      15
RH           10
Name: count, dtype: int64,
 null_sum = 4, index :0
column:Alley: Alley
Grvl    70
Pave    37
Name: count, dtype: int64,
 null_sum = 1352, index :2
column:Utilities: Utilities
AllPub    1457
Name: count, dtype: int64,
 null_sum = 2, index :5
column:Exterior1st: Exterior1st
VinylSd    510
MetalSd    230
HdBoard    220
Wd Sdng    205
Plywood    113
CemntBd     65
BrkFace     37
WdShing     30
AsbShng     24
Stucco      18
BrkComm      4
AsphShn      1
CBlock       1
Name: count, dtype: int64,
 null_sum = 1, index :15
column:Exterior2nd: Exterior2nd
VinylSd    510
MetalSd    233
HdBoard    199
Wd Sdng    194
Plywood    128
CmentBd     66
Wd Shng     43
BrkFace     22
Stucco      21
AsbShng     18
Brk Cmn     15
ImStucc      5
CBlock       2
AsphShn      1
Stone        1
Name: count, dtype: int64,
 null_sum = 1, index :16
column:MasVnrType: MasVnrType
BrkFace    434
Stone      121
BrkCm

In [9]:
i = 0
for col in numeric_features:
    a = train[col].value_counts()
    b = train[col].isnull().sum()
    if b > 0:
        train[col] = train[col].fillna(train[col].median())
        print(f'column:{col}: {a},\n null_sum = {b}, index :{i}')
    i = i+1

column:LotFrontage: LotFrontage
60.0     143
70.0      70
80.0      69
50.0      57
75.0      53
        ... 
137.0      1
141.0      1
38.0       1
140.0      1
46.0       1
Name: count, Length: 110, dtype: int64,
 null_sum = 259, index :1
column:MasVnrArea: MasVnrArea
0.0      861
180.0      8
72.0       8
108.0      8
120.0      7
        ... 
562.0      1
89.0       1
921.0      1
762.0      1
119.0      1
Name: count, Length: 327, dtype: int64,
 null_sum = 8, index :6


In [10]:
i = 0
for col in numeric_features_1:
    a = test[col].value_counts()
    b = test[col].isnull().sum()
    if b > 0:
        test[col] = test[col].fillna(test[col].median())
        print(f'column:{col}: {a},\n null_sum = {b}, index :{i}')
    i = i+1

column:LotFrontage: LotFrontage
60.0     133
80.0      68
70.0      63
50.0      60
75.0      52
        ... 
117.0      1
31.0       1
119.0      1
25.0       1
140.0      1
Name: count, Length: 115, dtype: int64,
 null_sum = 227, index :1
column:MasVnrArea: MasVnrArea
0.0       877
176.0      10
144.0       9
120.0       8
216.0       8
         ... 
647.0       1
1290.0      1
495.0       1
292.0       1
382.0       1
Name: count, Length: 303, dtype: int64,
 null_sum = 15, index :6
column:BsmtFinSF1: BsmtFinSF1
0.0       462
24.0       15
276.0       6
602.0       6
300.0       5
         ... 
1337.0      1
656.0       1
706.0       1
496.0       1
337.0       1
Name: count, Length: 669, dtype: int64,
 null_sum = 1, index :7
column:BsmtFinSF2: BsmtFinSF2
0.0      1278
483.0       3
162.0       3
294.0       3
144.0       2
         ... 
110.0       1
186.0       1
449.0       1
48.0        1
344.0       1
Name: count, Length: 161, dtype: int64,
 null_sum = 1, index :8
column:BsmtUnf

there is no NaN value now

# we will change string values into numbers:

In [11]:
for col in categorical_features:
    t = 0
    for i in train[col].unique():
        train[col] = train[col].apply(lambda x: t if x == i else x)
        #print(i)
        t = t+1

In [12]:
for col in categorical_features_1:
    t = 0
    for i in test[col].unique():
        test[col] = test[col].apply(lambda x: t if x == i else x)
        #print(i)
        t = t+1

# now lets remove outliers based on locaion and price per sqft

In [13]:
df = train.copy()

In [14]:
df['price_per_sqft'] = df['SalePrice']/df['LotArea']

In [15]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('Neighborhood'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

In [16]:
df2 = remove_pps_outliers(df)

In [17]:
df1 = df2.copy()

In [18]:
df2

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,SalePrice,price_per_sqft
0,60,0,65.0,8450,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,208500,24.674556
1,60,0,68.0,11250,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,223500,19.866667
2,20,0,91.0,10652,0,0,1,0,0,0,...,0,0,0,0,0,0,1,2,279500,26.239204
3,20,0,75.0,9742,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,230000,23.609115
4,20,0,85.0,11049,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,179900,16.282016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,50,0,50.0,9638,0,0,0,0,0,0,...,116,0,0,0,0,0,0,0,169000,17.534758
1091,70,0,51.0,9842,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,189000,19.203414
1092,30,0,52.0,6292,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,91000,14.462810
1093,50,0,51.0,6171,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,137450,22.273538


scaling values

In [19]:
scaler = MinMaxScaler()
df2[numeric_features] = scaler.fit_transform(df2[numeric_features])
df2[categorical_features] = scaler.fit_transform(df2[categorical_features])
df1[numeric_features_scale] = scaler.fit_transform(df1[numeric_features_scale])
df1[categorical_features] = scaler.fit_transform(df1[categorical_features])

In [20]:
x = df2.drop(['SalePrice','price_per_sqft'], axis=1)
y = df2['SalePrice']

In [21]:
x_s = df1.drop(['SalePrice','price_per_sqft'], axis=1)
y_s = df1['SalePrice']

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x_s, y_s, test_size=0.2, random_state=5)

creating model

In [23]:
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [24]:
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

In [25]:
rf_random.fit(x_train, y_train.values.ravel())

Fitting 3 folds for each of 100 candidates, totalling 300 fits


141 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
141 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chubbyshady\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chubbyshady\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\chubbyshady\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\chubbyshady\AppData\Local\Progra

In [26]:
print("Best parameters found:")
print(rf_random.best_params_)

Best parameters found:
{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}


In [27]:
rf = RandomForestRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=1,max_features='sqrt',max_depth=30)

In [28]:
y_pred = rf_random.predict(x_test)

In [29]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

Mean Squared Error: 337035161.7784179
Root Mean Squared Error: 18358.51741776601
R-squared Score: 0.9190908829020956


In [30]:
rf_random.score(x_test,y_test)

0.9190908829020956

In [31]:
rf.fit(x_train,y_train)

In [32]:
rf.score(x_test,y_test)

0.9186691653306532

In [33]:
y_pred = rf.predict(test)

In [34]:
output = pd.DataFrame({'Id': ids,
                       'SalePrice': y_pred.squeeze()})

In [35]:
output

Unnamed: 0,Id,SalePrice
0,1461,334902.174
1,1462,364375.908
2,1463,381656.614
3,1464,390917.634
4,1465,344052.706
...,...,...
1454,2915,273260.910
1455,2916,352945.990
1456,2917,358643.366
1457,2918,276028.040


In [38]:
sub = pd.read_csv('.\\datasets\\sample_submission.csv')

In [39]:
sub

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [40]:
output.to_csv('submission.csv', index=False)