In [208]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [209]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [210]:
# Load the training file and split it positionally: the last column becomes
# `target`, every other column stays in `train_data` as the feature frame.
# NOTE(review): the last column is presumably SalePrice (the submission cell
# uses that name) - confirm against the CSV header.
train_data = pd.read_csv('train.csv')
target = train_data.iloc[:, -1]
train_data = train_data.iloc[:, :-1]

In [211]:
# Dimensions of the feature frame after the last column was split off as the target.
train_data.shape

(1460, 80)

In [212]:
# Remove the 'Alley' and 'Id' columns from the feature frame.
train_data = train_data.drop(columns=['Alley', 'Id'])

# Print the distinct values (NaN included) of a hand-picked set of
# basement / electrical / garage columns.
cols_to_inspect = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]
for col_name in cols_to_inspect:
    print(train_data[col_name].unique())

['Gd' 'TA' 'Ex' nan 'Fa']
['TA' 'Gd' nan 'Fa' 'Po']
['No' 'Gd' 'Mn' 'Av' nan]
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
['RFn' 'Unf' 'Fin' nan]
['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
['TA' 'Fa' nan 'Gd' 'Po' 'Ex']


In [213]:
# Percentage of missing values per column; columns at or above 10% are dropped.
# `removed_cols` is kept because the test frame has the same columns removed later.
null_pct = train_data.isna().sum(axis=0) / len(train_data) * 100
removed_cols = null_pct[null_pct >= 10].index
train_data = train_data.drop(columns=removed_cols)
removed_cols

Index(['LotFrontage', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

In [214]:
# Names of the remaining columns that contain at least one missing value.
null_counts = train_data.isna().sum(axis=0)
columns_with_null = null_counts[null_counts > 0].index

In [215]:
# Show the columns that still contain at least one missing value.
columns_with_null

Index(['MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond'],
      dtype='object')

In [216]:
# Spot-check the dtype of a single column (BsmtCond).
train_data["BsmtCond"].dtype

dtype('O')

In [217]:
# Partition the null-containing columns by dtype:
#   l1 -> columns whose dtype is object
#   l2 -> every other column (numerical)
l1 = [name for name in columns_with_null if train_data[name].dtype == 'O']
l2 = [name for name in columns_with_null if train_data[name].dtype != 'O']


In [218]:
# Show both groups of null-containing columns: l1 (object dtype) first, then l2 (all other dtypes).
print(l1)
print(l2)

['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
['MasVnrArea', 'GarageYrBlt']


In [219]:
# Print the distinct values (NaN included) of each object-dtype column in l1.
for cat_col in l1:
    print(train_data[cat_col].unique())

['Gd' 'TA' 'Ex' nan 'Fa']
['TA' 'Gd' nan 'Fa' 'Po']
['No' 'Gd' 'Mn' 'Av' nan]
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
['RFn' 'Unf' 'Fin' nan]
['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
['TA' 'Fa' nan 'Gd' 'Po' 'Ex']


In [220]:
# Display the dtype of every column collected in l1.
train_data.loc[:,l1].dtypes

BsmtQual        object
BsmtCond        object
BsmtExposure    object
BsmtFinType1    object
BsmtFinType2    object
Electrical      object
GarageType      object
GarageFinish    object
GarageQual      object
GarageCond      object
dtype: object

In [221]:
# Display the dtype of every column collected in l2.
train_data.loc[:,l2].dtypes

MasVnrArea     float64
GarageYrBlt    float64
dtype: object

In [222]:
# Count how often each BsmtQual value occurs.
train_data['BsmtQual'].value_counts()

BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64

In [223]:
# Print the numerical (l2) and object-dtype (l1) null-containing column lists.
# The np.sort(train_data.columns) call that used to precede this was removed:
# its result was neither stored nor displayed.
print(l2, l1)

['MasVnrArea', 'GarageYrBlt'] ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']


In [224]:
# Columns that get ordinal encoding, each mapped to its category list in
# encoding order (the position in the list is the code the encoder assigns).
#
# Contributor note: only add or complete entries for columns whose category
# order you actually know, leave everything else untouched, and do not merge
# your branch directly into master.
ordinal_categories = {
    'MSZoning': ['A', 'C (all)', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
    'Street': ['Grvl', 'Pave'],
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'LandContour': ['Low', 'HLS', 'Bnk', 'Lvl'],
    'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
    'LotConfig': ['FR3', 'FR2', 'CulDSac', 'Corner', 'Inside'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    'BldgType': ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
    'HouseStyle': ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],
    'RoofStyle': ['Mansard', 'Gambrel', 'Flat', 'Shed', 'Gable', 'Hip'],
    'RoofMatl': ['Roll', 'Membran', 'Tar&Grv', 'CompShg', 'WdShngl', 'WdShake', 'Metal', 'ClyTile'],
    'Exterior1st': [
        'ImStucc', 'CBlock', 'BrkComm', 'AsphShn', 'AsbShng', 'WdShing', 'Wd Sdng', 'MetalSd',
        'Plywood', 'HdBoard', 'VinylSd', 'Stucco', 'CemntBd', 'BrkFace', 'Stone',
    ],
    'Exterior2nd': [
        'Other', 'ImStucc', 'CBlock', 'Brk Cmn', 'AsphShn', 'AsbShng', 'Wd Shng', 'Wd Sdng',
        'MetalSd', 'Plywood', 'HdBoard', 'VinylSd', 'Stucco', 'CmentBd', 'BrkFace', 'Stone',
    ],
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Foundation': ['Stone', 'Slab', 'CBlock', 'BrkTil', 'Wood', 'PConc'],
    'BsmtQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'HeatingQC': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PavedDrive': ['N', 'P', 'Y'],
    'GarageCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'GarageType': ['NA', 'Detchd', 'CarPort', 'BuiltIn', 'Basment', 'Attchd', '2Types'],
    'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
}

# l3: the ordinal column names; maps: their category lists, in the same order.
l3 = list(ordinal_categories)
maps = list(ordinal_categories.values())

In [225]:
# maps is later passed as the category lists for the l3 columns, so the two lengths should match.
len(maps), len(l3)

(29, 29)

In [226]:
# For every ordinal column, print its name next to the distinct values present
# in the training data, then display the MSZoning frequency table.
for ordinal_col in l3:
    print(ordinal_col, train_data[ordinal_col].unique())

train_data['MSZoning'].value_counts()

MSZoning ['RL' 'RM' 'C (all)' 'FV' 'RH']
Street ['Pave' 'Grvl']
LotShape ['Reg' 'IR1' 'IR2' 'IR3']
LandContour ['Lvl' 'Bnk' 'Low' 'HLS']
Utilities ['AllPub' 'NoSeWa']
LotConfig ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
LandSlope ['Gtl' 'Mod' 'Sev']
BldgType ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
HouseStyle ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
RoofStyle ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
RoofMatl ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
Exterior1st ['VinylSd' 'MetalSd' 'Wd Sdng' 'HdBoard' 'BrkFace' 'WdShing' 'CemntBd'
 'Plywood' 'AsbShng' 'Stucco' 'BrkComm' 'AsphShn' 'Stone' 'ImStucc'
 'CBlock']
Exterior2nd ['VinylSd' 'MetalSd' 'Wd Shng' 'HdBoard' 'Plywood' 'Wd Sdng' 'CmentBd'
 'BrkFace' 'Stucco' 'AsbShng' 'Brk Cmn' 'ImStucc' 'AsphShn' 'Stone'
 'Other' 'CBlock']
ExterQual ['Gd' 'TA' 'Ex' 'Fa']
ExterCond ['TA' 'Gd' 'Fa' 'Po' 'Ex']
Foundation ['PConc' 'CBlock' 'BrkTil' 'Wood' 'Slab' 'Stone']
BsmtQu

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64

In [227]:
# l4: nominal columns  -> object-dtype columns that are not listed in l3
# l5: numerical columns -> every column whose dtype is not object
object_cols = [name for name in train_data.columns if train_data[name].dtype == 'O']
l4 = [name for name in object_cols if name not in l3]
l5 = [name for name in train_data.columns if train_data[name].dtype != 'O']

In [228]:
# impute then encode
# transformer = ColumnTransformer(transformers=[
#     ('tnf1',SimpleImputer(strategy="mean"),l2),
#     ('tnf2',SimpleImputer(strategy="most_frequent"),l1),
# ],remainder='passthrough')
# train_data

In [229]:
# transformed_train_data = pd.DataFrame(transformer.fit_transform(train_data),columns=train_data.columns)


In [230]:
# transformed_train_data

In [231]:
# Display the distinct MSZoning values present in the training data.
# The two bare expressions that used to precede this line were removed:
# only a cell's last expression is displayed, so they had no effect.
train_data['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

In [232]:
# Print the distinct values (NaN included) of the basement / electrical /
# garage columns just before they are imputed.
cols_to_inspect = [
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]
for col_name in cols_to_inspect:
    print(train_data[col_name].unique())

['Gd' 'TA' 'Ex' nan 'Fa']
['TA' 'Gd' nan 'Fa' 'Po']
['No' 'Gd' 'Mn' 'Av' nan]
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
['RFn' 'Unf' 'Fin' nan]
['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
['TA' 'Fa' nan 'Gd' 'Po' 'Ex']


In [233]:
# Fill value per column: the mean rounded to 2 decimals for the numerical
# columns in l2, the most frequent value for the object-dtype columns in l1.
dict_ = {
    **{num_col: train_data[num_col].mean().round(2) for num_col in l2},
    **{cat_col: train_data[cat_col].mode().values[0] for cat_col in l1},
}

# Replace the missing values in place, column by column.
train_data.fillna(dict_, inplace=True)

In [234]:
# Show the most frequent BsmtQual value (same mode() expression as the one used to build the fill values).
print(train_data['BsmtQual'].mode().values[0])

TA


In [235]:
# Encoding plan:
#   tnf1 - ordinal-encode the l3 columns using the category lists in maps
#   tnf2 - one-hot encode the l4 columns (dense output, first level dropped)
# All remaining columns are passed through unchanged.
ordinal_step = ('tnf1', OrdinalEncoder(categories=maps), l3)
onehot_step = ('tnf2', OneHotEncoder(sparse_output=False, drop='first'), l4)
transformer2 = ColumnTransformer(
    transformers=[ordinal_step, onehot_step],
    remainder='passthrough',
)

In [236]:
# m = 0
# for i in l3:
#     print(train_data[i].value_counts(),"    ", maps[m])
#     m+=1

In [237]:
# Fit transformer2 on the training features and encode them in the same call.
new_data = transformer2.fit_transform(train_data) 

In [238]:
# Inspect the encoded training matrix.
new_data

array([[5.000e+00, 1.000e+00, 3.000e+00, ..., 0.000e+00, 2.000e+00,
        2.008e+03],
       [5.000e+00, 1.000e+00, 3.000e+00, ..., 0.000e+00, 5.000e+00,
        2.007e+03],
       [5.000e+00, 1.000e+00, 2.000e+00, ..., 0.000e+00, 9.000e+00,
        2.008e+03],
       ...,
       [5.000e+00, 1.000e+00, 3.000e+00, ..., 2.500e+03, 5.000e+00,
        2.010e+03],
       [5.000e+00, 1.000e+00, 3.000e+00, ..., 0.000e+00, 4.000e+00,
        2.010e+03],
       [5.000e+00, 1.000e+00, 3.000e+00, ..., 0.000e+00, 6.000e+00,
        2.008e+03]])

In [239]:
# Number of feature columns after encoding.
new_data.shape[1]

126

In [240]:
# from sklearn.linear_model import LinearRegression

In [241]:
# lr = LinearRegression()
# model = lr.fit(new_data,target)

In [242]:
from sklearn.ensemble import RandomForestRegressor

In [243]:
# Random forest regressor with 10 estimators; random_state is pinned to a constant.
rf = RandomForestRegressor(n_estimators=10, random_state=0)

In [244]:
from sklearn.model_selection import train_test_split

In [245]:
# Hold out 30% of the encoded training rows for evaluation (seeded split).
x_train, x_test, y_train, y_test = train_test_split(
    new_data,
    target,
    test_size=0.3,
    random_state=20,
)

In [246]:
# Shapes of the training split's features and targets.
x_train.shape, y_train.shape

((1022, 126), (1022,))

In [247]:
# Fit the forest on the training split; the value returned by fit() is kept as
# `model` and used for all later predict() calls.
model = rf.fit(x_train, y_train)

In [248]:
# Predict on the held-out split.
y_pred = model.predict(x_test)

In [249]:
from sklearn.metrics import root_mean_squared_error

In [250]:
# Root mean squared error between the held-out targets and the predictions.
root_mean_squared_error(y_test, y_pred)

33633.190614729596

In [251]:
# Load the file whose rows the final predictions are made for.
test = pd.read_csv('test.csv')

In [252]:
# Remove 'Alley' as was done for the training frame. 'Id' stays in `test`
# because the submission cell reads test['Id'].
test = test.drop(columns=["Alley"])

In [253]:
# Remove the same high-null columns that were dropped from the training frame.
test = test.drop(columns=removed_cols)

In [254]:
# Impute the test frame.
#
# The fill values come from the TRAINING data, not from the test set, so the
# test rows are preprocessed with the same statistics as the rows the model was
# fitted on and a prediction does not depend on which other rows are in the
# test file. train_data has already been imputed by this point, so every column
# has a defined mean / mode.
null_counts = test.isna().sum(axis=0)
columns_with_null = null_counts[null_counts > 0].index

# l1: object-dtype test columns with nulls, l2: all other test columns with nulls.
l1 = [col for col in columns_with_null if test[col].dtype == 'O']
l2 = [col for col in columns_with_null if test[col].dtype != 'O']

# Fill value per column: rounded training mean for l2, training mode for l1.
dict_ = {}
for col in l2:
    dict_[col] = train_data[col].mean().round(2)
for col in l1:
    dict_[col] = train_data[col].mode().values[0]

test = test.fillna(dict_)

# Total number of missing values left in the test frame (expected to be 0).
test.isna().sum().sum()


0

In [255]:
# Encode the test frame with the already-fitted transformer2 (transform only, no refit).
test_transformed = transformer2.transform(test)

In [256]:
# Number of feature columns in the encoded test matrix.
test_transformed.shape[1]

126

In [257]:
# Predict the target for every encoded test row.
pred = model.predict(test_transformed)

In [258]:
# Pair each test Id with its predicted SalePrice, on a fresh 0..n-1 index.
submission = pd.DataFrame(
    {'Id': test['Id'], 'SalePrice': pred}
).reset_index(drop=True)

In [259]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,117900.0
1,1462,155965.0
2,1463,176618.5
3,1464,166140.0
4,1465,193355.6


In [260]:
# Write the submission to disk; index=False keeps the row index out of the file.
submission.to_csv('submission.csv',index=False)

In [261]:
# Preview the submission frame again after it has been written.
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,117900.0
1,1462,155965.0
2,1463,176618.5
3,1464,166140.0
4,1465,193355.6
