In [1]:
#importing libraries and modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training data; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv('train.csv')

In [3]:
# Feature frame: every column of the training data except the SalePrice target.
df1 = df.drop(columns=['SalePrice'])

In [4]:
# Preview the first rows of the feature frame.
df1.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [5]:
# Correlation of every numeric column with SalePrice, computed once.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 raises when
# DataFrame.corr() is called on a frame that still contains string columns.
saleprice_corr = df.select_dtypes(include='number').corr()['SalePrice']
# Columns whose correlation with the target lies strictly between -0.1 and 0.1.
insignificant_col = list(
    saleprice_corr[(saleprice_corr < 0.1) & (saleprice_corr > -0.1)].index
)

In [6]:
# Show the columns selected for removal because of their weak correlation with SalePrice.
insignificant_col

['Id',
 'MSSubClass',
 'OverallCond',
 'BsmtFinSF2',
 'LowQualFinSF',
 'BsmtHalfBath',
 '3SsnPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [7]:
# removing features with low correlation
def drop_insignificant_col(data):
    """Remove the weakly correlated columns from ``data`` and hand it back.

    The columns removed are those named in the notebook-level
    ``insignificant_col`` list. ``data`` itself is modified (no copy is
    made) and the same object is returned.
    """
    data.drop(columns=insignificant_col, inplace=True)
    return data

In [8]:
# Drop the low-correlation columns from df1 in place; the returned frame is the cell output.
drop_insignificant_col(df1)

Unnamed: 0,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,Y,0,61,0,0,,,,WD,Normal
1,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,...,Y,298,0,0,0,,,,WD,Normal
2,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,Y,0,42,0,0,,,,WD,Normal
3,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,...,Y,0,35,272,0,,,,WD,Abnorml
4,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,...,Y,192,84,0,0,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,Y,0,40,0,0,,,,WD,Normal
1456,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,Y,349,0,0,0,,MnPrv,,WD,Normal
1457,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,Y,0,60,0,0,,GdPrv,Shed,WD,Normal
1458,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,Y,366,0,112,0,,,,WD,Normal


In [9]:
# The 20 columns of df1 with the most missing values, largest count first.
df1.isna().sum().sort_values(ascending=False).iloc[:20]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageCond        81
GarageFinish      81
GarageQual        81
GarageYrBlt       81
GarageType        81
BsmtFinType2      38
BsmtExposure      38
BsmtCond          37
BsmtQual          37
BsmtFinType1      37
MasVnrArea         8
MasVnrType         8
Electrical         1
2ndFlrSF           0
dtype: int64

In [10]:
# removing features having too many NaN values
def droping_high_missing_features(data):
    """Drop the columns with the highest missing-value counts from ``data``.

    The six columns are removed from ``data`` itself (no copy is made) and
    the same object is returned. A ``KeyError`` is raised by pandas if any
    of them is absent.
    """
    sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
    data.drop(columns=sparse_columns, inplace=True)
    return data

In [11]:
# Drop the high-missing-value columns from df1 in place; the returned frame is the cell output.
droping_high_missing_features(df1)

Unnamed: 0,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,SaleType,SaleCondition
0,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,548,TA,TA,Y,0,61,0,0,WD,Normal
1,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,460,TA,TA,Y,298,0,0,0,WD,Normal
2,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,608,TA,TA,Y,0,42,0,0,WD,Normal
3,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,642,TA,TA,Y,0,35,272,0,WD,Abnorml
4,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,836,TA,TA,Y,192,84,0,0,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,460,TA,TA,Y,0,40,0,0,WD,Normal
1456,RL,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,500,TA,TA,Y,349,0,0,0,WD,Normal
1457,RL,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,252,TA,TA,Y,0,60,0,0,WD,Normal
1458,RL,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,240,TA,TA,Y,366,0,112,0,WD,Normal


In [12]:
from pandas.api.types import is_numeric_dtype

In [13]:
# Split the columns of df1 into two groups: numeric columns with at least
# 25 distinct values go to num_cols, every other column goes to cat_cols.
num_cols, cat_cols = [], []
for col in df1.columns:
    is_continuous = is_numeric_dtype(df1[col]) and df1[col].nunique() >= 25
    (num_cols if is_continuous else cat_cols).append(col)

In [14]:
#checking for high correlation among features
# Correlation matrix of the numeric columns only, computed once. Restricting to
# numeric columns keeps this working on pandas >= 2.0, where DataFrame.corr()
# raises on string columns.
feature_corr = df1.select_dtypes(include='number').corr()
# Per column: how many columns correlate with it above 0.9 in absolute value.
# A column's correlation with itself is included in the count, so a count of 1
# means no other column crosses the threshold.
(feature_corr.abs() > 0.9).sum()

LotArea          1
OverallQual      1
YearBuilt        1
YearRemodAdd     1
MasVnrArea       1
BsmtFinSF1       1
BsmtUnfSF        1
TotalBsmtSF      1
1stFlrSF         1
2ndFlrSF         1
GrLivArea        1
BsmtFullBath     1
FullBath         1
HalfBath         1
BedroomAbvGr     1
KitchenAbvGr     1
TotRmsAbvGrd     1
Fireplaces       1
GarageYrBlt      1
GarageCars       1
GarageArea       1
WoodDeckSF       1
OpenPorchSF      1
EnclosedPorch    1
ScreenPorch      1
dtype: int64

In [15]:
# Categorical columns whose most frequent value covers more than 90% of the rows of df1.
invariable_cat_cols = [
    col for col in cat_cols
    if df1[col].value_counts().max() / df1.shape[0] * 100 > 90
]

In [16]:
# removing categorical features which vary less frequently
def remove_invariable_cols(data):
    """Drop the near-constant categorical columns from ``data`` and return it.

    The columns removed are those listed in the notebook-level
    ``invariable_cat_cols``. ``data`` itself is modified (no copy is made)
    and the same object is returned.
    """
    data.drop(columns=invariable_cat_cols, inplace=True)
    return data

In [17]:
# Drop the near-constant categorical columns from df1 in place; the returned frame is the cell output.
remove_invariable_cols(df1)

Unnamed: 0,MSZoning,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,...,GarageFinish,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,SaleType,SaleCondition
0,RL,8450,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,7,...,RFn,2,548,TA,0,61,0,0,WD,Normal
1,RL,9600,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,6,...,RFn,2,460,TA,298,0,0,0,WD,Normal
2,RL,11250,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,7,...,RFn,2,608,TA,0,42,0,0,WD,Normal
3,RL,9550,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,7,...,Unf,3,642,TA,0,35,272,0,WD,Abnorml
4,RL,14260,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,8,...,RFn,3,836,TA,192,84,0,0,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,7917,Reg,Lvl,Inside,Gilbert,Norm,1Fam,2Story,6,...,RFn,2,460,TA,0,40,0,0,WD,Normal
1456,RL,13175,Reg,Lvl,Inside,NWAmes,Norm,1Fam,1Story,6,...,Unf,2,500,TA,349,0,0,0,WD,Normal
1457,RL,9042,Reg,Lvl,Inside,Crawfor,Norm,1Fam,2Story,7,...,RFn,1,252,TA,0,60,0,0,WD,Normal
1458,RL,9717,Reg,Lvl,Inside,NAmes,Norm,1Fam,1Story,5,...,Unf,1,240,TA,366,0,112,0,WD,Normal


In [18]:
# filling the null values
def imputation(data):
    """Fill missing values in ``data`` column by column and return it.

    Columns in ``num_cols`` are filled with their own mean within ``data``.
    Columns in ``cat_cols`` that are not in ``invariable_cat_cols`` are
    filled with their own most frequent value within ``data``. The filled
    columns are assigned back onto ``data``, which is also returned.
    """
    for name in num_cols:
        data[name] = data[name].fillna(data[name].mean())

    kept_cat_cols = [name for name in cat_cols if name not in invariable_cat_cols]
    for name in kept_cat_cols:
        most_common = data[name].value_counts().idxmax()
        data[name] = data[name].fillna(most_common)

    return data

In [19]:
# Fill the missing values of df1; the returned frame is the cell output.
imputation(df1)

Unnamed: 0,MSZoning,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,...,GarageFinish,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,SaleType,SaleCondition
0,RL,8450,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,7,...,RFn,2,548,TA,0,61,0,0,WD,Normal
1,RL,9600,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,6,...,RFn,2,460,TA,298,0,0,0,WD,Normal
2,RL,11250,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,7,...,RFn,2,608,TA,0,42,0,0,WD,Normal
3,RL,9550,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,7,...,Unf,3,642,TA,0,35,272,0,WD,Abnorml
4,RL,14260,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,8,...,RFn,3,836,TA,192,84,0,0,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,7917,Reg,Lvl,Inside,Gilbert,Norm,1Fam,2Story,6,...,RFn,2,460,TA,0,40,0,0,WD,Normal
1456,RL,13175,Reg,Lvl,Inside,NWAmes,Norm,1Fam,1Story,6,...,Unf,2,500,TA,349,0,0,0,WD,Normal
1457,RL,9042,Reg,Lvl,Inside,Crawfor,Norm,1Fam,2Story,7,...,RFn,1,252,TA,0,60,0,0,WD,Normal
1458,RL,9717,Reg,Lvl,Inside,NAmes,Norm,1Fam,1Story,5,...,Unf,1,240,TA,366,0,112,0,WD,Normal


In [20]:
# Separate copy of the full training data (SalePrice included) for the steps below.
df2 = df.copy()

In [21]:
# Run df2 through the same four preprocessing helpers, in the same order.
# Each helper modifies the frame it receives and returns it, so chaining them
# updates df2 itself and the final frame is the cell output.
(
    df2
    .pipe(drop_insignificant_col)
    .pipe(droping_high_missing_features)
    .pipe(remove_invariable_cols)
    .pipe(imputation)
)

Unnamed: 0,MSZoning,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,...,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,SaleType,SaleCondition,SalePrice
0,RL,8450,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,7,...,2,548,TA,0,61,0,0,WD,Normal,208500
1,RL,9600,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,6,...,2,460,TA,298,0,0,0,WD,Normal,181500
2,RL,11250,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,7,...,2,608,TA,0,42,0,0,WD,Normal,223500
3,RL,9550,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,7,...,3,642,TA,0,35,272,0,WD,Abnorml,140000
4,RL,14260,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,8,...,3,836,TA,192,84,0,0,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,7917,Reg,Lvl,Inside,Gilbert,Norm,1Fam,2Story,6,...,2,460,TA,0,40,0,0,WD,Normal,175000
1456,RL,13175,Reg,Lvl,Inside,NWAmes,Norm,1Fam,1Story,6,...,2,500,TA,349,0,0,0,WD,Normal,210000
1457,RL,9042,Reg,Lvl,Inside,Crawfor,Norm,1Fam,2Story,7,...,1,252,TA,0,60,0,0,WD,Normal,266500
1458,RL,9717,Reg,Lvl,Inside,NAmes,Norm,1Fam,1Story,5,...,1,240,TA,366,0,112,0,WD,Normal,142125


In [22]:
#converting all categorical values to numbers
def target_encoding(data):
    """Replace each retained categorical column of ``data`` by a target mean.

    For every column in ``cat_cols`` that is not in ``invariable_cat_cols``,
    each value is mapped to the mean ``SalePrice`` of the rows of the
    notebook-level frame ``df2`` that share that value. Values with no
    group mean in ``df2`` (for example a category that never occurs there)
    are encoded with the overall mean ``SalePrice`` of ``df2`` instead of
    being left as NaN.

    ``data`` is modified in place and also returned.
    """
    # Fallback for values without a group mean; computed once, outside the loop.
    overall_mean = df2['SalePrice'].mean()
    for col in cat_cols:
        if col in invariable_cat_cols:
            continue
        category_means = df2.groupby(col)['SalePrice'].mean().to_dict()
        data[col] = data[col].map(category_means).fillna(overall_mean)
    return data

In [23]:
# Target-encode the categorical columns of df1 in place; the returned frame is the cell output.
target_encoding(df1)

Unnamed: 0,MSZoning,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,...,GarageFinish,GarageCars,GarageArea,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,SaleType,SaleCondition
0,191004.994787,8450,164754.818378,180183.746758,176938.047529,197965.773333,184495.492063,185763.807377,210051.764045,207716.423197,...,202068.869668,183851.663835,548,182591.864224,0,61,0,0,173401.836622,175202.219533
1,191004.994787,9600,164754.818378,180183.746758,177934.574468,238772.727273,142475.481481,185763.807377,175985.477961,161603.034759,...,202068.869668,183851.663835,460,182591.864224,298,0,0,0,173401.836622,175202.219533
2,191004.994787,11250,206101.665289,180183.746758,176938.047529,197965.773333,184495.492063,185763.807377,210051.764045,207716.423197,...,202068.869668,183851.663835,608,182591.864224,0,42,0,0,173401.836622,175202.219533
3,191004.994787,9550,206101.665289,180183.746758,181623.425856,210624.725490,184495.492063,185763.807377,210051.764045,207716.423197,...,137570.460641,309636.121547,642,182591.864224,0,35,272,0,173401.836622,146526.623762
4,191004.994787,14260,206101.665289,180183.746758,177934.574468,335295.317073,184495.492063,185763.807377,210051.764045,274735.535714,...,202068.869668,309636.121547,836,182591.864224,192,84,0,0,173401.836622,175202.219533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,191004.994787,7917,164754.818378,180183.746758,176938.047529,192854.506329,184495.492063,185763.807377,210051.764045,161603.034759,...,202068.869668,183851.663835,460,182591.864224,0,40,0,0,173401.836622,175202.219533
1456,191004.994787,13175,164754.818378,180183.746758,176938.047529,189050.068493,184495.492063,185763.807377,175985.477961,161603.034759,...,137570.460641,183851.663835,500,182591.864224,349,0,0,0,173401.836622,175202.219533
1457,191004.994787,9042,164754.818378,180183.746758,176938.047529,210624.725490,184495.492063,185763.807377,210051.764045,207716.423197,...,202068.869668,128116.688347,252,182591.864224,0,60,0,0,173401.836622,175202.219533
1458,191004.994787,9717,164754.818378,180183.746758,176938.047529,145847.080000,184495.492063,185763.807377,175985.477961,133523.347607,...,137570.460641,128116.688347,240,182591.864224,366,0,112,0,173401.836622,175202.219533


In [24]:
# Single-step pipeline whose only step is named 'classifier' and starts out as an XGBRegressor.
pipe = Pipeline(steps=[('classifier', XGBRegressor())])

In [25]:
# Two search grids over the 'classifier' step: one for XGBRegressor, one for Ridge.
grid_param = [
    dict(
        classifier=[XGBRegressor()],
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.03, 0.1, 0.3],
    ),
    dict(
        classifier=[Ridge()],
        classifier__alpha=[0.01, 0.3, 1, 3],
    ),
]

In [26]:
# 5-fold grid search over grid_param, using the single-step pipeline as estimator.
GScv2 = GridSearchCV(estimator=pipe, param_grid=grid_param, cv=5)

In [27]:
# Wrap each preprocessing helper in a FunctionTransformer, one transformer per helper.
ft1, ft2, ft3, ft4, ft5 = (
    FunctionTransformer(step)
    for step in (
        drop_insignificant_col,
        droping_high_missing_features,
        remove_invariable_cols,
        imputation,
        target_encoding,
    )
)

In [28]:
# Full pipeline: the five preprocessing transformers followed by the grid search.
# NOTE(review): this rebinds `pipe`. GScv2 was built from the earlier single-step
# Pipeline that `pipe` referred to, so re-running the GridSearchCV cell after this
# one would wrap this pipeline instead - confirm cells are only run top to bottom.
pipe = make_pipeline(ft1,ft2,ft3,ft4,ft5,GScv2)

In [29]:
# Fit on a fresh feature frame (training data without SalePrice) against the SalePrice target.
pipe.fit(df.drop(columns=['SalePrice']), df['SalePrice'])

In [30]:
# Load the test data; 'test.csv' is resolved relative to the working directory.
df_test  = pd.read_csv('test.csv')

In [31]:
# Start the submission from the Id column. The explicit copy makes it an
# independent frame, so adding a column to it later does not trigger pandas'
# SettingWithCopyWarning.
submission = df_test[['Id']].copy()

In [32]:
# Predict on a copy of the test frame: the pipeline's preprocessing steps drop
# columns from their input in place, so passing df_test itself would strip its
# columns and make this cell fail with a KeyError when run a second time.
pred = pipe.predict(df_test.copy())

In [33]:
# Store the predictions next to the Id column.
submission['SalePrice'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['SalePrice'] = pred


In [34]:
# Show the submission frame (Id and predicted SalePrice).
submission

Unnamed: 0,Id,SalePrice
0,1461,121332.398438
1,1462,154711.140625
2,1463,186524.000000
3,1464,188210.796875
4,1465,188983.765625
...,...,...
1454,2915,78011.585938
1455,2916,84157.664062
1456,2917,147415.578125
1457,2918,118437.992188


In [35]:
# Write the submission to 'submission2.csv' without the index column.
submission.to_csv('submission2.csv',index = False)