In [326]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as XGB
from scipy.stats import norm, skew

In [327]:
# Load the competition CSVs. The training frame drops its 'Id' column; the test
# frame keeps it because it is needed later when writing the submission.
train_path = '../input/house-prices-advanced-regression-techniques/train.csv'
test_path = '../input/house-prices-advanced-regression-techniques/test.csv'

train = pd.read_csv(train_path).drop(columns=['Id'])
test = pd.read_csv(test_path)


In [328]:
# Convert every column of `test` with pd.to_numeric; because errors='coerce',
# any value that cannot be parsed as a number becomes NaN.
# NOTE(review): if test.csv carries the same text-valued columns that are encoded
# later in this notebook (MSZoning, Street, ...), this replaces them with NaN for
# every test row, and the later "more than 5 missing" filter would then drop
# those columns entirely — confirm this is intended.
test = test.apply(pd.to_numeric, errors='coerce')

In [329]:
# Number of missing values in each column of the test frame.
test.isna().sum()

In [330]:
# Show the dtype of every column in the test frame.
test.dtypes

In [331]:
# Display the training frame (the notebook renders a truncated preview).
train

In [332]:
# Missing-value counts for the last 50 columns of the training frame.
train.isna().sum().tail(50)

In [333]:
# For every feature, plot the total SalePrice of the rows sharing each feature value.
# The target itself is skipped: grouping SalePrice by SalePrice is not informative.
for column in train.columns.drop('SalePrice'):
    train.groupby(column).agg({'SalePrice': 'sum'}).plot(
        figsize=(20, 10),
        title=f'Total SalePrice by {column}',
    )
    # Render each figure as soon as it is built so that dozens of figures are
    # not kept open at the same time.
    plt.show()

In [334]:
# Heatmap of the features whose absolute correlation with SalePrice exceeds 0.5.
# Correlation is only defined for numeric data, so restrict the frame explicitly
# (pandas >= 2.0 no longer drops text columns silently in DataFrame.corr).
numeric_train = train.select_dtypes(include=['number', 'bool'])
corr = numeric_train.corr()
most_correlated_features = corr.index[corr['SalePrice'].abs() > 0.5]

# Compute the reduced correlation matrix once and reuse it for mask and plot.
top_corr = train[most_correlated_features].corr()

plt.figure(figsize=(15, 10))
# Hide the upper triangle (including the diagonal). The builtin `bool` replaces
# the `np.bool` alias, which no longer exists in current NumPy releases.
mask = np.triu(np.ones_like(top_corr, dtype=bool))
g = sns.heatmap(top_corr, annot=True, mask=mask)

In [335]:
# Heatmap of the pairwise correlations between all numeric training columns.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(12, 12))
# Select numeric columns explicitly: DataFrame.corr raises on text columns in
# pandas >= 2.0 instead of ignoring them.
sns.heatmap(train.select_dtypes(include=['number', 'bool']).corr(), cmap=cmap, ax=ax)
# Slant the x tick labels so the column names stay readable.
f.autofmt_xdate()

In [336]:
# Stack the training rows on top of the test rows so both get the same
# preprocessing. Column order is not sorted and the original row labels are
# kept (no ignore_index), so index labels repeat across the two parts.
df = pd.concat((train, test), axis='index', sort=False)

In [337]:
# Encode MSZoning as string codes '1'..'7' following the order below.
# Values not listed (including NaN) are left unchanged.
mszoning_levels = ['RL', 'RM', 'C (all)', 'FV', 'RH', 'I (all)', 'A (agr)']
df['MSZoning'] = df['MSZoning'].replace(
    {level: str(code) for code, level in enumerate(mszoning_levels, start=1)}
)


In [338]:
# List the distinct values present in the Utilities column.
df['Utilities'].unique()

In [339]:
# Missing-value counts for the first 50 columns of the combined frame.
df.isna().sum().head(50)

In [340]:
# Encode Street: 'Pave' -> '1', 'Grvl' -> '2'. Other values are left unchanged.
street_codes = {'Pave': '1', 'Grvl': '2'}
df['Street'] = df['Street'].replace(street_codes)

In [341]:
# Encode LotShape as string codes '1'..'4' following the order below.
lotshape_levels = ['Reg', 'IR1', 'IR2', 'IR3']
df['LotShape'] = df['LotShape'].replace(
    {level: str(code) for code, level in enumerate(lotshape_levels, start=1)}
)

In [342]:
# Encode LandContour as string codes '1'..'4' following the order below.
landcontour_levels = ['Lvl', 'Bnk', 'Low', 'HLS']
df['LandContour'] = df['LandContour'].replace(
    {level: str(code) for code, level in enumerate(landcontour_levels, start=1)}
)

In [343]:
# Encode Utilities as string codes '1'..'3' following the order below.
utilities_levels = ['AllPub', 'NoSeWa', 'NoSewr']
df['Utilities'] = df['Utilities'].replace(
    {level: str(code) for code, level in enumerate(utilities_levels, start=1)}
)


In [344]:
# Encode LotConfig as string codes '1'..'5' following the order below.
lotconfig_levels = ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3']
df['LotConfig'] = df['LotConfig'].replace(
    {level: str(code) for code, level in enumerate(lotconfig_levels, start=1)}
)

In [345]:
# Encode LandSlope as string codes '1'..'3' following the order below.
landslope_levels = ['Gtl', 'Mod', 'Sev']
df['LandSlope'] = df['LandSlope'].replace(
    {level: str(code) for code, level in enumerate(landslope_levels, start=1)}
)

In [346]:
# Encode Neighborhood as string codes '1'..'28'; a name's code is its 1-based
# position in the list below. Values not listed are left unchanged.
neighborhood_levels = [
    'CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 'NWAmes',
    'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes', 'SawyerW', 'IDOTRR',
    'MeadowV', 'Edwards', 'Timber', 'Gilbert', 'StoneBr', 'ClearCr', 'NPkVill',
    'Blmngtn', 'BrDale', 'SWISU', 'Blueste', 'Greens', 'GrnHill', 'Landmrk',
]
df['Neighborhood'] = df['Neighborhood'].replace(
    {level: str(code) for code, level in enumerate(neighborhood_levels, start=1)}
)

In [347]:
# Encode Condition1 as string codes '1'..'9' following the order below.
condition1_levels = ['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA', 'RRNe']
df['Condition1'] = df['Condition1'].replace(
    {level: str(code) for code, level in enumerate(condition1_levels, start=1)}
)

In [348]:
# Encode Condition2 as string codes '1'..'8' following the order below.
# Unlike Condition1, 'RRNe' is not mapped here and would be left unchanged.
condition2_levels = ['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA']
df['Condition2'] = df['Condition2'].replace(
    {level: str(code) for code, level in enumerate(condition2_levels, start=1)}
)

In [349]:
# Encode BldgType as string codes '1'..'5' following the order below.
bldgtype_levels = ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs']
df['BldgType'] = df['BldgType'].replace(
    {level: str(code) for code, level in enumerate(bldgtype_levels, start=1)}
)

In [350]:
# Encode HouseStyle as string codes '1'..'8' following the order below.
housestyle_levels = ['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf', '2.5Fin']
df['HouseStyle'] = df['HouseStyle'].replace(
    {level: str(code) for code, level in enumerate(housestyle_levels, start=1)}
)

In [351]:
# Encode RoofStyle as string codes '1'..'6' following the order below.
roofstyle_levels = ['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed']
df['RoofStyle'] = df['RoofStyle'].replace(
    {level: str(code) for code, level in enumerate(roofstyle_levels, start=1)}
)

In [352]:
# Encode RoofMatl as string codes '1'..'8' following the order below.
roofmatl_levels = ['CompShg', 'WdShngl', 'Metal', 'WdShake', 'Membran', 'Tar&Grv', 'Roll', 'ClyTile']
df['RoofMatl'] = df['RoofMatl'].replace(
    {level: str(code) for code, level in enumerate(roofmatl_levels, start=1)}
)

In [353]:
# Encode Exterior1st as string codes '1'..'16'; a material's code is its
# 1-based position in the list below. Values not listed are left unchanged.
exterior1st_levels = [
    'VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
    'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
    'Stone', 'ImStucc', 'CBlock', 'PreCast',
]
df['Exterior1st'] = df['Exterior1st'].replace(
    {level: str(code) for code, level in enumerate(exterior1st_levels, start=1)}
)


In [354]:
# Encode Exterior2nd as string codes '1'..'17'; a material's code is its
# 1-based position in the list below. Note the spellings differ from the
# Exterior1st list ('Wd Shng', 'CmentBd', 'Brk Cmn') and 'Other' is extra.
exterior2nd_levels = [
    'VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'Wd Shng',
    'CmentBd', 'Plywood', 'AsbShng', 'Stucco', 'Brk Cmn', 'AsphShn',
    'Stone', 'ImStucc', 'CBlock', 'Other', 'PreCast',
]
df['Exterior2nd'] = df['Exterior2nd'].replace(
    {level: str(code) for code, level in enumerate(exterior2nd_levels, start=1)}
)


In [355]:
# Encode ExterQual as string codes '1'..'4' following the order below.
# NOTE(review): the codes follow list order (Gd, TA, Ex, Fa), which differs from
# the Ex-first order used for HeatingQC/KitchenQual — confirm this is intended.
exterqual_levels = ['Gd', 'TA', 'Ex', 'Fa']
df['ExterQual'] = df['ExterQual'].replace(
    {level: str(code) for code, level in enumerate(exterqual_levels, start=1)}
)

In [356]:
# Encode ExterCond as string codes '1'..'5' following the order below.
# NOTE(review): the codes follow list order (Gd, TA, Ex, Fa, Po), which differs
# from the Ex-first order used for HeatingQC/KitchenQual — confirm this is intended.
extercond_levels = ['Gd', 'TA', 'Ex', 'Fa', 'Po']
df['ExterCond'] = df['ExterCond'].replace(
    {level: str(code) for code, level in enumerate(extercond_levels, start=1)}
)

In [357]:
# Encode Foundation as string codes '1'..'6' following the order below.
foundation_levels = ['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone']
df['Foundation'] = df['Foundation'].replace(
    {level: str(code) for code, level in enumerate(foundation_levels, start=1)}
)

In [358]:
# Encode Heating as string codes '1'..'6' following the order below.
heating_levels = ['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor']
df['Heating'] = df['Heating'].replace(
    {level: str(code) for code, level in enumerate(heating_levels, start=1)}
)

In [359]:
# Encode HeatingQC as string codes '1'..'5' following the order below.
heatingqc_levels = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
df['HeatingQC'] = df['HeatingQC'].replace(
    {level: str(code) for code, level in enumerate(heatingqc_levels, start=1)}
)

In [360]:
# Encode CentralAir: 'Y' -> '1', 'N' -> '2'. Other values are left unchanged.
centralair_codes = {'Y': '1', 'N': '2'}
df['CentralAir'] = df['CentralAir'].replace(centralair_codes)

In [361]:
# Encode KitchenQual as string codes '1'..'5' following the order below.
kitchenqual_levels = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
df['KitchenQual'] = df['KitchenQual'].replace(
    {level: str(code) for code, level in enumerate(kitchenqual_levels, start=1)}
)

In [362]:
# Encode Functional as string codes '1'..'8' following the order below.
functional_levels = ['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev', 'Sal']
df['Functional'] = df['Functional'].replace(
    {level: str(code) for code, level in enumerate(functional_levels, start=1)}
)

In [363]:
# Encode PavedDrive: 'Y' -> '1', 'N' -> '2', 'P' -> '3'. Other values are left unchanged.
paveddrive_codes = {'Y': '1', 'N': '2', 'P': '3'}
df['PavedDrive'] = df['PavedDrive'].replace(paveddrive_codes)

In [364]:
# Encode SaleType as string codes '1'..'10'. Values not listed are left unchanged.
# The original key was 'WD ' (with a trailing space), which only matches if the
# raw value carries that space. Both spellings are accepted so a plain 'WD' is
# encoded as '1' as well. replace() matches whole values, so 'Con' does not
# touch 'ConLD'/'ConLI'/'ConLw'.
saletype_codes = {
    'WD': '1',
    'WD ': '1',
    'New': '2',
    'COD': '3',
    'ConLD': '4',
    'ConLI': '5',
    'CWD': '6',
    'ConLw': '7',
    'Con': '8',
    'Oth': '9',
    'VWD': '10',
}
df['SaleType'] = df['SaleType'].replace(saletype_codes)


In [365]:
# Encode SaleCondition as string codes '1'..'6' following the order below.
salecondition_levels = ['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family']
df['SaleCondition'] = df['SaleCondition'].replace(
    {level: str(code) for code, level in enumerate(salecondition_levels, start=1)}
)

In [366]:
# Missing-value count for every column of the combined frame.
df.isna().sum()

In [367]:
# Build a per-column summary of missing data: absolute count and fraction of
# rows, each sorted from most to least missing, shown side by side.
null_mask = df.isnull()
null_counts = null_mask.sum()

Missing_Values = null_counts.sort_values(ascending=False)
# count() on the boolean mask is the number of rows, so this is the missing fraction.
Percentage = (null_counts / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat(
    [Missing_Values, Percentage],
    axis=1,
    keys=['Missing_Values', 'Percentage'],
)
missing_data.head(30)

In [368]:
# Remove every column with more than 5 missing values in the combined frame.
# NOTE(review): columns that exist in only one of train/test (e.g. the target or
# an id column) are missing for all rows of the other part, so they presumably
# get dropped here too — confirm against the missing_data table.
sparse_columns = missing_data.index[missing_data['Missing_Values'] > 5]
df = df.drop(columns=sparse_columns)

In [369]:
# The 50 columns with the most remaining missing values, largest first.
df.isna().sum().sort_values(ascending=False).head(50)

In [370]:
# Columns whose missing entries are filled with 0.
num_col = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageArea',
    'GarageCars',
    'TotalBsmtSF',
]

# Replace each listed column with its zero-filled version; column order is kept.
df = df.assign(**{column: df[column].fillna(0) for column in num_col})

In [371]:
# Compute the skewness of every non-object column, sorted from most positive
# to most negative.
numeric_inputs = df.dtypes[df.dtypes != 'object'].index
skewed = df[numeric_inputs].apply(lambda x: skew(x)).sort_values(ascending=False)

# Report features with |skew| > 0.5. The absolute value must be taken before
# the comparison; `abs(skewed > 0.5)` only applied abs() to the boolean mask and
# therefore hid negatively skewed features that the next cell does transform.
print(skewed[abs(skewed) > 0.5])

In [372]:
# Features whose skewness magnitude exceeds 0.5.
highskewed = skewed[skewed.abs() > 0.5]

# Replace each of those columns with log(1 + x); column order is kept.
df = df.assign(**{column: np.log1p(df[column]) for column in highskewed.index})

In [373]:
# Missing-value counts for the last 50 columns of the combined frame.
df.isna().sum().tail(50)

In [374]:
# Show the dtype of every remaining column in the combined frame.
df.dtypes

In [375]:
# Re-check after the transforms: the 50 columns with the most missing values.
df.isna().sum().sort_values(ascending=False).head(50)

In [376]:
# Target comes straight from the original training frame.
y_train = train['SalePrice']

# The combined frame holds the training rows first, so split it by position:
# the first len(y_train) rows are training features, the rest are test features.
n_train_rows = len(y_train)
X_train = df.iloc[:n_train_rows]
X_test = df.iloc[n_train_rows:]

In [377]:
# Missing-value count for every column of the test feature matrix.
X_test.isna().sum()

In [378]:
# Display the test feature matrix.
X_test

In [379]:
# Number of rows in the test feature matrix.
len(X_test)

In [380]:
# Number of training targets; should equal len(X_train).
len(y_train)

In [381]:
# Number of rows in the training feature matrix; should equal len(y_train).
len(X_train)

In [382]:
# Display the test feature matrix once more before fitting the model.
X_test

In [383]:
# Hyperparameters for the gradient-boosted regressor, kept in one place so they
# are easy to read and tweak. random_state fixes the seed for reproducibility.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
    nthread=-1,
)

# Note: this variable holds the model *instance*, despite sharing the class name.
XGBRegressor = XGB.XGBRegressor(**xgb_params)
XGBRegressor.fit(X_train, y_train)

In [384]:
# Predict SalePrice for the test feature matrix with the fitted model and show
# the resulting array.
y_pred =XGBRegressor.predict(X_test)
y_pred

In [385]:
# 0.14187 — presumably the leaderboard score of this submission (TODO confirm).
# Pair each test Id with its predicted price and write the submission file
# without the row index.
output = test[['Id']].assign(SalePrice=y_pred)
output.to_csv('submission.csv', index=False)


In [386]:
# Display the predicted SalePrice values again.
y_pred