In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets, ensemble, metrics  
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import datetime as dt

In [None]:
# Load the training and test tables from the Kaggle input directory.
# NOTE(review): the two files are read from different input folders
# (and the training file is named "housetrain.csv") - confirm both belong
# to the same dataset version.
train = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/housetrain.csv'
)
test = pd.read_csv(
    '../input/house-prices-data/test.csv'
)

In [None]:
# Preview the first five rows of the training data
train.head(5)

In [None]:
# Check columns: display the column labels of the training frame.
# (Swap in `test.columns` to inspect the test frame instead.)


train.columns

In [None]:
# Check Shape
# A notebook cell only renders its last expression, so both shapes are
# returned together as a (train_shape, test_shape) pair.
train.shape, test.shape

In [None]:
# Check the missing values in Train/Test
# Each frame's 25 columns with the most missing values, ranked ascending.
missing_train = train.isnull().sum().sort_values().tail(25)
missing_test = test.isnull().sum().sort_values().tail(25)

# A cell only renders its last expression, so the two rankings are placed
# side by side in a single table instead of evaluating them one after another
# (which would silently drop the train summary).
pd.concat(
    {
        'train': missing_train.rename_axis('column').reset_index(name='n_missing'),
        'test': missing_test.rename_axis('column').reset_index(name='n_missing'),
    },
    axis=1,
)

In [None]:
# Visualise the null mask of the training data as a heatmap
# (row tick labels and the colour bar are switched off).
sns.heatmap(data=train.isnull(), cbar=False, yticklabels=False)

In [None]:
# Frequency of each distinct GarageArea value in the test frame
test.loc[:, 'GarageArea'].value_counts()

In [None]:
# Partition the training columns by dtype: object-typed columns are treated
# as categorical, everything else as numeric.
column_dtypes = train.dtypes

cat_cols = [name for name, dtype in column_dtypes.items() if dtype == "object"]
num_cols = [name for name, dtype in column_dtypes.items() if dtype != "object"]

print('Numeric columns:', num_cols)
print('Categorical columns:', cat_cols)

In [None]:
# Stack train on top of test so both go through the same preprocessing
final_df = pd.concat((train, test), axis='index')
final_df.shape

In [None]:
# Pull the categorical columns out of the combined frame
cat_final = final_df.loc[:, cat_cols]
cat_final.shape

In [None]:
# Remove the categorical columns from the combined frame (in place),
# leaving only the non-object columns behind.
final_df.drop(columns=cat_cols, inplace=True)
final_df.shape


In [None]:
# Fill missing numeric values with the column mean.
# The imputer returns a bare array, so the column labels are re-attached
# while rebuilding the DataFrame.
imputerr = SimpleImputer(strategy='mean')
imputed_values = imputerr.fit_transform(final_df)
num_final = pd.DataFrame(imputed_values, columns=final_df.columns)

num_final.shape


In [None]:
# Impute and one-hot encode the categorical columns
# (a OneHotEncoder is used here, not a LabelEncoder).

# Fill missing categories with each column's most frequent value; the imputer
# returns a bare array, so the column labels are re-attached.
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_cat = pd.DataFrame(
    my_imputer.fit_transform(cat_final),
    columns=cat_final.columns,
)

# One-hot encode every categorical column. The encoder's default sparse
# result is densified explicitly with .toarray() instead of passing
# `sparse=False`, a flag that newer scikit-learn releases no longer accept.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_cols_final = pd.DataFrame(OH_encoder.fit_transform(imputed_cat).toarray())

# The new frame already carries a fresh 0..n-1 index, so it lines up row for
# row with the numeric frame without any further index manipulation.
OH_cols_final.shape


In [None]:
# Place the imputed numeric columns and the one-hot columns side by side
final = pd.concat((num_final, OH_cols_final), axis='columns')
final.shape

In [None]:
# Null confirmation: the 25 largest per-column null counts of the combined frame
null_counts = final.isna().sum()
null_counts.sort_values().tail(25)

In [None]:
# Split back to train/ test
# The combined frame was built train-first, so the first len(train) rows are
# the training rows; deriving the boundary from the data avoids a hard-coded
# row count.
n_train = len(train)

# .copy() gives each part its own data, so the column drops/assignments done
# in later cells do not operate on slices of `final`.
train = final.iloc[:n_train].copy()
test = final.iloc[n_train:].copy()

In [None]:
# A cell only renders its last expression, so both shapes are returned
# together as a (train_shape, test_shape) pair.
train.shape, test.shape

In [None]:
#Separate Target Variable

y = train['SalePrice']

# Drop the target by rebinding to new frames rather than with inplace=True:
# train/test were sliced out of `final`, and in-place edits on such slices
# trigger pandas' SettingWithCopyWarning.
# The test rows only carry a SalePrice column because the frames were
# processed together; it is removed so train and test share the same features.
train = train.drop(columns=['SalePrice'])
test = test.drop(columns=['SalePrice'])

In [None]:
# Display the target Series (the SalePrice column taken from train)
y

In [None]:
#(OPTIONAL) Interactive helper for timing long-running cells.

def timer(start_time = None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Called without a (truthy) ``start_time`` it returns the current
    ``datetime``. Called with a previously returned ``start_time`` it prints
    the elapsed hours, minutes and seconds and returns ``None``.
    """
    now = dt.datetime.now()

    # No start given: this call starts the clock.
    if not start_time:
        return now

    elapsed_seconds = (now - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    print('\n Time Taken :  %i Hours %i Minutes %i Seconds.' % (hours, minutes, round(seconds, 2)))

In [None]:
# Define model

def modelfit(alg, dtrain, predictors, target, useTrainCV=True, cv_folds=5, early=50):
    """Fit ``alg`` on ``dtrain[predictors]`` and report its training error.

    Parameters
    ----------
    alg : XGBoost scikit-learn estimator; it is modified in place
        (``n_estimators`` may be overwritten) and fitted.
    dtrain : DataFrame holding the predictor columns.
    predictors : column labels of ``dtrain`` to train on.
    target : target values aligned with the rows of ``dtrain``.
    useTrainCV : when true, run ``xgb.cv`` first and set ``n_estimators`` to
        the number of rounds in the returned CV result.
    cv_folds : number of folds passed to ``xgb.cv``.
    early : early-stopping patience (in rounds) passed to ``xgb.cv``.

    Prints the CV table (when computed) and the training MSE, and draws a bar
    chart of the booster's feature scores. Returns ``None``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # Build the DMatrix from the same columns the estimator is fitted on.
        xgtrain = xgb.DMatrix(dtrain[predictors], label=target)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            early_stopping_rounds=early,
            metrics={'rmse'},
        )
        print(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. No eval_metric is passed to fit(): without
    # an eval_set it has no effect, and newer XGBoost releases reject it there.
    alg.fit(dtrain[predictors], target)

    # Predict training set
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Print model report
    print("\nModel Report")
    print("MSE : {}".format(metrics.mean_squared_error(target, dtrain_predictions)))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')


# Default-parameter estimator; it is the base estimator handed to the grid
# search further down.
my_model = XGBRegressor()

predictors = train.columns
xgb1 = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # tree_method='gpu_hist',
    booster='gbtree',
    nthread=4,
    scale_pos_weight=1,
    seed=20,
)

# modelfit must be defined before this call; it previously sat below it, so
# the cell raised NameError on a fresh kernel.
modelfit(xgb1, train, predictors, y)

In [None]:
# Display the column labels of the training frame
train.columns

In [None]:
# Define Param set
# Wide search space over the main XGBoost hyperparameters.
# NOTE(review): the next cell rebinds `params` to a narrower grid, so this
# dict is only used if that cell is skipped.
# 'criterion' was removed: it is not an XGBoost parameter, so it only doubled
# the number of candidates without changing any fitted model.

params = {
    'learning_rate'    :  [0.01, 0.1, 0.2],
    'max_depth'        :  [2, 8, 16],
    'min_child_weight' :  [1, 5, 8],
    'subsample'        :  [0.7, 0.85, 1.0],
    'gamma'            :  [0, 0.1, 0.2],
    'colsample_bytree' :  [0.1, 0.5, 0.9],
    'n_estimators'     :  [100, 500, 900, 1400],
    'booster'          :  ['gbtree'],
}

In [None]:
# I used GridSearch. RandomizedSearch also gives good result
# Every hyperparameter except the two regularisation terms is pinned to a
# single value, so the grid only searches reg_alpha x reg_lambda.
params = {
    'learning_rate':    [0.1],
    'n_estimators':     [300],
    'max_depth':        [5],
    'min_child_weight': [3],
    'gamma':            [0.05],
    'subsample':        [0.6],
    'colsample_bytree': [0.55],
    # The last two entries were fused into the invalid literal `1.5.2.0`
    # (a syntax error); they are the separate values 1.5 and 2.0.
    'reg_alpha':        [0, 0.001, 0.08, 1, 1.25, 1.5, 2.0],
    'reg_lambda':       [0, 0.001, 0.08, 1, 1.25, 1.5, 2.0],
}

# 10-fold cross-validated grid search scored by negative RMSE, on all cores.
grid_srch = GridSearchCV(estimator=my_model, param_grid=params,
                         cv=10,
                         scoring='neg_root_mean_squared_error',
                         n_jobs=-1, verbose=5, return_train_score=True)

In [None]:
# Run the hyperparameter search and print how long it took
start_time = timer()

grid_srch.fit(X=train, y=y)

timer(start_time)

In [None]:
# Find best hyperparameters:
# display the estimator refitted by the search (cv=10 grid search) with the
# best-scoring parameter combination.
grid_srch.best_estimator_

In [None]:
#Define model with best Hyperparameters
# NOTE(review): these values are hard-coded rather than taken from
# grid_srch.best_params_ - presumably pasted from an earlier search run;
# confirm they are still the intended ones.
# `criterion='gini'` was dropped: it is not an XGBoost parameter and had no
# effect on the fitted model.
my_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7,
             gamma=0.15, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.2, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None,
             monotone_constraints='()', n_estimators=200, n_jobs=0,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1.0, tree_method='exact',
             validate_parameters=1, verbosity=None)

# Fit the model on the full training set
my_model.fit(train, y)

# Get predictions for the test set
predictions = my_model.predict(test)

In [None]:
# SUBMISSION
# The numeric imputation step returned floating-point columns, so Id holds
# values like 1461.0; converting that straight to str would write "1461.0"
# into the file. Cast back to integers instead.
# The Id is taken from `test` without modifying it, so the feature frame keeps
# the dtypes the model was trained and predicted with.
my_submission = pd.DataFrame({
    'Id': test['Id'].astype(int),
    'SalePrice': predictions,
})

my_submission.to_csv('submission.csv', index=False)