<a href="https://colab.research.google.com/github/sanjit1995/AdvancedHousePrices-kaggle/blob/master/HousePrices_Initial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


Import train and test csv files

In [0]:
# Read the training and test splits into DataFrames.
# NOTE(review): these absolute paths look like a Google Drive mount inside Colab —
# confirm/adjust before running in any other environment.
train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/HousePrices/train.csv")
test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/HousePrices/test.csv")

In [0]:
# Summarise the test frame: per-column dtype and non-null count.
test.info()

In [0]:
# Summarise the training frame: per-column dtype and non-null count.
train.info()

Drop the columns that have very few non-null values or are irrelevant for prediction

In [0]:
# Columns removed from both frames. Keeping the labels in one list guarantees
# that train and test lose exactly the same columns.
columns_to_drop = ['Alley', 'GarageYrBlt', 'PoolQC', 'Fence', 'MiscFeature', 'Id']

# Rebind instead of dropping in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
train = train.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')

Plot a heatmap of the missing values to see which columns still contain nulls

In [0]:
# Heatmap of the boolean null mask of `test` (one cell per value);
# row tick labels and the colour bar are switched off.
sns.heatmap(test.isnull(), yticklabels=False, cbar=False)

Check whether a column holds categories or numbers: missing values are filled with mode() for categorical columns and with mean() for numeric columns

In [0]:
# Frequency of each distinct MasVnrArea value in the test frame.
test['MasVnrArea'].value_counts()

Because the column is numeric and takes many distinct values, its missing entries are replaced with the mean

In [0]:
def naValuesAdjustment(df, req_size=None):
  """Fill missing values in ``df`` in place, column by column.

  Only columns whose non-null count is below ``req_size`` are touched.
  Text columns (object or string dtype) are filled with their most frequent
  value; every other column is filled with its mean.

  Args:
    df: DataFrame to modify in place.
    req_size: Non-null count a column must reach to be considered complete.
      Defaults to ``len(df)``, i.e. every column with a missing value is filled.

  Returns:
    None. ``df`` is modified in place.
  """
  if req_size is None:
    req_size = len(df)

  # Count once up front; calling df.count() inside the loop rescans the whole
  # frame for every column.
  non_null_counts = df.count()

  # Iterate by label: integer keys on a label-indexed Series (count()[i],
  # dtypes[i]) are deprecated positional lookups in recent pandas.
  for name in df.columns[(non_null_counts < req_size).to_numpy()]:
    column = df[name]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
      modes = column.mode()
      if modes.empty:
        # The column holds no values at all, so there is nothing to fill with.
        continue
      fill_value = modes.iloc[0]
    else:
      fill_value = column.mean()
    df[name] = column.fillna(fill_value)

Train data adjustment

In [0]:
# Use the frame's own row count as the completeness threshold instead of a
# hard-coded 1460, so the call stays correct if the data changes.
naValuesAdjustment(df=train, req_size=len(train))

Test data adjustment

In [0]:
# Use the frame's own row count as the completeness threshold instead of a
# hard-coded 1459, so the call stays correct if the data changes.
naValuesAdjustment(df=test, req_size=len(test))

Visualizing Correlation

In [0]:
# Pairwise correlations between the numeric features of the training frame.
# Select the numeric columns explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0, where numeric_only no longer defaults to True.
corrmat = train.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(25,25))
# Plot the matrix computed above rather than recomputing the same correlations.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [0]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every column of a copy of the training frame, then correlate
# the encoded columns with each other.
temp_train = train.copy().apply(LabelEncoder().fit_transform)
corr = temp_train.corr()
print(corr)
print(corr.shape)

# Keep only the strict upper triangle so each pair (i, j) with i < j is looked
# at once; column j is discarded when any earlier column i has corr >= 0.9.
too_similar = np.triu(corr.values >= 0.9, k=1)
columns = ~too_similar.any(axis=0)
print(columns)

selected_columns = temp_train.columns[columns]
temp_train = temp_train[selected_columns]
print(temp_train)

In [0]:
# Boolean mask over the column labels that marks the target column.
is_target = train.columns == 'SalePrice'
temp_x_train = train.loc[:, ~is_target]   # every column except SalePrice
temp_y_train = train.loc[:, is_target]    # single-column frame holding SalePrice

In [0]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# chi2 only accepts non-negative numeric input, but temp_x_train still contains
# the raw string-valued categorical columns, so score the numeric columns only.
numeric_x_train = temp_x_train.select_dtypes(include='number')

# NOTE(review): chi2 is meant for classification targets; for a continuous
# SalePrice, f_regression may be the more appropriate score function — confirm.
# k is capped so the selector never asks for more features than are available.
bestfeatures = SelectKBest(score_func=chi2, k=min(30, numeric_x_train.shape[1]))

# Pass the target as a 1-D array; a single-column DataFrame makes scikit-learn
# emit a column-vector conversion warning.
fit = bestfeatures.fit(numeric_x_train, temp_y_train.values.ravel())
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(numeric_x_train.columns)
# Put feature names and scores side by side for easier reading.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # name the two columns
print(featureScores.nlargest(30,'Score'))

Feature Selection

In [0]:
# Display every column except the SalePrice target; the result is not assigned.
train.loc[:,train.columns != 'SalePrice']

Combine test and training data for one-hot encoding



In [0]:
# Stack train on top of test (row-wise) so both are one-hot encoded together.
# NOTE(review): ignore_index is not set, so each frame keeps its own row labels
# and labels presumably repeat across the two halves — verify before using .loc.
total_data = pd.concat([train,test], axis=0)

In [0]:
def onehot_encode_multicols(df, multicolumns, prefix=False):
  """One-hot encode the given columns and return the widened frame.

  Each column in ``multicolumns`` is replaced by its dummy columns (first
  category dropped). The untouched columns come first, followed by the dummy
  columns in the order the fields were given. ``df`` itself is not modified.

  Args:
    df: Source DataFrame.
    multicolumns: Iterable of column labels to encode (list or array).
    prefix: When True, dummy columns are named ``<field>_<category>``.
      When False (default) they are named after the category alone, so two
      fields that share a category label produce identically named columns.

  Returns:
    A new DataFrame with the encoded columns appended.
  """
  fields = list(multicolumns)
  if not fields:
    # Nothing to encode; return the data unchanged rather than concatenating
    # the frame with itself, which would duplicate every column.
    return df.copy()

  encoded = [
      pd.get_dummies(df[field], prefix=field if prefix else None, drop_first=True)
      for field in fields
  ]
  # drop() returns a new frame, so the caller's df keeps its columns and the
  # function can be called again on the same input.
  remaining = df.drop(columns=fields)
  return pd.concat([remaining] + encoded, axis=1)
    

Select columns which have categories as data and apply one-hot-encoding to them

In [0]:
# Names of all object-dtype columns in the combined frame.
# Each label is paired with its dtype instead of indexing dtypes[i]: integer
# keys on a label-indexed Series are a deprecated positional lookup in recent
# pandas.
categorical_cols = [
    column for column, dtype in total_data.dtypes.items() if dtype == 'object'
]

In [0]:
# Convert the collected column names to a NumPy array and one-hot encode
# those columns of the combined train+test frame.
categorical_cols = np.array(categorical_cols)
total_temp_data = onehot_encode_multicols(total_data, categorical_cols)

In [0]:
# Compute the duplicate mask once (it was previously rebuilt on every loop
# iteration) and print each column label that repeats an earlier one.
duplicate_mask = total_temp_data.columns.duplicated()
for column_name in total_temp_data.columns[duplicate_mask]:
  print(column_name)

Remove duplicate columns in the combined dataframe

In [0]:
# Keep only the first column for each label; later columns with a repeated label are discarded.
# NOTE(review): columns that share a label may still hold different data (dummy
# columns of different source fields) — confirm that dropping them is intended.
total_temp_data = total_temp_data.loc[:,~total_temp_data.columns.duplicated()]

In [14]:
# (rows, columns) of the combined frame after encoding and de-duplication.
total_temp_data.shape

(2919, 176)

Separate the combined data into train and test data

In [0]:
# total_data was built as [train, test], so the first len(train) rows are the
# training rows. Derive the split point instead of hard-coding 1460.
n_train_rows = len(train)
# .copy() gives each split its own data, so later edits do not act on a slice
# of total_temp_data (which triggers SettingWithCopyWarning).
train_data = total_temp_data.iloc[:n_train_rows, :].copy()
test_data = total_temp_data.iloc[n_train_rows:, :].copy()

In [0]:
# The test rows carry no target, so remove the SalePrice column. Rebinding
# (instead of an in-place drop on a slice) avoids SettingWithCopyWarning, and
# errors='ignore' keeps the cell re-runnable.
test_data = test_data.drop(columns=['SalePrice'], errors='ignore')

In [0]:
# Separate the target from the feature matrix.
target_column = 'SalePrice'
y_train = train_data[target_column]
x_train = train_data.drop(columns=[target_column])

In [18]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print(x_train.shape, y_train.shape)

(1460, 175) (1460,)


Hyperparameter Optimization

In [0]:
import xgboost
# Baseline XGBoost regressor with the library's default hyperparameters;
# it serves as the estimator for the randomized search below.
regressor=xgboost.XGBRegressor()

In [0]:
import pickle
filename = "temp_model_HousePrediction.pkl"
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open(...) was never closed.
# NOTE(review): `regressor` has not been fitted at this point, so this stores
# an untrained model — confirm that is intended.
with open(filename, 'wb') as model_file:
  pickle.dump(regressor, model_file)

In [0]:
# Candidate values for each hyperparameter explored by the search.
n_estimators = list(range(500, 1301, 200))   # 500, 700, 900, 1100, 1300
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = list(range(1, 5))         # 1, 2, 3, 4
base_score = [0.25, 0.5, 0.75, 1]

In [0]:
# Map each estimator parameter name to its list of candidate values.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [0]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: 50 sampled configurations, each evaluated with 5-fold
# cross-validation and scored by negative mean absolute error.
random_cv = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    random_state=42,
    n_jobs=4,
    verbose=5,
)

In [24]:
# Run the search on the training data (250 fits; the recorded run took about 16 minutes).
random_cv.fit(x_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   32.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 10.8min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 15.9min finished




RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=...
                   iid='deprecated', n_iter=50, n_jobs=4,
                   param_distributions={'base_score': [0.25, 0.5, 0.75, 1],
                                        'booster': ['g

In [25]:
# Show the estimator configured with the best parameters found by the search.
random_cv.best_estimator_

XGBRegressor(base_score=0.25, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=None, n_estimators=900,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

Copy the parameters of the best estimator shown above into the constructor call below

In [0]:
# Regressor configured with the best parameters reported by the randomized search.
# - 'reg:linear' is the deprecated name of the squared-error objective, so the
#   current name 'reg:squarederror' is used instead (same loss).
# - missing, nthread, seed and silent were all passed as None ("not set") and
#   are omitted; nthread/seed/silent are deprecated aliases that newer XGBoost
#   releases may warn about or reject.
regressor = xgboost.XGBRegressor(
    base_score=0.25,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=2,
    min_child_weight=1,
    n_estimators=900,
    n_jobs=1,
    objective='reg:squarederror',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    verbosity=1,
)

In [0]:
# Train the configured regressor on the full training set.
regressor.fit(x_train,y_train)

In [0]:
# Predict SalePrice for every row of the encoded test frame.
y_pred = regressor.predict(test_data)

In [33]:
# Display the predictions array.
y_pred

array([120798.664, 161697.34 , 186119.52 , ..., 170830.77 , 118554.76 ,
       234535.25 ], dtype=float32)

In [0]:
# Wrap the predictions in a frame so they can be joined column-wise.
pred = pd.DataFrame(y_pred)

# The Id column is taken from the sample submission file.
sub_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/HousePrices/sample_submission.csv")
id_column = sub_df['Id']

# Place Id and prediction side by side, then name the two columns.
datasets = pd.concat([id_column, pred], axis=1)
datasets.columns = ['Id', 'SalePrice']

# Write the submission file without the row index.
datasets.to_csv('sample_submission.csv', index=False)