In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [2]:
# Load the training and test sets, using the 'Id' column as the row index
X = pd.read_csv('./input/train.csv', index_col='Id')
X_test = pd.read_csv('./input/test.csv', index_col='Id')

# Discard training rows that have no target, then split the target off
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# For simplicity, every predictor that has a gap in the *training* data is
# removed from both frames (test-only gaps are left in place here)
has_missing = X.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

In [3]:
# Categorical (object-dtype) predictors that remain in the training data
object_cols = [col for col in X.columns if X[col].dtype == "object"]
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X[col].nunique() < 10]
# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

# Fill gaps that exist only in the test set, using statistics computed from
# the training set so no information flows from test data into preprocessing.
nullCol = [col for col in X_test.columns if X_test[col].isnull().any()]
for col in nullCol:
    if X_test[col].dtype == 'object':
        # Series.mode() returns the modal values in sorted order, so a tie is
        # resolved the same way on every run. Taking max() over a set of
        # strings is not reproducible because set iteration order depends on
        # per-process string hashing.
        fill_value = X[col].mode().iloc[0]
    else:
        fill_value = X[col].median()
    X_test[col] = X_test[col].fillna(fill_value)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; try the current keyword first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training categories only; categories unseen at fit time are
# encoded as all-zero rows because of handle_unknown='ignore'.
# The encoder returns bare arrays, so the original row index is re-attached.
OH_cols_train = pd.DataFrame(
    one_hot_encoder.fit_transform(X[low_cardinality_cols]),
    index=X.index,
)
OH_cols_test = pd.DataFrame(
    one_hot_encoder.transform(X_test[low_cardinality_cols]),
    index=X_test.index,
)

# Remove every categorical column (both the encoded low-cardinality ones and
# the discarded high-cardinality ones), leaving only numeric predictors.
num_X_train = X.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Final design matrices: one-hot columns first, numeric columns after.
OH_X_train = pd.concat([OH_cols_train,num_X_train], axis=1)
OH_X_test = pd.concat([OH_cols_test,num_X_test], axis=1)

Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Exterior2nd', 'Neighborhood', 'Exterior1st']


In [4]:
# Boosting hyperparameters.
# Each round can shrink the remaining residual by at most a factor of
# (1 - learning_rate). The previous rate of 0.005 leaves at least
# 0.995**200 ~= 37% of the initial residual after 200 rounds, so predictions
# stayed far from the targets. At 0.05 the same number of rounds leaves a
# negligible 0.95**200 of it.
N_ESTIMATORS = 200
LEARNING_RATE = 0.05

# Fit gradient-boosted trees on the encoded training matrix.
model = XGBRegressor(n_estimators=N_ESTIMATORS, learning_rate=LEARNING_RATE)
model.fit(OH_X_train, y)

# Predict sale prices for the test set and write them out with the house Id
# as an explicit column.
preds_test = model.predict(OH_X_test)
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission1.csv', index=False)
print("SUCCESS!")

SUCCESS!
