In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the train and test tables; both use their 'Id' column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Show the column labels of the training table.
train_data.columns

In [None]:
# Names of the training columns that contain at least one missing value.
# `.any()` already yields a boolean mask per column, so it is used directly
# rather than being compared against True.
train_col_null = train_data.columns[train_data.isnull().any()].tolist()

# Count of missing values in each of those columns.
train_data[train_col_null].isnull().sum()

In [None]:
# Preview the first rows of the test table.
test_data.head()

In [None]:
# Show the column labels of the test table.
test_data.columns

In [None]:
# Names of the test columns that contain at least one missing value.
# `.any()` already yields a boolean mask per column, so it is used directly
# rather than being compared against True.
test_col_null = test_data.columns[test_data.isnull().any()].tolist()
# Count of missing values in each of those columns.
test_data[test_col_null].isnull().sum()

In [None]:
# Keep only rows whose target is known, then split target from predictors.
# The predictor frame is built with a non-mutating drop so that no derived
# frame is modified in place and re-running the cell always gives the same X.
train_labeled = train_data.dropna(axis=0, subset=['SalePrice'])
y = train_labeled.SalePrice
X = train_labeled.drop(columns=['SalePrice'])

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Object-dtype columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 10
]

In [None]:
# Display the selected low-cardinality column names.
low_cardinality_cols

In [None]:
# Columns of the training split whose dtype is int64 or float64.
numeric_col = [
    col
    for col in X_train_full.columns
    if X_train_full[col].dtype in ['int64', 'float64']
]

In [None]:
# Display the selected numeric column names.
numeric_col

In [None]:
# Restrict all three splits to the chosen columns, copying each so later
# edits do not touch the full frames.
my_cols = low_cardinality_cols + numeric_col
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, test_data)
)

In [None]:
# One-hot encode the categorical columns of each split independently.
X_train, X_valid, X_test = map(pd.get_dummies, (X_train, X_valid, X_test))

In [None]:
# Give the validation frame exactly the training frame's columns.
# A dummy column that exists only in training means no validation row has
# that category, so it is filled with 0 rather than the default NaN, which
# the model would otherwise treat as a missing value.
X_train, X_valid = X_train.align(X_valid, join='left', axis=1, fill_value=0)

In [None]:
# Give the test frame exactly the training frame's columns.
# A dummy column that exists only in training means no test row has that
# category, so it is filled with 0 rather than the default NaN, which the
# model would otherwise treat as a missing value.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [None]:
# Regressor settings: 2000 estimators with a learning rate of 0.05.
xgb_params = {'n_estimators': 2000, 'learning_rate': 0.05}
xgb = XGBRegressor(**xgb_params)

In [None]:
# Fit the regressor on the encoded training features and their targets.
xgb.fit(X_train, y_train)

In [None]:
# Predict targets for the validation features.
y_pred = xgb.predict(X_valid)

In [None]:
# Mean absolute error on the validation split.
# Arguments follow scikit-learn's documented order (y_true, y_pred); the value
# is the same either way for MAE, but this order matches the API contract.
mae = mean_absolute_error(y_valid, y_pred)
print(mae)

In [None]:
# Predict targets for the test features.
prediction = xgb.predict(X_test)

In [None]:
# Build the submission table (test index as 'Id' plus the predicted price),
# write it without the DataFrame index, and preview it.
submission_columns = {
    'Id': X_test.index,
    'SalePrice': prediction,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
output.head()