In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.stats import boxcox
from sklearn.metrics import mean_squared_error as mse
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib import cm
from sklearn import ensemble

In [None]:
# Load the training and test CSVs; the first column of each file becomes the row index.
train, test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('train.csv', 'test.csv')
)

#get info

def get_details(data, target='SalePrice'):
    """Summarise every column of ``data`` in a single table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe. It must contain ``target`` as a numeric column.
    target : str, default 'SalePrice'
        Column against which the correlations are computed.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the fields ``skew``, ``corr``
        (correlation with ``target``), ``nulls`` (missing-value count),
        ``nulls_perc`` (missing values as a percentage of the row count) and
        ``unique`` (a one-element list wrapping the array of distinct values).
        ``skew`` and ``corr`` are NaN for non-numeric columns.

    Raises
    ------
    KeyError
        If ``target`` is not among the numeric columns of ``data``.
    """
    # Skewness and correlation only exist for numeric columns. Selecting them
    # explicitly keeps the function usable on mixed-dtype frames, whatever the
    # installed pandas does by default with non-numeric columns.
    numeric = data.select_dtypes(include=['number', 'bool'])
    skew = numeric.skew()
    corr = numeric.corr()[target]

    # Count the missing values once and derive the percentage from the count.
    nulls = data.isnull().sum()
    nulls_perc = nulls / data.shape[0] * 100

    # Fill a pre-allocated object array cell by cell so pandas never tries to
    # expand the per-column arrays of distinct values into rows or columns.
    distinct = np.empty(data.shape[1], dtype=object)
    for position in range(data.shape[1]):
        distinct[position] = [data.iloc[:, position].unique()]
    unique = pd.Series(distinct, index=data.columns)

    details = pd.concat([skew, corr, nulls, nulls_perc, unique], axis=1, sort=False)
    details.columns = ['skew', 'corr', 'nulls', 'nulls_perc', 'unique']

    return details

# Column names by dtype: object columns vs. everything else.
cat = train.select_dtypes(include=['O']).columns
cont = train.select_dtypes(exclude=['O']).columns

# Rank the non-object columns by correlation with SalePrice and keep rows 2-13
# of the ranking (the top-ranked row is normally SalePrice itself).
corr_ranked = get_details(train[cont]).sort_values('corr', ascending=False)
details = corr_ranked.iloc[1:13]

# Keep only the training rows that stay below all three size thresholds.
keep = (
    (train['GrLivArea'] <= 4500)
    & (train['TotalBsmtSF'] < 6000)
    & (train['1stFlrSF'] < 4000)
)
train = train.loc[keep]

# Stack train on top of test and renumber the rows from 0.
data = pd.concat([train, test], sort=False, ignore_index=True)

#impute NaN

# Missing values in these columns are recoded as the explicit category 'NA'.
# Every write goes through data.loc[rows, column] (or a whole-column
# assignment) so it lands in `data` itself and not in a temporary produced by
# chained indexing.
# The comparison must be parenthesised: `&` binds tighter than `==`, so without
# the parentheses the mask would be (isnull & PoolArea) == 0 and would also
# select rows that already have a PoolQC value.
data.loc[data.PoolQC.isnull() & (data.PoolArea == 0), 'PoolQC'] = 'NA'
data.loc[data.MiscFeature.isnull() & (data.MiscVal == 0), 'MiscFeature'] = 'NA'
for column in ['Alley', 'Fence', 'FireplaceQu']:
    data[column] = data[column].fillna('NA')
data['LotFrontage'] = data['LotFrontage'].fillna(0)

# Whatever is still missing in a categorical column gets that column's most
# frequent value (first entry of value_counts()).
most_common = {c: data[c].value_counts().index[0] for c in cat}
fill = pd.Series(most_common)

data[cat] = data[cat].fillna(fill)

details = get_details(data)

# Numeric gaps are filled from the rows that share the same OverallQual and
# Neighborhood. When such a group has no observed value at all, the statistic
# of the whole column is used instead, so the group does not leave NaN behind.
group_keys = ['OverallQual', 'Neighborhood']

# Columns filled with the mean.
fill = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2','BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']

for i in fill:
    data[i] = data[i].fillna(data.groupby(group_keys)[i].transform('mean'))
    data[i] = data[i].fillna(data[i].mean())

# Columns filled with the median.
fill = ['BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars']

for i in fill:
    data[i] = data[i].fillna(data.groupby(group_keys)[i].transform('median'))
    data[i] = data[i].fillna(data[i].median())
    
# 'Utilities' is removed from the frame and from the categorical column list.
data = data.drop(['Utilities'], axis = 1)

cat = [c for c in cat if c != 'Utilities']

# One-hot encode the categorical columns. pandas >= 2.0 defaults to bool
# dummies; uint8 (the earlier default) keeps every column numeric so the frame
# converts to a float array for the OLS fit further down.
dummy = pd.get_dummies(data[cat], dtype='uint8')

# Swap the original categorical columns for their dummy columns.
data = pd.concat([data.drop(cat, axis = 1), dummy], axis =1)

# Rows without a SalePrice get 0 as a placeholder; the train/test split later
# in this cell separates the rows on SalePrice > 0 versus SalePrice == 0.
data.loc[data['SalePrice'].isnull(), 'SalePrice'] = 0

details = get_details(data[cont]).sort_values('skew', ascending = False)

# Snapshot of the frame before any Box-Cox transform. It has to be a real
# copy: a plain alias would be changed by the in-place writes below.
no_log = data.copy()

# Features with |skew| > 0.7 are transformed; the target is handled separately.
skewed = details.loc[details['skew'].abs() > 0.7].index
cols = [c for c in skewed if c != 'SalePrice']

# Boxcox
# The +1 shift keeps zero-valued entries inside Box-Cox's strictly positive domain.
feature_lambdas = {}
for i in cols:
    # Whole-column assignment replaces the column, so integer columns can take
    # the float result and nothing is written into a slice of `data`.
    data[i], feature_lambdas[i] = boxcox(data[i] + 1)

# Fit the target's lambda only on rows that carry a real price. Rows holding
# the 0 placeholder would otherwise take part in the fit; they stay at 0,
# which is also the value Box-Cox assigns to 0 + 1.
# l_opt ends up as the SalePrice lambda; the per-feature lambdas are kept
# separately in feature_lambdas.
has_price = data['SalePrice'] > 0
price_bc, l_opt = boxcox(data.loc[has_price, 'SalePrice'] + 1)
data.loc[has_price, 'SalePrice'] = price_bc

data = data.drop('MiscVal', axis = 1)
# Independent copy of the fully transformed frame.
data_copy = data.copy()

#go back to train and test
# Training rows have a positive (transformed) SalePrice; test rows carry the
# 0 placeholder.
train = data.loc[data.SalePrice > 0]
test = data.loc[data.SalePrice == 0]

x = train.drop('SalePrice', axis = 1)
y = train['SalePrice']

#pvalues
# OLS fit on all training rows; only its per-feature p-values are kept.
results = sm.OLS(y, x).fit()
pValues = results.pvalues

# Optional feature selection, left disabled: keep only features with p < 0.05.
# pValues = list(pValues[pValues<0.05].index)
# x = x[pValues]

# A fixed seed makes the 80/20 hold-out split, and every score computed on it,
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import tree

# Y_train is the continuous, transformed SalePrice, so this has to be a
# regression tree; a classifier rejects continuous targets.
# For a regressor, score() returns the coefficient of determination (R^2).
tree_model = tree.DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, Y_train)
print('train tree: ', tree_model.score(X_train, Y_train))
print('test tree: ', tree_model.score(X_test, Y_test))