In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import boxcox
from sklearn import ensemble
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Import data: read both CSV files, using their first column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train.csv', 'test.csv')
)

# Helper functions for summarising the data and evaluating models

def get_details(data):
    """Summarise every column of ``data`` in one table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe. It must contain a numeric ``SalePrice`` column,
        because the correlation of each numeric column with ``SalePrice`` is
        reported.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns

        * ``skew``       - sample skewness (NaN for non-numeric columns)
        * ``corr``       - correlation with ``SalePrice`` (NaN for non-numeric columns)
        * ``nulls``      - number of missing values
        * ``nulls_perc`` - missing values as a percentage of the row count
        * ``unique``     - array of the distinct values found in the column

    Raises
    ------
    KeyError
        If ``data`` has no numeric ``SalePrice`` column.
    """
    # Skewness and correlation are only defined for numeric data. Selecting the
    # numeric columns explicitly keeps this working on frames that also hold
    # string columns (recent pandas no longer drops those silently).
    numeric = data.select_dtypes(include=['number', 'bool'])
    skew = numeric.skew()
    corr = numeric.corr()['SalePrice']

    nulls = data.isnull().sum()
    nulls_perc = nulls / data.shape[0] * 100

    # Build the Series of distinct values by hand: DataFrame.apply may expand
    # list-like results into a DataFrame, which would break the concat below.
    unique = pd.Series(
        [values.unique() for _, values in data.items()],
        index=data.columns,
        dtype=object,
    )

    details = pd.concat([skew, corr, nulls, nulls_perc, unique], axis=1, sort=False)
    details.columns = ['skew', 'corr', 'nulls', 'nulls_perc', 'unique']

    return details
    
def get_features(model, x, y):
    """Fit ``model`` on ``(x, y)`` and rank the columns of ``x`` by importance.

    ``model`` is fitted in place and must expose ``feature_importances_``
    afterwards. The result is a dict mapping each column name of ``x`` to its
    importance, ordered from the most to the least important column.
    """
    model.fit(x, y)

    importance_by_column = dict(zip(x.columns, model.feature_importances_))
    ranked = sorted(
        importance_by_column.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return dict(ranked)

def lift_table(actual, pred, weight=None, n=10, xlab='Predicted Decile', MyTitle='Model Performance Lift Chart'):
    """Bucket observations by predicted value and compare prediction with outcome.

    Observations are sorted by ``pred`` and split into ``n`` buckets of
    (approximately) equal total weight; weighted averages of the predicted and
    the actual values are then reported per bucket.

    Parameters
    ----------
    actual, pred : 1-D array-like of equal length
        Observed and predicted values.
    weight : 1-D array-like, optional
        Observation weights. Defaults to a weight of 1 for every observation.
    n : int, default 10
        Number of buckets.
    xlab, MyTitle : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by bucket number (``PredictedDecile``, 1.0 .. n) with columns
        ``WeightedPrediction``, ``Weight``, ``WeightedActual``,
        ``PredictedDecile`` (number of observations in the bucket),
        ``AveragePrediction``, ``AverageActual`` and ``AverageError``
        (the ratio AverageActual / AveragePrediction).
    """
    if weight is None:
        weight = np.ones(len(actual))

    pdf = pd.DataFrame(
        np.vstack([actual, pred, weight]).T,
        columns=['Actual', 'Predicted', 'Weight'],
    )
    pdf = pdf.sort_values(by='Predicted')

    pdf['CummulativeWeight'] = pdf['Weight'].astype(float).cumsum()
    total_weight = pdf['Weight'].sum()

    # Bucket k collects the observations whose cumulative weight share lies in
    # ((k - 1) / n, k / n]. ceil() is used rather than round(x + 0.5) because
    # NumPy rounds halves to the nearest even number, which sends observations
    # sitting exactly on a bucket boundary alternately up and down.
    pdf['PredictedDecile'] = np.ceil(
        pdf['CummulativeWeight'] * n / total_weight
    ).clip(lower=1.0, upper=n)

    pdf['WeightedPrediction'] = pdf['Predicted'] * pdf['Weight']
    pdf['WeightedActual'] = pdf['Actual'] * pdf['Weight']

    grouped = pdf.groupby('PredictedDecile')
    lift_df = grouped[['WeightedPrediction', 'Weight', 'WeightedActual']].sum()
    # Number of observations per bucket, stored under the historical column name.
    lift_df['PredictedDecile'] = grouped.size()

    lift_df['AveragePrediction'] = lift_df['WeightedPrediction'] / lift_df['Weight']
    lift_df['AverageActual'] = lift_df['WeightedActual'] / lift_df['Weight']
    lift_df['AverageError'] = lift_df['AverageActual'] / lift_df['AveragePrediction']

    return lift_df

def plot_lift(lift_df):
    """Draw the lift chart for a table produced by ``lift_table``.

    The per-bucket average prediction and average actual value are plotted
    against the bucket index in reversed order (the last row of ``lift_df`` is
    drawn first), together with a dashed line at the mean of the bucket
    averages. The annotated lift is the first plotted actual value divided by
    that mean.
    """
    bucket_count = lift_df.shape[0]
    buckets = pd.DataFrame(lift_df.index)

    predicted_fwd = list(lift_df['AveragePrediction'])
    actual_fwd = list(lift_df['AverageActual'])
    mean_actual = np.mean(actual_fwd)

    # Reverse both curves so that the last bucket is plotted first.
    predicted = predicted_fwd[::-1]
    actual = actual_fwd[::-1]

    lift = actual[0] / mean_actual

    plt.plot(buckets, predicted, label='Predicted', color='blue', marker='o')
    plt.plot(buckets, actual, label='Actual', color='red', marker='d')
    plt.plot(
        range(1, bucket_count + 1),
        [mean_actual] * bucket_count,
        '--',
        label='Mean',
        color='k',
        marker=None,
    )
    plt.legend(['Predicted', 'Actual', 'Mean'])
    plt.xlabel('decile')
    plt.ylabel('Actual vs. Predicted')
    plt.text(bucket_count - 2, 1.2 * mean_actual, 'lift = {0:.2f} '.format(lift))
    plt.show()

# --- Column typing --------------------------------------------------------
# Object-typed columns are treated as categorical, everything else as continuous.
cat = train.select_dtypes(include=['O']).columns
cont = train.select_dtypes(exclude=['O']).columns

# The 12 numeric features most correlated with SalePrice; iloc[1:] skips the
# first row, which is SalePrice itself (correlation 1 with itself).
details = get_details(train[cont]).sort_values('corr', ascending=False).iloc[1:].head(12)

# --- Outlier removal (training rows only) ---------------------------------
train = train.loc[
    (train.GrLivArea <= 4500)
    & (train.TotalBsmtSF < 6000)
    & (train['1stFlrSF'] < 4000)
]

# --- Combine train and test so both get identical preprocessing -----------
data = pd.concat([train, test], sort=False).reset_index(drop=True)

# Rows that carry a SalePrice at this point are the training rows; the mask is
# kept so the two sets can be separated again without relying on sentinel values.
is_train = data['SalePrice'].notnull()

# --- Impute missing values -------------------------------------------------
# A missing quality/feature label is replaced by the explicit level 'NA'
# (presumably "feature absent" - confirm against the data description).
# The comparisons are parenthesised because `&` binds tighter than `==`.
data.loc[data.PoolQC.isnull() & (data.PoolArea == 0), 'PoolQC'] = 'NA'
data.loc[data.MiscFeature.isnull() & (data.MiscVal == 0), 'MiscFeature'] = 'NA'
for col in ['Alley', 'Fence', 'FireplaceQu']:
    data[col] = data[col].fillna('NA')
data['LotFrontage'] = data['LotFrontage'].fillna(0)

# Remaining categorical gaps: fill with the most frequent level of the column.
fill = pd.Series({c: data[c].value_counts().index[0] for c in cat})
data[cat] = data[cat].fillna(fill)

details = get_details(data)

# Numeric gaps: fill from houses of the same quality in the same neighbourhood,
# using the group mean for areas and the group median for counts / years.
group_keys = ['OverallQual', 'Neighborhood']

fill = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']
for col in fill:
    data[col] = data[col].fillna(data.groupby(group_keys)[col].transform('mean'))

fill = ['BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars']
for col in fill:
    data[col] = data[col].fillna(data.groupby(group_keys)[col].transform('median'))

# Groups without any known GarageYrBlt are still NaN: fall back to the overall median.
data['GarageYrBlt'] = data['GarageYrBlt'].fillna(data['GarageYrBlt'].median())

# --- Encode categoricals ---------------------------------------------------
data = data.drop(['Utilities'], axis=1)
cat = [c for c in cat if c != 'Utilities']

# uint8 indicators: boolean dummy columns mixed with float columns turn the
# design matrix into an object array when it is handed to statsmodels.
dummy = pd.get_dummies(data[cat], dtype=np.uint8)
data = pd.concat([data.drop(cat, axis=1), dummy], axis=1)

# Test rows have no price; 0 is used as a placeholder so the column has no NaN.
data['SalePrice'] = data['SalePrice'].fillna(0)

details = get_details(data[cont]).sort_values('skew', ascending=False)

# Real snapshot of the untransformed data (a plain assignment would only alias
# `data` and be modified by the Box-Cox step below).
no_log = data.copy()

# Strongly skewed numeric predictors (|skew| > 0.7), target excluded.
cols = [c for c in details.index[details['skew'].abs() > 0.7] if c != 'SalePrice']

# --- Box-Cox transform of the target ---------------------------------------
# Lambda is estimated on the training prices only, so the placeholder rows of
# the test set cannot distort it. Test rows keep the value 0, which is also
# what boxcox(0 + 1) evaluates to.
transformed_price, l_opt = boxcox(data.loc[is_train, 'SalePrice'] + 1)
data.loc[is_train, 'SalePrice'] = transformed_price

data = data.drop('MiscVal', axis=1)
data_copy = data.copy()

# --- Back to train and test -------------------------------------------------
train = data.loc[is_train]
test = data.loc[~is_train]

x = train.drop('SalePrice', axis=1)
y = train['SalePrice']

# --- OLS p-values -------------------------------------------------------------
results = sm.OLS(y, x).fit()
pValues = results.pvalues
# To keep only predictors with p < 0.05: x = x[list(pValues[pValues < 0.05].index)]

r2 = pd.DataFrame(columns=['r2_train', 'r2_test', 'MSE'])

# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# NOTE(review): this truncates the continuous target to whole numbers. Any
# model that should see the real-valued target has to use `y` instead.
Y_train = Y_train.astype('int32')
Y_test = Y_test.astype('int32')

In [None]:
# XGBoost: grid search over a hand-picked feature subset.
#
# SalePrice is a continuous target, so a regressor scored by R^2 is tuned here.
# Y_train / Y_test were cast to int32 when the split was made, so the
# real-valued targets are looked up again in `y` through the row index.
xgb_model = xgb.XGBRegressor()

y_train_cont = y.loc[X_train.index]
y_test_cont = y.loc[X_test.index]

# Feature ranking helper, if needed: get_features(xgb_model, X_train, y_train_cont)
cols = ['ExterQual_TA', 'OverallQual', 'GarageFinish_Unf', 'KitchenQual_TA', 'LotShape_IR1',
        'GrLivArea', 'KitchenQual_Gd', 'TotalBsmtSF', 'GarageType_Detchd', 'BsmtFinSF1']

grid_para_tree = [{
    "booster": ["gbtree", "gblinear"],
    'learning_rate': np.linspace(0.1, 1, 10),
    'gamma': range(0, 5),
    'min_child_weight': range(1, 6),
    # The number of boosting rounds must be an integer (np.linspace yields floats).
    "n_estimators": [100, 125, 150, 175, 200],
    "random_state": [42]
}]

grid_search_tree = GridSearchCV(xgb_model, grid_para_tree, cv=5, scoring='r2', n_jobs=-1)
grid_search_tree.fit(X_train[cols], y_train_cont)

print(grid_search_tree.best_params_)
print(grid_search_tree.best_score_)
print(grid_search_tree.score(X_train[cols], y_train_cont))
print(grid_search_tree.score(X_test[cols], y_test_cont))