In [None]:
# Octopus ML pakage - github.com/gershonc/octopus-ml
!pip install octopus-ml 

In [None]:
import warnings
import seaborn as sns 
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import tracemalloc
from pandas_summary import DataFrameSummary
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import lightgbm as lgb
from tqdm import tqdm
#check out https://github.com/gershonc/octopus-ml
import octopus_ml as oc

# Show every column and row, and never truncate cell text, when displaying frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit". The legacy sentinel -1 was deprecated in pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence all Python warnings for the rest of the session. Note that this also
# hides deprecation and pandas chained-assignment warnings raised further down.
warnings.simplefilter("ignore")


In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

## EDA

In [None]:
# Peek at the first four training rows.
train_df.iloc[:4]

In [None]:
# Report (rows, columns) for both data sets.
for label, frame in (("Train set: ", train_df), ("Test set: ", test_df)):
    print(label, frame.shape)

In [None]:
# DataFrame summary from the pandas_summary package (an extension of pandas' describe()).
train_summary = DataFrameSummary(train_df)
train_summary.summary()

In [None]:
# Top 20 features ranked by their share of missing values.
sns.set_style("whitegrid")
plt.style.use('fivethirtyeight')

# count() ignores NaN, so 1 - count/len is the missing fraction per column.
missing_share = pd.Series(1 - train_df.count() / len(train_df))
top_missing = missing_share.sort_values(ascending=False).head(20)

plt.figure(figsize=(15,4))
sns.barplot(x=top_missing.index, y=top_missing, palette="Blues_d")
plt.xticks(rotation=90)


In [None]:
# Number of sales recorded for each value of YrSold.
train_df.loc[:, 'YrSold'].value_counts()

In [None]:
# Mean SalePrice for each value of YrSold, shown as a small bar chart.
df_years = train_df.groupby('YrSold')['SalePrice'].mean().reset_index()

plt.figure(figsize=(5,3))
sns.barplot(x=df_years['YrSold'], y=df_years['SalePrice'], palette="Blues_d")
plt.xticks(rotation=0)

In [None]:
from scipy.stats import norm, skew  # `skew` is used again by the skewness cell further down

sns.set_style("whitegrid")
plt.figure(figsize=(12,4))

# Histogram/KDE of the target with a fitted normal curve overlaid.
# Keep the Axes that seaborn drew on so the legend, label and title below are
# attached to that plot. The original called plt.axes() here, which in recent
# Matplotlib can add a new, empty Axes on top of the figure.
ax = sns.distplot(train_df['SalePrice'], fit=norm)
(mu, sigma) = norm.fit(train_df['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are invalid escape sequences in a normal literal
# and trigger a warning on current Python versions.
ax.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
          loc='upper right')
ax.set_ylabel('Frequency')
ax.set_title('SalePrice distribution');

In [None]:
# Collect the object/category columns of train_df and store them as pandas categoricals.
categorical_features = [
    name
    for name, dtype in train_df.dtypes.items()
    if dtype == 'object' or dtype.name == 'category'
]
for name in categorical_features:
    train_df[name] = train_df[name].astype('category')
print (categorical_features)

### Correlations to target (Sale price)

In [None]:
import scipy.stats as stats

sns.set_style("whitegrid")

# OverallQual against SalePrice: scatter with regression fit, then Pearson r and its p-value.
quality, price = train_df['OverallQual'], train_df['SalePrice']
j = sns.jointplot(x=quality, y=price, data=train_df, kind='reg', height=6)
plt.show()
pearson_result = stats.pearsonr(quality, y=price)
print ("Pearson | P-value: "+str(pearson_result))

In [None]:
sns.set_style("whitegrid")

# TotalBsmtSF against SalePrice: scatter with regression fit, then Pearson r and its p-value.
basement_area, price = train_df['TotalBsmtSF'], train_df['SalePrice']
j = sns.jointplot(x=basement_area, y=price, data=train_df, kind='reg', height=6)
plt.show()
pearson_result = stats.pearsonr(basement_area, y=price)
print ("Pearson | P-value: "+str(pearson_result))

In [None]:
sns.set_style("whitegrid")

# GrLivArea against SalePrice: scatter with regression fit, then the rounded
# Pearson coefficient and its p-value.
living_area, price = train_df['GrLivArea'], train_df['SalePrice']
j = sns.jointplot(x=living_area, y=price, data=train_df, kind='reg', height=6)
plt.show()
corr, p_value = stats.pearsonr(living_area, y=price)
print ("Pearson: "+str(round(corr,4))+"| P-value: "+ str(p_value))

## Data pre-processing


In [None]:
# Log-transform the target into its own Series instead of overwriting
# train_df['SalePrice'] in place. Re-running this cell no longer applies log1p
# twice, and train_df keeps the raw prices.
y = np.log1p(train_df['SalePrice'])
test_id = test_df['Id']

# Stack train and test rows so the imputation and feature engineering below are
# applied to both. The first len(train_df) rows are the training rows.
data = pd.concat([train_df, test_df], axis=0, sort=False)
# Drop the identifier and the target so only model features remain.
data = data.drop(['Id', 'SalePrice'], axis=1)

In [None]:
# Numeric columns whose missing values are filled with the training mean.
numerical_feat = [
    'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFinSF1', 'GarageArea',
]
# Categorical columns whose missing values are filled with the most frequent training value.
categorical_feat = [
    'MSZoning', 'Utilities', 'Functional', 'KitchenQual',
    'Exterior2nd', 'Electrical', 'Exterior1st', 'SaleType',
]

In [None]:
# Fill missing categorical values with the most frequent value among the
# training rows (the first len(train_df) rows of `data`).
n_train = len(train_df)
for column in categorical_feat:
    train_mode = data[column][:n_train].value_counts().index[0]
    data[column] = data[column].fillna(str(train_mode))

In [None]:
# Fill missing numeric values with the mean computed over the training rows
# (the first len(train_df) rows of `data`).
n_train = len(train_df)
for column in numerical_feat:
    train_mean = data[column][:n_train].mean()
    data[column] = data[column].fillna(train_mean)

In [None]:
# Two derived features: total floor area (basement + both floors) and the sum
# of the overall quality and condition ratings.
data = data.assign(
    TotalSF=data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF'],
    SumOverAll=data['OverallQual'] + data['OverallCond'],
)

In [None]:
# Rank the numeric columns by skewness and keep those with |skew| > 0.5.
numeric_feats = data.select_dtypes(include='number').columns
# scipy's skew() returns NaN for a column that contains NaN, and NaN never
# passes the threshold test below, so columns with missing values were silently
# left out. Measure skewness on the observed values only.
skewed_feats = data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
high_skew = skewed_feats[abs(skewed_feats) > 0.5]
high_skew

In [None]:
# Apply log1p to every column flagged as highly skewed.
# NOTE: this overwrites `data` in place, so re-running the cell transforms twice.
skewed_cols = list(high_skew.index)
data[skewed_cols] = np.log1p(data[skewed_cols])

In [None]:
# Split the combined frame back into its training and test rows by position.
# Take explicit copies: later cells assign columns into `train` and `test`, and
# writing into a plain slice of `data` triggers pandas' chained-assignment warning.
n_train = len(train_df)
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [None]:
# Names of all model input columns.
features = train.columns.tolist()
print ('Number of features ', len(features))

In [None]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# none of it executes and the cell only echoes the text. It sketches median
# imputation of train_df's non-object columns with sklearn's SimpleImputer.
"""from sklearn.impute import SimpleImputer
numerical_cols= train_df.select_dtypes(exclude='object')
numerical_cols
imp_mean=SimpleImputer(strategy='median')
imp_mean.fit(numerical_cols)
imp_mean.transform(numerical_cols)"""

In [None]:
# Store every object/category column of the training features as a pandas categorical.
# astype() with a column -> dtype mapping returns a new frame, so nothing is
# assigned column-by-column into a slice of `data` (the original loop did that
# and relied on the global warnings filter to hide the chained-assignment warning).
categorical_features = train.select_dtypes(include=['object', 'category']).columns.tolist()
train = train.astype({col: 'category' for col in categorical_features})
print (categorical_features)

# Feature matrix handed to the cross-validation helper.
X = train

## Octopus ML - regression model adjustments

In [None]:
# LightGBM settings passed to the Octopus-ML cross-validation helper.
# Only the three keys below are set explicitly.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    # Tuning candidates, currently disabled:
    #   num_leaves=9, learning_rate=0.05, n_estimators=100, max_bin=55,
    #   bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319,
    #   feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6,
    #   min_sum_hessian_in_leaf=11, n_jobs=-1
)

# NOTE(review): the positional arguments 0.5 and 100 are passed through
# unchanged; their meaning is defined by oc.cv_adv - confirm against the
# octopus-ml documentation. Later cells read the keys 'predictions_proba',
# 'y', 'final_clf' and 'stacked_models' from the returned object.
metrics = oc.cv_adv(X, y, 0.5, 100, shuffle=True, params=params, mode="regression")

## Model performance evaluation - predictions vs. actual values

In [None]:
# Pair the first 1000 cross-validation predictions with their targets, both
# mapped back from log1p space with expm1.
preds_sample = np.expm1(metrics['predictions_proba'][0:1000])
real_sample = np.expm1(metrics['y'][0:1000])
df = pd.DataFrame(list(zip(preds_sample, real_sample)), columns=['Preds','Real'])
df.head(10)

In [None]:
# Side-by-side box plots of the predicted and the real values.
box_columns = ['Preds', 'Real']
df.boxplot(column=box_columns)

In [None]:
# Summary statistics for the predicted and real columns.
preds_summary = DataFrameSummary(df)
preds_summary.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE between the expm1-transformed targets and the expm1-transformed predictions.
actual_values = np.expm1(metrics['y'])
predicted_values = np.expm1(metrics['predictions_proba'])
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(actual_values, predicted_values)))

In [None]:
# Prepare the test features the same way as the training features: every
# object/category column becomes a pandas categorical.
# astype() returns a new frame, so `test` itself is left untouched (the original
# aliased `test` and then assigned into it column by column).
# NOTE: `test_df` is deliberately rebound from the raw CSV frame to the processed
# test features because the stacked-model cell below reads it under this name.
# After this point the pre-processing cells cannot be re-run without reloading the data.
categorical_features = test.select_dtypes(include=['object', 'category']).columns.tolist()
test_df = test.astype({col: 'category' for col in categorical_features})
print (categorical_features)

# Predict with the final model and map the predictions back with expm1
# (the target was log1p-transformed before training).
result = pd.DataFrame(test_id, columns = ['Id'])
test_pre = np.expm1(metrics['final_clf'].predict(test_df))
result['SalePrice'] = test_pre

result.to_csv("lgb_result_updated.csv", index = False, header = True)

In [None]:
# First ten rows of the prediction table.
result.iloc[:10]

## Stacked 5-fold CV models

In [None]:
# Add one prediction column per model in metrics['stacked_models'] (clf_0, clf_1, ...),
# each mapped back with expm1.
for fold_idx, fold_model in enumerate(metrics['stacked_models']):
    result['clf_' + str(fold_idx)] = np.expm1(fold_model.predict(test_df))

In [None]:
# First five rows, now including the per-fold prediction columns.
result.iloc[:5]

In [None]:
# Average the per-fold predictions row by row.
# Select every column named clf_<i> rather than the fixed "clf_0":"clf_4"
# slice, so the mean stays correct if the number of stacked models changes.
fold_preds = result.filter(regex=r'^clf_\d+$')
if fold_preds.shape[1] == 0:
    raise ValueError("no per-fold prediction columns (clf_<i>) found in result")
result['sale_mean'] = fold_preds.mean(axis=1)

In [None]:
# First eight rows, now including the averaged prediction.
result.iloc[:8]

In [None]:
# Submission file: the Id column plus the fold-averaged prediction, written as SalePrice.
submit = result[['Id', 'sale_mean']].rename(columns={'sale_mean': 'SalePrice'})
submit.to_csv('stacked_submission.csv',index = False)
submit.head(4)

## Model Explainability

### Feature importance 

In [None]:
# Plot feature importances of the final model via the Octopus-ML helper (num=30)
# and keep the returned table for the next cell.
final_model = metrics['final_clf']
feature_imp_list = oc.plot_imp(final_model, X, 'LightGBM House prices Kaggle', num=30)

In [None]:
# Twenty rows with the highest 'Value' in the importance table.
ranked_importance = feature_imp_list.sort_values(by='Value', ascending=False)
top_features = ranked_importance.head(20)
top_features

## Please upvote if you find this notebook interesting and useful