In [None]:
from subprocess import check_output
# Show which files the dataset directory contains before loading anything.
print('Contents inside the directory')
directory_listing=check_output(["ls","../input/boston-house-prices/"])
print(directory_listing.decode('utf8'))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
sns.set(style='whitegrid')
from scipy.stats import skew,norm,probplot
from scipy.special import boxcox1p
from sklearn.preprocessing import RobustScaler,StandardScaler,PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin,RegressorMixin,BaseEstimator,clone
from sklearn.metrics import mean_squared_error

In [None]:
# The raw file is whitespace-delimited. It is read with header=None because the
# column names are assigned separately in a later cell; letting pandas infer a
# header would consume the first data record as column labels and drop it.
# NOTE(review): assumes housing.csv ships without a header row -- confirm against the file.
# sep=r'\s+' replaces the deprecated delim_whitespace=True.
df=pd.read_csv('../input/boston-house-prices/housing.csv',sep=r'\s+',header=None)

In [None]:
df.head()

In [None]:
cols=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B-1000','LSTAT','MEDV']
df.columns=cols

In [None]:
df.head()

In [None]:
df_describe=df.describe()

In [None]:
df_describe.index

In [None]:
index=['count','mean','std','min','max','25%','50%','75%']
df_describe.reindex(index=index).T

In [None]:
#Distribution of Y values
plt.figure(figsize=(10,10))
sns.distplot(df['MEDV'],fit=norm)
(mu,sigma)=norm.fit(df['MEDV'])
plt.legend(["Normal dist ($\mu=${:.4f} and $\sigma=${:.4f})".format(mu,sigma)])

In [None]:
fig=plt.figure()
res=probplot(df['MEDV'],plot=plt)
plt.show()

In [None]:
df['MEDV']=np.log1p(df['MEDV'])

In [None]:
sns.distplot(df['MEDV'])
(mu,sigma)=norm.fit(df['MEDV'])
plt.legend(['Normal distribution($\mu=${:.4f} and $\sigma=${:.4f})'.format(mu,sigma)])

In [None]:
fig=plt.figure()
probplot(df['MEDV'],plot=plt)
plt.show()

In [None]:
#X=df.drop('MEDV',axis=1)
#y=df['MEDV']

In [None]:
# Pairwise correlations, with rows ordered by their correlation to MEDV
# (strongest positive first). Only the rows are reordered, not the columns.
plt.figure(figsize=(13,13))
corr_matrix=df.corr()
corr_matrix=corr_matrix.sort_values(by='MEDV',ascending=False)
sns.heatmap(corr_matrix,annot=True,cmap='YlGnBu')

In [None]:
sns.scatterplot(x='CRIM',y='MEDV',data=df)

In [None]:
sns.scatterplot(x='AGE',y='MEDV',data=df)

In [None]:
plt.hist(x='AGE',data=df)

In [None]:
sns.scatterplot(x='RM',y='MEDV',data=df)

In [None]:
sns.scatterplot(x='DIS',y='MEDV',data=df)

In [None]:
plt.hist('DIS',data=df)

In [None]:
sns.scatterplot(x='NOX',y='MEDV',data=df)

In [None]:
sns.boxplot(y='AGE',data=df)

In [None]:
num_features=df.dtypes[df.dtypes!='object'].index
num_features
skewed_features=df[num_features].apply(lambda x:skew(x.dropna())).sort_values(ascending=False)


In [None]:
print(skewed_features)
skew=pd.DataFrame({"Skewed Features":skewed_features})

In [None]:
skew.head()

In [None]:
pt=PowerTransformer(method='yeo-johnson')
pt.fit(skew)
print(pt.lambdas_)

In [None]:
skew=skew[abs(skew)>0.75]
skew_feature=skew.index
print(skew_feature)

In [None]:
# Apply the Box-Cox transform of (1 + x) to every column listed in
# skew_feature, using one shared lambda. Columns are overwritten in place,
# so re-running this cell transforms them again.
lam=0.86
for column_name in skew_feature:
    transformed_column=boxcox1p(df[column_name],lam)
    df[column_name]=transformed_column

In [None]:
X=df.drop('MEDV',axis=1)
y=df['MEDV']

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.88,random_state=100)

In [None]:
sc=StandardScaler()
X_train=sc.fit_transform(X_train)

In [None]:
X_test=sc.transform(X_test)

In [None]:
X_train=pd.DataFrame(X_train,columns=X.columns)

In [None]:
X_train

In [None]:
X_test=pd.DataFrame(X_test,columns=X.columns)

In [None]:
print("\n")
print(X_train.head())
print("\n")
print("----------------")
print("\n")
print(X_test.head())

In [None]:
#Modelling the data
def rmse_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training data.

    Scores against the notebook-level ``X_train`` / ``y_train`` using a
    shuffled, seeded 5-fold split.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible regressor to evaluate.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    n_folds=5
    # Pass the KFold object itself to cross_val_score. Calling
    # ``.get_n_splits()`` on it returns only the integer fold count, which
    # silently discards ``shuffle`` and ``random_state``.
    kf=KFold(n_splits=n_folds,random_state=100,shuffle=True)
    # The scorer reports negated MSE, so flip the sign before taking the root.
    rmse=np.sqrt(-cross_val_score(model,X_train,y_train,scoring='neg_mean_squared_error',cv=kf))
    return rmse
    

In [None]:
#Lasso Regression
model_lasso=make_pipeline(RobustScaler(),Lasso(alpha=0.0005,random_state=1))


In [None]:
#ElasticNet Regression
model_enet=make_pipeline(RobustScaler(),ElasticNet(alpha=0.0005,l1_ratio=0.8,random_state=1))

In [None]:
model_xgb=xgb.XGBRegressor(n_estimators=100,max_depth=5,learning_rate=0.0001,verbosity=1,min_child_weight=0.0468,max_delta_step=0.0812)


In [None]:
model_lgbm=lgb.LGBMRegressor(objective='Regression',learning_rate=0.0001,min_child_weight=0.005,n_estimators=800,random_state=800)

In [None]:
score=rmse_cv(model_lasso)
print('Lasso has mean {:.4f} and standard deviation {:.4f}'.format(score.mean(),score.std()))

In [None]:
score=rmse_cv(model_enet)
print('Elasticet has mean {:.4f} and standard deviation {:.4f}'.format(score.mean(),score.std()))

In [None]:
score=rmse_cv(model_xgb)
print('XGBoost has mean {:.4f} and standard deviation {:.4f}'.format(score.mean(),score.std()))

In [None]:
score=rmse_cv(model_lgbm)
print('LightBoost has mean {:.4f} and standard deviation {:.4f}'.format(score.mean(),score.std()))

In [None]:
#Averaging ensemble: fits several regressors and averages their predictions
class AveragingModels(RegressorMixin,TransformerMixin,BaseEstimator):
    """Ensemble regressor that predicts the mean of its base models' predictions.

    The mixins are listed before ``BaseEstimator`` so that they take precedence
    in the method resolution order, as scikit-learn's developer guide asks for.
    ``TransformerMixin`` is kept for backward compatibility only; the class
    defines no ``transform`` method.

    Parameters
    ----------
    models : sequence of estimators
        Base regressors. They are never fitted directly; ``fit`` works on clones.

    Attributes
    ----------
    models_ : list of estimators
        Fitted clones of ``models``, created by ``fit``.
    """
    def __init__(self,models):
        self.models=models
    def fit(self,X,y):
        """Fit a fresh clone of every base model on ``(X, y)`` and return ``self``."""
        # Clone so the estimators handed to the constructor stay unfitted.
        self.models_=[clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X,y)
        return self
    def predict(self,X):
        """Return the element-wise mean of the fitted base models' predictions for ``X``."""
        # One column per base model, then average across the columns.
        predictions=np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions,axis=1)
        

In [None]:
# Combine all four regressors into one averaging ensemble and score it with
# the same cross-validation routine used for the individual models.
averaged_models=AveragingModels(
    models=(model_lasso,model_enet,model_xgb,model_lgbm),
)
score=rmse_cv(averaged_models)
print('Score mean is {} and Standard Deviation is {}'.format(
    np.round(score.mean(),4),
    np.round(score.std(),4),
))

In [None]:
def rmse_predictions(y_actual,y_predicted):
    """Root-mean-squared error between actual and predicted values."""
    squared_error_mean=mean_squared_error(y_actual,y_predicted)
    return np.sqrt(squared_error_mean)

In [None]:
# Fit the ensemble on the whole training set, predict on both splits, then
# print the training RMSE followed by the held-out RMSE.
averaged_models.fit(X_train.values,y_train)
stacked_models_predictions_train=averaged_models.predict(X_train.values)
stacked_models_predictions_test=averaged_models.predict(X_test.values)
for y_true,y_hat in (
    (y_train,stacked_models_predictions_train),
    (y_test,stacked_models_predictions_test),
):
    print(rmse_predictions(y_true,y_hat))

In [None]:
# Side-by-side table of the held-out targets and the ensemble's predictions.
predicted_vs_actual=pd.DataFrame(
    {
        'Actual_values':y_test,
        'Predicted_values':stacked_models_predictions_test,
    }
)
predicted_vs_actual