In [None]:
''' Various Imports '''
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.linear_model import RidgeCV,LassoCV,ElasticNetCV
from mlxtend.regressor import StackingCVRegressor

from sklearn import metrics
from scipy import stats
from sklearn.model_selection import KFold,cross_val_score

import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
import seaborn as sns


import warnings
warnings.filterwarnings(action="ignore")

In [None]:
''' Importing train and test data sets '''
# Both CSV files are read from the notebook's working directory.
house_train, house_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
''' Combining train and test data sets for data processing steps '''
# Stack the shared feature columns (MSSubClass..SaleCondition) of train and test,
# train rows first, so later cells can split them apart again by position.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each half keeps its own row labels, so labels repeat in the combined frame.
feature_span = slice('MSSubClass', 'SaleCondition')
house_data = pd.concat(
    (house_train.loc[:, feature_span], house_test.loc[:, feature_span]),
    ignore_index=True,
)

In [None]:
# Sanity check of the combined frame: print its (rows, columns) shape, then
# show the first rows as the cell output.
print(house_data.shape)
house_data.head()

In [None]:
''' Performing basic EDA steps'''
# print(house_data.describe())
# print(house_data.info())
# Count object-typed columns first, then all remaining columns.
is_object_column = house_data.dtypes == 'object'
print(is_object_column.sum())
print((~is_object_column).sum())

In [None]:
# One value_counts() Series per object-typed column, in column order.
object_columns = house_data.select_dtypes(include='object').columns
result = [house_data[column_name].value_counts() for column_name in object_columns]
print(result)

 ### Many of the categorical features are ordinal in nature, so we encode them as integers that preserve their order

##### We also substitute missing values in the few columns where a missing value has a predefined meaning

In [None]:
# Scales shared by several columns. The "_or_absent" variant additionally maps
# a missing value (NaN) to 0.
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
quality_scale_or_absent = {**quality_scale, np.nan: 0}
basement_finish_scale = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, np.nan: 0}

# Column name -> {raw value: ordinal code}. Series.map turns any value that is
# not a key of its mapping into NaN.
ordinal_maps = {
    'Street': {'Grvl': 1, 'Pave': 2},
    'Alley': {'Grvl': 1, 'Pave': 2, np.nan: 0},
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'Utilities': {'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1},
    'LandSlope': {'Gtl': 3, 'Mod': 2, 'Sev': 1},
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': quality_scale_or_absent,
    'BsmtCond': quality_scale_or_absent,
    'BsmtExposure': {'Gd': 5, 'Av': 4, 'Mn': 3, 'No': 2, np.nan: 1},
    'BsmtFinType1': basement_finish_scale,
    'BsmtFinType2': basement_finish_scale,
    'HeatingQC': quality_scale,
    'CentralAir': {'Y': 1, 'N': 0},
    'KitchenQual': quality_scale,
    'FireplaceQu': quality_scale_or_absent,
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0},
    'GarageQual': quality_scale_or_absent,
    'GarageCond': quality_scale_or_absent,
    'PavedDrive': {'Y': 3, 'P': 2, 'N': 1},
    'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, np.nan: 0},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, np.nan: 0},
    'YrSold': {2010: 5, 2009: 4, 2008: 3, 2007: 2, 2006: 1},
}

for column_name, code_by_value in ordinal_maps.items():
    house_data[column_name] = house_data[column_name].map(code_by_value)
# house_data['MSSubClass'] = house_data['MSSubClass'].map({190:16,180:15,160:14,150:13,120:12,90:11,85:10,80:9,75:8,70:7,60:6,50:5,45:4,40:3,30:2,20:1})


In [None]:
# MiscFeature: missing -> the literal category 'None'.
# MSSubClass: stored as object so it is handled as a category, not a number.
# LotFrontage: missing -> the median of the column in the combined frame.
house_data = house_data.assign(
    MiscFeature=house_data['MiscFeature'].fillna('None'),
    MSSubClass=house_data['MSSubClass'].astype('object'),
    LotFrontage=house_data['LotFrontage'].fillna(house_data['LotFrontage'].median()),
)

In [None]:
# Re-count object vs. non-object columns after the encoding cells above.
object_mask = house_data.dtypes == 'object'
print(object_mask.sum())
print((~object_mask).sum())
house_data.head()

In [None]:
''' Checking for null values now after substitution '''
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
house_data.info()

In [None]:
''' Identifying features with missing values above 15% for removal '''

# Percentage (0-100) of missing values per column, largest first.
null_pct = house_data.isnull().mean().mul(100).sort_values(ascending=False)

# Display table with columns 'column_name' and 'value' (percent, two decimals).
# The percentages are no longer truncated to whole numbers: truncation turned
# e.g. 15.9% into 15, which then failed the "> 15" test below.
nulls_df = null_pct.round(2).rename_axis('column_name').reset_index(name='value')
#print(nulls_df.columns)

# The threshold is applied to the unrounded percentages.
drop_features = null_pct[null_pct > 15].index.tolist()
print(drop_features)

In [None]:
# Show the 15 columns with the largest share of missing values.
print(nulls_df.head(15))

In [None]:
''' Identifying too skewed features (numerical) '''
numeric_feats = house_data.dtypes[house_data.dtypes != "object"].index

# Skewness of every non-object column, computed on its non-missing values.
skewness_by_feature = house_data[numeric_feats].apply(
    lambda column: stats.skew(column.dropna())
)
sk_df = skewness_by_feature.to_frame(name='skewness').sort_values('skewness', ascending=False)
sk_df.head(10)


In [None]:
''' BOX-COX transformation of Skewed Features'''
from scipy.special import boxcox1p

# Keep only the rows (features) whose absolute skewness exceeds 0.75.
# The filter must be a boolean Series: indexing the frame with a boolean
# DataFrame only blanks non-matching cells to NaN and keeps every row, which
# made the loop below transform all numeric features.
sk_df = sk_df[sk_df['skewness'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(sk_df.shape[0]))

skewed_features = sk_df.index
lam = 0.15
# NOTE: this overwrites house_data in place, so re-running the cell applies the
# transform a second time.
for feat in skewed_features:
    house_data[feat] = boxcox1p(house_data[feat], lam)

In [None]:
''' Identifying Highly correlated features to remove '''
# cols: every column name of house_data, in frame order.
# corr_remove: filled in a later cell with the features chosen for removal.
cols = list(house_data.columns)
corr_remove = list()

In [None]:
def plot_saleprice_corr(feature_cols, title):
    """Draw an annotated correlation heatmap of some features plus SalePrice.

    house_data holds only feature columns (it has no 'SalePrice' column), so
    selecting 'SalePrice' from it raises KeyError. The target is taken from
    house_train instead and paired with the training rows of house_data,
    which are its first len(house_train) rows.

    Parameters
    ----------
    feature_cols : list of str
        Column names of house_data to include. Non-numeric columns are left
        out because a correlation cannot be computed for them.
    title : str
        Title of the figure.
    """
    n_train = len(house_train)
    numeric_part = house_data.iloc[:n_train][feature_cols].select_dtypes(include=np.number)
    # .values attaches the target by position, independent of index labels.
    corr_input = numeric_part.assign(SalePrice=house_train['SalePrice'].values)

    plt.figure(figsize=(10, 10))
    sns.heatmap(corr_input.corr(), cmap='RdBu_r', annot=True, center=0.0)
    plt.title(title)
    plt.show()


sns.set()
# One heatmap per consecutive group of ten columns, in column order.
for start in range(0, len(cols), 10):
    plot_saleprice_corr(
        cols[start:start + 10],
        'Correlation between {} ~ {}th columns'.format(max(start, 1), start + 10),
    )

In [None]:
# Correlate the numeric columns only. house_data still contains object-typed
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently skipping them.
corr = house_data.select_dtypes(include=np.number).corr()
# 1 where the correlation is at least 0.80, else 0.
# NOTE(review): strongly negative correlations are not flagged by this test.
high_corr = (corr >= 0.80).astype('uint8')
plt.figure(figsize=(15,15))
sns.heatmap(high_corr, cmap='RdBu_r', annot=True, center=0.0)
plt.show()

In [None]:
''' Adding more features which have collinearity to our drop_features list '''
# NOTE: appends in place, so re-running this cell adds the names again.
corr_remove += [
    'TotalBsmtSF', 'GarageCars', 'TotRmsAbvGrd', 'GarageYrBlt', 'Fireplaces',
    'GarageQual', 'PoolQC', 'BsmtQual', 'BsmtFinSF1', 'BsmtFinSF2',
]

In [None]:
# Final removal list: high-missing-value features followed by the collinear ones.
drop_features = [*drop_features, *corr_remove]
print(drop_features)

In [None]:
''' Normalizing target vector '''
# Replace SalePrice in house_train with log1p(SalePrice); predictions are
# mapped back with np.expm1 in later cells.
# NOTE(review): the column is overwritten in place, so running this cell twice
# applies the log twice — restart and run all if in doubt.
house_train['SalePrice'] = np.log1p(house_train['SalePrice'])
# Plot the transformed target together with a fitted normal curve.
sns.distplot(house_train['SalePrice'],fit= stats.norm)

In [None]:
''' creating a new data frame without unnecessary features '''
# house_data itself is left untouched; train_data is a new frame without the
# columns named in drop_features.
train_data = house_data.drop(columns=drop_features)

In [None]:
# train_data.fillna(inplace=True)
# The five columns with the most remaining missing values.
missing_per_column = train_data.isna().sum()
print(missing_per_column.sort_values(ascending=False).head())

In [None]:
# Shape and first rows of the frame after the feature removal above.
print(train_data.shape)
train_data.head()

In [None]:
# One-hot encode the remaining categorical columns. drop_first=True omits the
# first level of each encoded column.
train_data_dummy = pd.get_dummies(train_data,drop_first=True)

In [None]:
# Fill every remaining missing value with the mean of its column, computed over
# the combined (train + test) frame.
column_means = train_data_dummy.mean()
train_data_dummy = train_data_dummy.fillna(column_means)

In [None]:
# (rows, columns) after one-hot encoding.
train_data_dummy.shape

#### Removing features that contain almost 100% zero values

In [None]:
# A column is removed when its single most frequent value covers more than
# 99.94% of the rows. value_counts() lists the most frequent value first, so
# the test looks at that value, whatever it is (0 for sparse dummy columns).
row_count = len(train_data_dummy)
sparse = [
    feature
    for feature in train_data_dummy.columns
    if train_data_dummy[feature].value_counts().iloc[0] / row_count * 100 > 99.94
]

train_data_dummy = train_data_dummy.drop(columns=sparse)

In [None]:
# train_data_dummy = np.log1p(train_data_dummy)

In [None]:
# Shape and first rows after dropping the near-constant columns.
print(train_data_dummy.shape)
train_data_dummy.head()

In [None]:
# Missing-value count per column, largest first.
print(train_data_dummy.isna().sum().sort_values(ascending=False))

In [None]:
#creating matrices for sklearn:
# The first len(house_train) rows of the combined frame are the training rows;
# everything after them is the test set.
n_train_rows = house_train.shape[0]
x_train = train_data_dummy.iloc[:n_train_rows].values
x_test = train_data_dummy.iloc[n_train_rows:].values
y_train = house_train.SalePrice.values

In [None]:
# Show the container type of the target.
print(type(y_train))

In [None]:
''' Outliers Detection and removal '''
# Per-column quartiles and inter-quartile range.
q1 = train_data_dummy.quantile(0.25)
q3 = train_data_dummy.quantile(0.75)
iqr = q3 - q1

# Tukey fences: [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for every column.
lower_fence = q1 - 1.5*iqr
upper_fence = q3 + 1.5*iqr
within_fences = (train_data_dummy >= lower_fence) & (train_data_dummy <= upper_fence)

# A row is kept when at least one of its values lies inside its column's fences.
# NOTE(review): .any(axis=1) drops a row only if every column is out of range —
# confirm this is intended. The filter also runs on train and test rows together,
# while later cells split the result by position and take the target from
# house_train unchanged; any row dropped here would break that alignment.
train_data_final = train_data_dummy[within_fences.any(axis=1)]

In [None]:
# Shape and first rows after the outlier filter.
print(train_data_final.shape)

train_data_final.head()

In [None]:
#creating matrices for sklearn:
n_train = house_train.shape[0]

# The split below is positional and the target comes from house_train unchanged.
# If the outlier filter removed any row, features and target would no longer
# line up (and test rows could leak into x_train), so fail loudly instead.
if len(train_data_final) != len(train_data_dummy):
    raise ValueError(
        "Outlier filtering removed {} row(s); x_train/x_test/y_train can no longer "
        "be aligned by position.".format(len(train_data_dummy) - len(train_data_final))
    )

x_train = train_data_final.iloc[:n_train].values
x_test = train_data_final.iloc[n_train:].values
# Plain NumPy array, consistent with x_train/x_test and with the earlier
# matrix-building cell.
y_train = house_train.SalePrice.values

In [None]:
# The two lengths must match: one target value per training row.
print(len(house_train.SalePrice))
print(len(x_train))

In [None]:
def rmse(y_train, y_pred):
    """Return the root of the mean squared error between y_train and y_pred."""
    mean_sq_error = metrics.mean_squared_error(y_train, y_pred)
    return np.sqrt(mean_sq_error)


# Shared cross-validation splitter: K shuffled folds with a fixed seed.
K = 10
kf = KFold(n_splits=K, shuffle=True, random_state=42)

In [None]:
# Candidate regularisation strengths for RidgeCV, searched with the shared
# KFold splitter.
ridge_alphas = np.arange(14.5, 15.6, 0.1)
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

ridge.fit(x_train, y_train)

In [None]:
# Candidate regularisation strengths for LassoCV, searched with the shared
# KFold splitter.
lasso_alphas = np.arange(0.0001, 0.0009, 0.0001)
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=lasso_alphas, random_state=42, cv=kf))

lasso.fit(x_train, y_train)

In [None]:
# Grids for ElasticNetCV: regularisation strengths and L1/L2 mixing ratios,
# searched with the shared KFold splitter.
enet_alphas = np.arange(0.0001, 0.0008, 0.0001)
enet_l1_ratios = np.arange(0.8, 1, 0.025)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_l1_ratios, cv=kf),
)

elasticnet.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the gradient boosting regressor, kept in one place.
gboost_params = dict(
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
GBoost = GradientBoostingRegressor(**gboost_params)

GBoost.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the XGBoost regressor, kept in one place.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=3500,
    max_depth=3,
    gamma=0.001,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    seed=42,
    reg_alpha=0.0001,
)
xgb_regressor = XGBRegressor(**xgb_params)

xgb_regressor.fit(x_train, y_train)

# y_train_pred_xgb = xgb_regressor.predict(x_train)

# print("RMSLE score for XGB :",(np.sqrt(metrics.mean_squared_log_error(y_train,y_train_pred_xgb))))

# xgb_preds = xgb_regressor.predict(x_test)


In [None]:
''' When we have normalised the target value as well ,perform this step '''
# Predict here: the only other assignment of xgb_preds in the notebook is
# commented out, so this cell raised NameError on a fresh kernel.
xgb_preds = xgb_regressor.predict(x_test)
# The model was trained on log1p(SalePrice); expm1 maps predictions back to prices.
xgb_preds_final = np.expm1(xgb_preds)
print(xgb_preds_final)

In [None]:
# Hyper-parameters of the LightGBM regressor, kept in one place.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=42,
    feature_fraction=0.2,
    feature_fraction_seed=42,
    verbose=0,
)
lgbmr = lgb.LGBMRegressor(**lgbm_params)

lgbmr.fit(x_train, y_train)

# y_actual_pred = lgbmr.predict(x_train)
# print("RMSLE score for LGBM :",(np.sqrt(metrics.mean_squared_log_error(y_train,y_actual_pred))))

# y_preds_lgbm = lgbmr.predict(x_test)

In [None]:
# Stacked ensemble: the six models defined above are the base regressors and
# the XGBoost model is also passed as the meta-regressor.
base_regressors = (ridge, lasso, elasticnet, GBoost, xgb_regressor, lgbmr)
stack = StackingCVRegressor(
    regressors=base_regressors,
    meta_regressor=xgb_regressor,
    use_features_in_secondary=True,
    random_state=42,
)

In [None]:
# Fit the stacked ensemble on the training matrices.
stack.fit(x_train,y_train)

# y_preds_stack = stack.predict(x_test)

In [None]:
def comb_predict(X, weighted_models=None):
    """Blend the predictions of several fitted models into a weighted average.

    Parameters
    ----------
    X : array-like
        Feature matrix handed unchanged to every model's ``predict``.
    weighted_models : sequence of (weight, model) pairs, optional
        Models to blend and their relative weights. When omitted, the
        notebook's fitted models are used with the default weights below.

    Returns
    -------
    The weighted average of the individual predictions.

    Raises
    ------
    ValueError
        If the weights sum to zero (including an empty sequence).
    """
    if weighted_models is None:
        weighted_models = (
            (0.05, lasso),
            (0.1, ridge),
            (0.1, elasticnet),
            (0.1, GBoost),
            (0.25, xgb_regressor),
            (0.15, lgbmr),
            (0.3, stack),
        )

    total_weight = sum(weight for weight, _ in weighted_models)
    if total_weight == 0:
        raise ValueError("comb_predict needs weights with a non-zero sum")

    weighted_sum = sum(weight * model.predict(X) for weight, model in weighted_models)
    # The default weights add up to 1.05, not 1. Dividing by the total turns the
    # weighted sum into a true average; without it every blended prediction was
    # 5% too large.
    return weighted_sum / total_weight

In [None]:
# RMSE of the blended model on the data it was trained on (target is in
# log1p space); this is an in-sample score, not a validation score.
blended_train_pred = comb_predict(x_train)
training_score_comb = rmse(y_train, blended_train_pred)
print(training_score_comb)

In [None]:
''' When we have normalised the target value as well ,perform this step '''
# The blending function defined in this notebook is comb_predict; the previous
# call to blend_predict referred to a name that does not exist.
y_preds_comb = comb_predict(x_test)
# The models were trained on log1p(SalePrice); expm1 maps predictions back to prices.
y_preds_final = np.expm1(y_preds_comb)
print(y_preds_final[:11])

In [None]:
# Submission file: one row per test house with its identifier and predicted price.
# The identifier header is spelled "Id", exactly like the column it is read from
# in test.csv. NOTE(review): confirm against the competition's sample submission.
solution = pd.DataFrame({"Id": house_test.Id, "SalePrice": y_preds_final})
solution.to_csv("sachin_solution.csv", index=False)

In [None]:
# y_preds is never defined in this notebook; the exported predictions live in
# y_preds_final.
print(y_preds_final)