In [None]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as  plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.pipeline import make_pipeline
from xgboost.sklearn import XGBRegressor

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the labelled training split (it carries the SalePrice column) and preview it.
houses=pd.read_csv("../input/house-prices-dataset/train.csv")
houses.head()

In [None]:
houses_test = pd.read_csv("../input/house-prices-dataset/test.csv")
# There is no "SalePrice" column here; SalePrice is our target variable.

In [None]:
# (rows, columns) of the training frame.
houses.shape

In [None]:
# Column names available in the training frame.
houses.columns

In [None]:
# Per-column dtype and non-null count.
houses.info()

In [None]:
# Tag every training column as 'numeric' (int64/float64) or 'categorical'
# (anything else) and plot how many columns fall in each group.
column_data_type = [
    'numeric' if houses[name].dtype in ['int64','float64'] else 'categorical'
    for name in houses.columns
]

plt.figure(figsize=(4,4))
sns.countplot(x=column_data_type)
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
houses.describe()

In [None]:
# Target values of the training rows (rebuilt later from the cleaned frame).
y = houses['SalePrice'].values
# Stack train and test so imputation sees both splits. Test rows have no
# SalePrice and therefore carry NaN in that column.
# ignore_index=True gives every row a unique label: the training rows keep
# 0..len(houses)-1 and the test rows continue after them. Without it both
# splits reuse the labels 0..n, which makes label-based drops and index
# alignment ambiguous.
data = pd.concat([houses,houses_test],axis=0,sort=False,ignore_index=True)

# MISSING VALUE IMPUTATION 

In [None]:
# Table of columns that contain missing values: absolute count and share of rows.
null_counts = data.isnull().sum()
missing_values = null_counts[null_counts > 0].sort_values(ascending = False)

# Names of the affected columns, most-missing first.
NAN_col = list(missing_values.index)

missing_values_data = missing_values.reset_index()
missing_values_data.columns = ['Feature','Number of Missing Values']
missing_values_data['Percentage of Missing Values'] = (
    (100.0*missing_values_data['Number of Missing Values'])/len(data)
)
missing_values_data

In [None]:
# Horizontal bar chart: number of missing values per affected column.
# `null_columns` is reused later to confirm that imputation left no gaps.
null_columns=data.columns[data.isnull().any()]
labels = list(null_columns)
values = [data[col].isnull().sum() for col in null_columns]
# One bar position per column. Computed once, outside any loop, so it is
# defined even when no column has missing values.
ind = np.arange(len(labels))

fig, ax = plt.subplots(figsize=(10,12))
ax.barh(ind, np.array(values), color='orange')
# barh centres each bar on its y position, so the tick (and its label) belongs
# on `ind` itself; an extra offset would shift labels away from their bars.
ax.set_yticks(ind)
ax.set_yticklabels(labels, rotation='horizontal')
ax.set_xlabel("MISSING VALUES COUNT")
ax.set_ylabel("COLUMN")
ax.set_title("COLUMNS WITH MISSING VALUES");

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# `data[col]`: the in-place form mutates an intermediate Series and is not
# guaranteed to write through to `data` (it does not under pandas Copy-on-Write).

# Basement area columns: missing is filled with 0 (presumably "no basement").
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF']:
    data[col] = data[col].fillna(0)

data['KitchenQual'] = data['KitchenQual'].fillna('TA')

# LotFrontage: first use the mean frontage of houses with the same 1stFlrSF,
# then linearly interpolate whatever is still missing.
data['LotFrontage'] = data['LotFrontage'].fillna(
    data.groupby('1stFlrSF')['LotFrontage'].transform('mean'))
data['LotFrontage'] = data['LotFrontage'].interpolate(method='linear')

In [None]:
# SalePrice distribution per electrical-system type.
# x and y are passed by keyword: current seaborn accepts only `data` positionally.
sns.boxplot(x="Electrical", y="SalePrice", data=data)
plt.title("Electrical Vs SalePrice ")
plt.ylabel("SalePrice")
plt.xlabel("Electrical");

In [None]:
# Replace missing Electrical values with 'SBrkr', taken to be the most frequent category.
data["Electrical"] = data["Electrical"].fillna('SBrkr')

In [None]:

# A missing Alley value is taken to mean the house has no alley access.
data["Alley"] = data["Alley"].fillna('None')

# A missing fireplace quality is taken to mean the house has no fireplace.
data["FireplaceQu"] = data["FireplaceQu"].fillna('None')

# Garage columns: numeric ones get 0, every other dtype gets 'None'.
# The dtype test uses pandas' helper because the `np.object` alias was removed
# from NumPy and raises AttributeError on current releases.
garage_cols=['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea']
for col in garage_cols:
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].fillna(0)
    else:
        data[col] = data[col].fillna('None')

# If PoolArea is 0 the house has no pool, so a missing pool quality becomes 'None'.
data["PoolQC"] = data["PoolQC"].fillna('None')

# Fallback for any LotFrontage still missing: use the square root of the lot area.
# SqrtLotArea stays in the frame as a column.
data['SqrtLotArea']=np.sqrt(data['LotArea'])
# Named `missing_frontage` rather than `filter` so the builtin is not shadowed.
missing_frontage = data['LotFrontage'].isnull()
# Single .loc assignment instead of the chained `data.LotFrontage[mask] = ...`;
# .values makes the write positional so no index alignment is involved.
data.loc[missing_frontage, 'LotFrontage'] = data.loc[missing_frontage, 'SqrtLotArea'].values

data["MasVnrArea"] = data["MasVnrArea"].fillna(0.0)

data["MasVnrType"] = data["MasVnrType"].fillna('None')

# Some houses have no miscellaneous feature (shed, tennis court, ...).
data["MiscFeature"] = data["MiscFeature"].fillna('None')

# A missing Fence value is taken to mean the house has no fence.
data["Fence"] = data["Fence"].fillna('None')

In [None]:
# Categorical basement columns: a missing value becomes 'None'.
# The two *FinSF* area columns are numeric and are skipped here.
basement_cols=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2']
for col in basement_cols:
    if 'FinSF' not in col:
        data[col] = data[col].fillna('None')

# Remaining columns and the fill value used for each.
# Each column is assigned back rather than filled with inplace=True on
# `data[col]`, which mutates an intermediate Series and may never reach `data`.
fill_defaults = {
    'MSZoning': 'RL',
    'Utilities': 'AllPub',
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'Functional': 'Typ',
    'SaleType': 'Oth',
}
for col, default in fill_defaults.items():
    data[col] = data[col].fillna(default)

In [None]:
# Keep only rows that have a SalePrice (this drops the unlabelled test rows).
# The drop is limited to SalePrice so that a leftover NaN in some other column
# cannot silently discard training rows; the next cell reports any such gaps.
data = data.dropna(subset=['SalePrice'])

In [None]:
# Remaining missing values per previously-affected column.
data[null_columns].isnull().sum()

In [None]:
# Engineered features derived from existing columns.

# Finished basement area + both floors + total basement area.
finished_basement = data['BsmtFinSF1'] + data['BsmtFinSF2']
data['Total_Square_Feet'] = (
    finished_basement + data['1stFlrSF'] + data['2ndFlrSF'] + data['TotalBsmtSF']
)

# Bathrooms above ground and in the basement; a half bath counts as 0.5.
half_baths = 0.5 * data['HalfBath']
basement_half_baths = 0.5 * data['BsmtHalfBath']
data['Total_Bath'] = data['FullBath'] + half_baths + data['BsmtFullBath'] + basement_half_baths

# All porch and deck areas combined.
data['Total_Porch_Area'] = (
    data['OpenPorchSF']
    + data['3SsnPorch']
    + data['EnclosedPorch']
    + data['ScreenPorch']
    + data['WoodDeckSF']
)

# Above-ground living area divided by the number of rooms, baths and kitchens.
room_count = data['TotRmsAbvGrd'] + data['FullBath'] + data['HalfBath'] + data['KitchenAbvGr']
data['SqFtPerRoom'] = data['GrLivArea'] / room_count

# Age of the house at the time of sale, in years.
data['ConstructionAge'] = data['YrSold'] - data['YearBuilt']

# REMOVING OUTLIERS 

In [None]:
# Basement area against sale price on the raw training frame.
fig, ax = plt.subplots()
ax.scatter(houses["TotalBsmtSF"], houses["SalePrice"])
ax.set_title("Total Basement area in Square Feet Vs SalePrice ")
ax.set_ylabel("SalePrice")
ax.set_xlabel("Total Basement area in Square Feet");

In [None]:
# TotalBsmtSF has a few extreme values: cap them at the 99.5th percentile.
upperlimit = np.percentile(data.TotalBsmtSF.values, 99.5)
# clip() caps the column and the result is assigned back to `data`. The earlier
# chained form `data[col].iloc[mask] = ...` wrote into an intermediate Series
# and is not guaranteed to update `data`.
data['TotalBsmtSF'] = data['TotalBsmtSF'].clip(upper=upperlimit)

plt.scatter(data.TotalBsmtSF, data["SalePrice"].values)
plt.title("TotalBsmtSF Vs SalePrice ")
plt.ylabel("SalePrice")
plt.xlabel("Total Basement in sq feet");

In [None]:
# Garage area against sale price on the raw training frame.
fig, ax = plt.subplots()
ax.scatter(houses.GarageArea, houses["SalePrice"].values, color='violet')
ax.set_title("Garage Area Vs SalePrice ")
ax.set_ylabel("SalePrice")
ax.set_xlabel("Garage Area in sq feet");

In [None]:
# GarageArea has a few extreme values: cap them at the 99.5th percentile.
# NOTE(review): this caps the column in `houses` only; the modelling frame
# `data` is left unchanged - confirm that is intended.
upperlimit = np.percentile(houses.GarageArea.values, 99.5)
# clip() + assignment replaces the chained `houses[col].iloc[mask] = ...`,
# which is not guaranteed to write through to `houses`.
houses['GarageArea'] = houses['GarageArea'].clip(upper=upperlimit)

plt.scatter(houses.GarageArea, houses["SalePrice"].values,color='violet')
plt.title("Garage Area Vs SalePrice ")
plt.ylabel("SalePrice")
plt.xlabel("Garage Area in sq feet");

In [None]:
# Sale price of every training row, plotted against its position in the frame.
plt.scatter(range(houses.shape[0]), houses["SalePrice"].values,color='orange')
plt.title("Distribution of Sale Price")
# The x axis is the row position, not an occurrence count.
plt.xlabel("Row index")
plt.ylabel("Sale Price");

In [None]:
# SalePrice has a few extreme values: cap them at the 99.5th percentile.
# NOTE(review): this caps `houses` only; the target in `data` is untouched.
upperlimit = np.percentile(houses.SalePrice.values, 99.5)
# clip() + assignment replaces the chained `houses[col].iloc[mask] = ...`,
# which is not guaranteed to write through to `houses`.
houses['SalePrice'] = houses['SalePrice'].clip(upper=upperlimit)

plt.scatter(range(houses.shape[0]), houses["SalePrice"].values,color='orange')
plt.title("Distribution of Sale Price")
# The x axis is the row position, not an occurrence count.
plt.xlabel("Row index")
plt.ylabel("Sale Price");

In [None]:
def plot_data(col, discrete=False):
    """Plot `col` against SalePrice next to the distribution of `col`.

    Reads the notebook-level `data` frame.

    Parameters
    ----------
    col : str
        Name of a column in `data`.
    discrete : bool, default False
        If True, draw a strip plot and a count plot (categorical view);
        otherwise draw a scatter plot and a histogram (continuous view).
    """
    figsize = (14, 6) if discrete else (12, 6)
    fig, ax = plt.subplots(1, 2, figsize=figsize)
    if discrete:
        sns.stripplot(x=col, y='SalePrice', data=data, ax=ax[0])
        # Pass the vector as `x=`: current seaborn treats the first positional
        # argument as `data`, not as the variable to count.
        sns.countplot(x=data[col], ax=ax[1])
    else:
        sns.scatterplot(x=col, y='SalePrice', data=data, ax=ax[0])
        sns.distplot(data[col], kde=False, ax=ax[1])
    fig.suptitle(str(col) + ' Analysis')

In [None]:
# Drop rows rated OverallQual 10 that nevertheless sold for under 200000.
cheap_top_quality = (data['OverallQual'] == 10) & (data['SalePrice'] < 200000)
data = data.drop(index=data.index[cheap_top_quality])

In [None]:
# Total_Bath against SalePrice before the bathroom outliers are dropped.
plot_data('Total_Bath')

In [None]:
# Drop rows with more than 4 total baths that sold for under 200000.
cheap_many_baths = (data['Total_Bath'] > 4) & (data['SalePrice'] < 200000)
data = data.drop(index=data.index[cheap_many_baths])

In [None]:
# Same plot again, after the bathroom outliers were dropped.
plot_data('Total_Bath')

In [None]:
# Drop rows with a basement above 3000 sq ft that sold for under 400000.
cheap_big_basement = (data['TotalBsmtSF'] > 3000) & (data['SalePrice'] < 400000)
data = data.drop(index=data.index[cheap_big_basement])

NEN

In [None]:
# (rows, columns) left after outlier removal.
data.shape

In [None]:
# Displays a copy of `data` with a fresh index. The result is not assigned,
# so `data` itself keeps its current index labels.
data.reset_index()

TOP FEATURES

In [None]:
# Correlation of every numeric column with SalePrice, strongest first.
# Non-numeric columns are excluded explicitly: pandas 2.x no longer drops them
# silently in DataFrame.corr() and raises on string columns instead.
numeric_data = data.select_dtypes(include='number')
top_features = numeric_data.corr()[['SalePrice']].sort_values(by=['SalePrice'],ascending=False)
plt.figure(figsize=(12,20))
sns.heatmap(top_features,cmap='rainbow',annot=True,annot_kws={"size": 14},vmin=-1)

`Total_Square_Feet`, `GrLivArea`, `TotalBsmtSF`, `Total_Bath`, `GarageCars`, `1stFlrSF`, `GarageArea`, `SqFtPerRoom`, `TotRmsAbvGrd` and `MasVnrArea` each have a correlation above 0.5 with `SalePrice`.

`EnclosedPorch` and `KitchenAbvGr` have a slight negative correlation with the target variable.

These can prove to be important features to predict SalePrice.


In [None]:
# Pairwise correlations among SalePrice and the features listed below.
corr_columns = [
    "SalePrice", "Total_Square_Feet", "GrLivArea", "TotalBsmtSF", "Total_Bath",
    "GarageCars", "1stFlrSF", "GarageArea", "TotRmsAbvGrd", "MasVnrArea",
]
corrMatrix = data[corr_columns].corr()

sns.set(font_scale=1.10)
plt.figure(figsize=(10, 10))

heatmap_options = dict(vmax=.8, linewidths=0.01, square=True,
                       annot=True, cmap='viridis', linecolor="white")
sns.heatmap(corrMatrix, **heatmap_options)
plt.title('Correlation b/w features');

Check for Multicollinearity


Strong correlation of these features to other, similar features:


'GrLivArea' and 'TotRmsAbvGrd'


'GarageCars' and 'GarageArea'


'TotalBsmtSF' and 'Total_Square_Feet'


'TotalBsmtSF' and '1stFlrSF'

Of each pair, we drop the feature that has the smaller correlation coefficient with the target.
'TotRmsAbvGrd', 'TotalBsmtSF' and 'GarageArea' are removed.


Check the distribution of the 'SalePrice' variable in the dataset.

In [None]:
# Histogram of the training sale prices.
price_ax = sns.distplot(houses['SalePrice'], color="r", kde=False)
price_ax.set_title("Distribution of Sale Price")
price_ax.set_ylabel("Number of Occurences")
price_ax.set_xlabel("Sale Price");

In [None]:
# Sale price per square foot of above-ground living area.
# Both operands come from `data`, so the division does not depend on `data`
# and `houses` sharing index labels.
# NOTE(review): SalePriceSF is derived from the target; make sure it is not
# used as a model feature.
data['SalePriceSF'] = data['SalePrice']/data['GrLivArea']
plt.hist(data['SalePriceSF'], bins=15,color="gold")
plt.title("Sale Price per Square Foot")
plt.ylabel('Number of Sales')
plt.xlabel('Price per square feet');

In [None]:
# Average sale price per square foot across all remaining rows.
print(data.SalePriceSF.mean())

In [None]:
# Price per square foot against the age of the house at sale.
fig, ax = plt.subplots()
ax.scatter(data['ConstructionAge'], data['SalePriceSF'])
ax.set_ylabel('Price per square foot (in dollars)')
ax.set_xlabel("Construction Age of house");

CATEGORICAL

In [None]:
# Sale price by heating quality, split by whether the house has central air.
# `dodge=True` separates the hue levels; it is the current name of the old
# `split=` argument, which recent seaborn no longer accepts.
sns.stripplot(x="HeatingQC", y="SalePrice",data=data,hue='CentralAir',jitter=True,dodge=True)
plt.title("Sale Price vs Heating Quality");

In [None]:
# Sale price per overall-quality rating, one horizontal bar per row.
fig, ax = plt.subplots()
ax.barh(data["OverallQual"], width=data["SalePrice"], color="r")
ax.set_title("Sale Price vs Overall Quality of house")
ax.set_ylabel("Overall Quality of house")
ax.set_xlabel("Sale Price")
plt.show()


In [None]:
# Sale price distribution per number of full bathrooms.
# x and y are passed by keyword: current seaborn accepts only `data` positionally.
sns.boxplot(x=data["FullBath"], y=data["SalePrice"])
plt.title("Sale Price vs Full Bathrooms");

In [None]:
# Sale price by number of kitchens above grade, one line per kitchen quality.
# catplot(kind="point") is the replacement for the removed `factorplot`,
# whose default plot kind was the point plot.
sns.catplot(x="KitchenAbvGr", y="SalePrice", data=data, hue="KitchenQual", kind="point")
plt.title("Sale Price vs Kitchen");

In [None]:
# Share of each zoning class, drawn as a pie chart.
# Labels and sizes both come from the same value_counts() result so they stay
# paired. unique() returns categories in order of appearance, which does not
# match the count-sorted sizes and mislabels the wedges.
zoning_counts = data["MSZoning"].value_counts()
sizes = zoning_counts.values
percent = 100.*sizes/sizes.sum()
labels = ['{0} - {1:1.1f} %'.format(i,j) for i,j in zip(zoning_counts.index, percent)]
# Pull out the largest wedge only; sized to however many classes there are.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]

colors = ['yellowgreen', 'gold', 'lightblue', 'lightcoral','blue']
patches, texts= plt.pie(sizes, colors=colors,explode=explode,
                        shadow=True,startangle=90)
plt.legend(patches, labels, loc="best")

plt.title("Zoning Classification")
plt.show()



In [None]:
from sklearn.cluster import KMeans

# Median sale price per neighbourhood, cheapest first.
neighborhood_prices = (
    houses[['Neighborhood', 'SalePrice']]
    .groupby('Neighborhood')
    .median()
    .sort_values('SalePrice')
)

plt.figure(figsize=(15,7))
ax = sns.barplot(x= neighborhood_prices.index, y=neighborhood_prices['SalePrice'], color='grey')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

# Group neighbourhoods into three price tiers.
# random_state makes the clustering reproducible between runs.
neighborhood_prices['Cluster'] = (
    KMeans(n_clusters=3, random_state=0).fit(neighborhood_prices[['SalePrice']]).labels_
)
# KMeans label ids are arbitrary; renumber them by ascending mean price so that
# 0 is the cheapest tier and 2 the most expensive, making the ids ordinal.
tier_of_cluster = (
    neighborhood_prices.groupby('Cluster')['SalePrice'].mean().rank(method='first').astype(int) - 1
)
neighborhood_prices['Cluster'] = neighborhood_prices['Cluster'].map(tier_of_cluster)

plt.figure(figsize=(15,7))
ax = sns.barplot(x= neighborhood_prices.index, y=neighborhood_prices['SalePrice'],
                 hue=neighborhood_prices['Cluster'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

# Attach the price tier of each row's neighbourhood as a new feature.
# map() builds the column in one assignment, instead of replace(inplace=True)
# on a column selection, which may not write through to `data`.
neighborhood_dict = dict(zip(neighborhood_prices.index, neighborhood_prices.Cluster))
data['Neighborhood_Class'] = data['Neighborhood'].map(neighborhood_dict)
print(data['Neighborhood_Class'].head())

In [None]:
import scipy.stats as st

# Every non-object column is a candidate for the skew correction.
numeric_feats = [col for col in data.columns if data[col].dtype != 'object']

# Skewness of each numeric column before any transformation.
skewed_feats = data[numeric_feats].apply(lambda x: st.skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew Before Transformation' :skewed_feats})

# Keep only columns whose absolute skew exceeds 1.
skewness = skewness[abs(skewness) > 1].dropna(axis=0)
skewed_features = skewness.index
# NOTE: any numeric column above the threshold is transformed, including
# SalePrice if it qualifies, so the target may end up on a log scale.
for feat in skewed_features:
    # log1p(x) is already log(1 + x); adding another 1 first shifted the
    # values twice, giving log(2 + x).
    data[feat] = np.log1p(data[feat])

# Skewness of the same columns after the transformation, for comparison.
skewed_feats = data[skewed_features].apply(lambda x: st.skew(x.dropna())).sort_values(ascending=False)
skewness['Skew After Transformation'] = skewed_feats
skewness

In [None]:
# Columns present after imputation, feature engineering and the skew transform.
data.columns

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

# Replace every object (string) column with integer codes, one encoder per column.
object_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in object_columns:
    data[name] = LabelEncoder().fit_transform(data[name])

In [None]:
# Work on a copy so later steps do not modify `data`.
data_chosen=data.copy()

In [None]:
# Per-column dtype and non-null count of the encoded frame.
data_chosen.info()

In [None]:
# One-hot encode any categorical columns that are still present.
# NOTE(review): the object columns were label-encoded in an earlier cell, so
# this probably leaves the frame unchanged - confirm via the printed shape.
data_chosen = pd.get_dummies(data_chosen)
print(data_chosen.shape)
data_chosen.head(3)

In [None]:
# Columns left out of the feature matrix.
excluded_features = [
    'MiscVal', 'YrSold', 'MSSubClass', 'Neighborhood', 'Condition2', 'Condition1',
    'Street', 'GarageType', 'Utilities', '3SsnPorch', 'RoofStyle', 'Functional',
    'RoofMatl', 'MoSold', 'PavedDrive', 'FireplaceQu',
]

# One drop call returns a new frame, so `data_chosen` is left untouched.
X = data_chosen.drop(excluded_features + ['SalePrice'], axis='columns')

# SalePriceSF is SalePrice divided by GrLivArea, i.e. it is computed from the
# target. Leaving it in X lets a model recover SalePrice directly (target
# leakage) and inflates every score. errors='ignore' keeps this cell working
# when the column was never created.
X = X.drop(['SalePriceSF'], axis='columns', errors='ignore')

y= data_chosen['SalePrice'].values

In [None]:
from sklearn.model_selection import train_test_split
# FEATURE SELECTION & DATA SPLIT
# Hold out 25% of the rows for evaluation; random_state fixes the shuffle.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,random_state=0)
# Ids of the held-out rows, used to label the prediction frames in later cells.
test_id = X_test['Id']

In [None]:
def rmse(predictions, targets):
    """Return the root-mean-squared error between two equally long sequences.

    Parameters
    ----------
    predictions, targets : array-like
        Predicted and true values. Lists, NumPy arrays and pandas Series are
        all accepted and are compared element by element, by position.

    Returns
    -------
    numpy.float64
        sqrt(mean((predictions - targets) ** 2)).
    """
    # Convert to plain arrays first: subtracting two pandas Series aligns them
    # on their index labels, which yields NaN when the labels differ.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))

In [None]:
# Lasso regression on robust-scaled features, scored on the held-out split.
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.003))
lasso_preds = np.array(lasso.fit(X_train, y_train).predict(X_test))

# Prediction frame keyed by the held-out Ids.
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = lasso_preds
#sub.to_csv('Lasso Submission 2.csv',index=False)

print("RMSE FOR LASSO: ")
rmse(lasso_preds,y_test)

In [None]:
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so swapping the
# arguments reports a different number.
r2_score(y_test, lasso_preds)

In [None]:
# Ridge regression on robust-scaled features, scored on the held-out split.
ridge = make_pipeline(RobustScaler(), Ridge(alpha = 50))
ridge_preds = np.array(ridge.fit(X_train, y_train).predict(X_test))

# Prediction frame keyed by the held-out Ids.
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = ridge_preds
#sub.to_csv('Ridge Submission.csv',index=False)

print("RMSE FOR RIDGE: ")
rmse(ridge_preds,y_test)

In [None]:
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so swapping the
# arguments reports a different number.
r2_score(y_test, ridge_preds)

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees, scored on the held-out split.
xgb_params = dict(max_depth=18, learning_rate=0.12, colsample_bytree=1,
                  gamma=0.03, subsample=0.5)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

final_xgb=pd.DataFrame()
final_xgb['SalePrice'] = xgb_pred

print("RMSE FOR XGB: ")
rmse(xgb_pred,y_test)

In [None]:
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so swapping the
# arguments reports a different number.
r2_score(y_test, xgb_pred)

In [None]:
# Refit the three base models and blend the Lasso and XGBoost predictions.
lasso = make_pipeline(RobustScaler(), Lasso(alpha = 0.003))
lasso.fit(X_train, y_train)
lasso_preds = lasso.predict(X_test)

# NOTE(review): alpha is 40 here but 50 in the stand-alone ridge cell.
ridge = make_pipeline(RobustScaler(), Ridge(alpha = 40))
ridge.fit(X_train, y_train)
ridge_preds = ridge.predict(X_test)

xgb = XGBRegressor(max_depth=18,learning_rate=0.12,colsample_bytree=1,gamma=0.03,subsample=0.5)
xgb.fit(X_train,y_train)
xg_preds =xgb.predict(X_test)

# Blend weights: first for Lasso, second for XGBoost (ridge is not blended).
weights = [0.60, 0.40]
lasso_weight, xgb_weight = weights
ensemble_preds = (lasso_preds*lasso_weight) + (xg_preds*xgb_weight)

# Prediction frame keyed by the held-out Ids.
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = ensemble_preds

print("RMSE FOR ENSEMBLE")
rmse(sub['SalePrice'],y_test)


In [None]:
# Mean absolute error of the blended predictions (arguments in y_true, y_pred order).
mean_absolute_error(y_test, ensemble_preds)

In [None]:
# Mean absolute error of the Lasso predictions (arguments in y_true, y_pred order).
mean_absolute_error(y_test, lasso_preds)

In [None]:
# Mean absolute error of the XGBoost predictions (arguments in y_true, y_pred order).
mean_absolute_error(y_test, xg_preds)

In [None]:
# Mean absolute error of the Ridge predictions (arguments in y_true, y_pred order).
mean_absolute_error(y_test, ridge_preds)

In [None]:
# Overlay the held-out predictions of every model, one marker style per model.
plt.figure(figsize=(15,10))
plt.grid(False)

prediction_series = [
    (lasso_preds, 'gd', 'Lasso regression'),
    (ridge_preds, 'b^', 'Ridge Regression'),
    (xgb_pred, 'ys', 'Gradient Boosting'),
    (ensemble_preds, 'r*', 'Ensemble model'),
]
for preds, marker, label in prediction_series:
    plt.plot(preds, marker, label=label)

plt.ylabel('predicted')
plt.xlabel('testing samples')
plt.legend(loc="best")
plt.title('Regressor predictions')
plt.show()

In [None]:
# Ensemble predictions against total square footage.
# plt.scatter needs both x and y; with a single argument it raises TypeError.
# The x axis matches the ridge and XGBoost scatter plots in the cells that follow.
plt.scatter(X_test['Total_Square_Feet'],ensemble_preds)
plt.xlabel('Total_Square_Feet')
plt.ylabel('Ensemble prediction');

In [None]:
# Ridge predictions against total square footage, with labelled axes.
plt.scatter(X_test['Total_Square_Feet'],ridge_preds)
plt.xlabel('Total_Square_Feet')
plt.ylabel('Ridge prediction');

In [None]:
# XGBoost predictions against total square footage, with labelled axes.
plt.scatter(X_test['Total_Square_Feet'],xg_preds)
plt.xlabel('Total_Square_Feet')
plt.ylabel('XGBoost prediction');

In [None]:
# First ten predictions currently stored in `sub`.
sub['SalePrice'][:10]

In [None]:
# First ten true targets of the held-out split, for comparison.
y_test[:10]