![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRVYQKCyIGvHTNjxk4gmbnDI0lezi3oZmsenFKnAYnj15g1qM4z&usqp=CAU)

**Predicting sale prices for houses, even stranger ones. And what’s up with that basement?**

In [1]:

import numpy as np # linear algebra
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't render on your side
%matplotlib inline

# Apply the 'bmh' matplotlib style sheet to the figures drawn below
plt.style.use('bmh')

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [1]:
# Data: train/test frames plus the sample-submission template.
# The input directory is named once so the notebook can be pointed at another
# copy of the files by editing a single line.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
test = pd.read_csv(f'{DATA_DIR}/test.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [1]:
# Preview the first rows of the training frame
train.head()

In [1]:
# Column dtypes and non-null counts of the training frame
train.info()

**Let's perform some exploratory data analysis (EDA)**

In [1]:
# Summary statistics of the target, followed by its distribution.
print(train['SalePrice'].describe())
fig, ax = plt.subplots(figsize=(9, 8))
# `sns.distplot` is deprecated; `histplot` with kde=True and stat='density'
# draws the same kind of density-normalised histogram with a KDE overlay.
sns.histplot(train['SalePrice'], color='g', bins=100, kde=True,
             stat='density', alpha=0.4, ax=ax)
ax.set_title('Distribution of SalePrice');

In [1]:
# Keep only the float64 / int64 columns of the training frame
df_num = train.select_dtypes(include=['float64', 'int64'])


In [1]:
# Correlation of every numeric column with SalePrice. The last entry is dropped
# positionally, which relies on SalePrice being the last numeric column.
saleprice_corr = df_num.corr()['SalePrice']
df_num_corr = saleprice_corr.iloc[:-1]
# Keep features whose absolute correlation exceeds 0.5, strongest positive first
is_strong = df_num_corr.abs() > 0.5
golden_features_list = df_num_corr[is_strong].sort_values(ascending=False)
print("There are  {} strong correlated values with SalePrice:\n{}".format(
    len(golden_features_list), golden_features_list))

In [1]:
# One row of scatter plots per group of five numeric columns, each against SalePrice
numeric_cols = df_num.columns
for start in range(0, len(numeric_cols), 5):
    sns.pairplot(data=df_num, y_vars=['SalePrice'], x_vars=numeric_cols[start:start + 5])

> Let's build a heatmap of the correlations between variables

In [1]:
# Pairwise correlations among the numeric features, with the target excluded
corr = df_num.drop(columns='SalePrice').corr()

# Only cells with correlation >= 0.5 or <= -0.4 are kept; the rest become NaN
# and are left blank by the heatmap.
notable = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(12, 10))
sns.heatmap(corr.where(notable),
            vmin=-1.0, vmax=1.0, cmap='jet', linewidths=0.1,
            square=True, annot=True, annot_kws={"size": 8});

**Take time to explore `data_description.txt`**

In [1]:
# Columns treated as quantitative in this analysis; SalePrice (the target) is last.
quantitative_features_list = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
    'SalePrice',
]
# Column subset of the training frame, in the order listed above
df_quantitative_values = train.loc[:, quantitative_features_list]
df_quantitative_values.head()

In [1]:
import operator

# For each numeric feature, build a two-column frame (feature, SalePrice) that
# keeps only the rows where the feature is non-zero, so that zeros do not enter
# the correlation.
# SalePrice is excluded by name rather than by assuming it is the last column.
individual_features_df = []
for feature_name in df_num.columns.drop('SalePrice'):
    pair = df_num[[feature_name, 'SalePrice']]
    individual_features_df.append(pair[pair[feature_name] != 0])

# Map feature name -> correlation with SalePrice. `.iloc[0]` selects the
# feature's row of the SalePrice column by position; a bare `[0]` on this
# label-indexed Series is deprecated positional indexing.
all_correlations = {
    pair.columns[0]: pair.corr()['SalePrice'].iloc[0]
    for pair in individual_features_df
}
# Sorted list of (feature, correlation) tuples, ascending by correlation
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))


**Getting the highly correlated variables**

In [1]:
# Names of the features whose absolute correlation with SalePrice is at least 0.5,
# in the order they appear in `all_correlations`
golden_features_list = []
for key, value in all_correlations:
    if abs(value) >= 0.5:
        golden_features_list.append(key)
print("There are {} strongly correlated values with SalePrice:\n{}".format(
    len(golden_features_list), golden_features_list))

In [1]:
# Quantitative features that are also strongly correlated, in the order of the
# quantitative list, with the target appended at the end
features_to_analyse = []
for name in quantitative_features_list:
    if name in golden_features_list:
        features_to_analyse.append(name)
features_to_analyse += ['SalePrice']

In [1]:
# Every training column that is not one of the quantitative features (the final
# entry of the quantitative list, SalePrice, is not excluded). This is what the
# earlier symmetric-difference expression reduced to, because every quantitative
# feature is a column of `train`; the lookup set is now built once instead of
# rebuilding lists for each element.
quantitative_inputs = set(quantitative_features_list[:-1])
categorical_features = [col for col in train.columns.tolist() if col not in quantitative_inputs]
df_categ = train[categorical_features]

In [1]:
# Preview the first rows only, rather than displaying the whole frame
df_categ.head()

**Chi-square test for categorical variables?**

In [1]:
# Object-dtype columns among the non-quantitative features
df_not_num = df_categ.select_dtypes(include='object')
non_numeric_names = df_not_num.columns.tolist()
print('There is {} non numerical features including:\n{}'.format(len(non_numeric_names), non_numeric_names))

**Training a linear-regression pipeline, since the data contains highly correlated features and follows a Gaussian distribution**

In [1]:
# Features: every column except the target. Target: a one-column DataFrame.
X_train = train.drop('SalePrice', axis=1)
Y_train = train.loc[:, ['SalePrice']]

In [1]:
# Split the feature columns by dtype: numeric vs. everything else
num_feat = list(X_train.select_dtypes(include='number').columns)
cat_feat = list(X_train.select_dtypes(exclude='number').columns)

In [1]:
!pip install impyute

In [1]:
from impyute.imputation.cs import mice

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [1]:
# Numeric columns: fill missing values with the column mean, then standardise
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [1]:
# Route each column group through its own preprocessing pipeline; columns that
# are in neither list are dropped.
ct = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe, num_feat),
        ('categorical', cat_pipe, cat_feat),
    ],
    remainder='drop',
)

# Full model: preprocessing -> degree-2 polynomial expansion -> Lasso with
# default hyper-parameters
model = Pipeline(steps=[
    ('transformer', ct),
    ('poly', PolynomialFeatures(2)),
    ('predictor', Lasso()),
])

In [1]:
# Fit every step of the pipeline on the training features and target
model.fit(X_train, Y_train)


In [1]:
# Scored on the same data the model was fitted on, so this is an in-sample
# figure rather than an estimate of performance on unseen data
print(model.score(X_train, Y_train))


In [1]:
def submission(test, model, submission_df=None, output_dir='/kaggle/working'):
    """Predict on ``test`` and write a timestamped submission CSV.

    Parameters
    ----------
    test : features passed unchanged to ``model.predict``.
    model : any fitted object exposing ``predict``.
    submission_df : DataFrame whose ``SalePrice`` column is overwritten with the
        predictions and which is then written out. Defaults to the
        notebook-level ``sample`` frame.
    output_dir : directory the CSV is written to.

    Returns
    -------
    None. The file ``<output_dir>/<dd_mm_YYYY_HH-MM_>result.csv`` is written and
    ``submission_df`` is modified in place.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``submission_df``.
    """
    if submission_df is None:
        # Fall back to the sample-submission frame loaded at notebook level
        submission_df = sample
    # Flatten so that an (n, 1) prediction array is stored as a plain column
    y_pred = np.ravel(model.predict(test))
    if len(y_pred) != len(submission_df):
        raise ValueError(
            f'got {len(y_pred)} predictions for {len(submission_df)} submission rows'
        )
    submission_df['SalePrice'] = y_pred
    # `pd.datetime` was removed in pandas 2.0; `pd.Timestamp.now()` gives the
    # same timestamp formatting without a new import
    date = pd.Timestamp.now().strftime('%d_%m_%Y_%H-%M_')
    submission_df.to_csv(f'{output_dir}/{date}result.csv', index=False)

In [1]:
# Predict on the test frame with the fitted pipeline and write the submission CSV
submission(test,model)