In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Read the Ames housing sales CSV into a DataFrame and preview its first rows
path = '../input/regression/Ames_Housing_Sales.csv'
housing_data = pd.read_csv(filepath_or_buffer=path)
housing_data.head()

In [None]:
# Print a summary of the frame (column names, non-null counts and dtypes)
housing_data.info()

In [None]:
# Summary statistics of the numeric columns: used to get an idea of outliers
# and of whether the features need scaling
housing_data.describe()

In [None]:
# Boolean Series (indexed by column name): True where the column holds Python
# objects, i.e. the string/categorical columns.
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24.
mask = housing_data.dtypes == object

In [None]:
# Split the column names into two groups using the object-dtype mask:
# masked columns are categorical, the remaining ones are numerical
categorical_col = housing_data.dtypes.loc[mask].index
numerical_col = housing_data.dtypes.loc[~mask].index

In [None]:
# Create a copy of the data (good practice: this way the original data is not lost
# when the copy is modified later on)
housing_data_copy=housing_data.copy()

In [None]:
def scatterplot(data,x,y):
    """
        Draw and show a scatter plot of column `y` against column `x`.

        Parameters
        data : DataFrame that holds both columns
        x : name of the column plotted on the x-axis
        y : name of the column plotted on the y-axis

        Returns
        None (the figure is displayed with plt.show())
    """
    # Plot the frame that was passed in rather than a module-level global,
    # so the function works for any DataFrame the caller supplies.
    sns.scatterplot(data=data,x=x,y=y)
    plt.title("Plot Between "+x+" And Sales Price")
    plt.show()

In [None]:
# One scatter plot per numerical column, each against the sale price
for col in numerical_col:
    scatterplot(data=housing_data_copy, x=col, y='SalePrice')

- Analysis For Numerical columns
    - 1stFlrSF
        - It has a strongly linear relationship with SalePrice.
        - As the value increases, SalePrice also increases.
    - 2ndFlrSF
        - It has a polynomial relationship with SalePrice.
        - As the value increases, SalePrice increases rapidly.
    - 3SsnPorch
        - Most of the values are zero.
        - The non-zero values are widely scattered.
        - No underlying pattern.
        - Choose to drop the column.
    - BedroomAbvGr
        - Houses with between 1 and 4 BedroomAbvGr reach a higher SalePrice.
        - Houses with more or fewer BedroomAbvGr have a lower SalePrice.
    - BsmtFinSF1
        - Some values are zero still have significant high SalePrice.
        - Non-Zero Values have a rising pattern as the BsmtFinSF1 increases.
    - BsmtFinSF2
        - Most values are zero, and those houses still reach a much higher SalePrice.
        - Houses with non-zero values have a mid-range SalePrice, whereas houses with a zero value are priced both higher and lower.
        - SalePrice does not depend on BsmtFinSF2.
        - Choose to drop the column.
    - BsmtFullBath
        - Houses with zero or one BsmtFullBath are priced higher than the rest.
        - This indicates that having more BsmtFullBath doesn't necessarily mean a higher price.
    - BsmtHalfBath
        - Houses with 0 BsmtHalfBath have a higher SalePrice.
        - Houses with more than 0 reach a lower maximum SalePrice, but their minimum SalePrice is significantly higher.
    - BsmtUnfSF
        - Although the data seems scattered, examining it closely shows a polynomial relationship with SalePrice.
        - At first SalePrice stays roughly constant and shows no deviation; later, as the values keep increasing, SalePrice tends to increase.
    - EnclosedPorch
        - Most of the houses have no EnclosedPorch, yet they still have a high SalePrice.
        - Houses that do have an enclosed porch show no change in SalePrice.
        - Choose to drop the column.
    - Fireplaces
        - It is clearly visible that SalePrice increases as the number of fireplaces increases.
        - However, after a certain point it becomes constant and then starts decreasing.
    - FullBath
        - It can be seen that the variable has a polynomial relationship with SalePrice.
        - As the value increases, SalePrice increases.
    - GarageArea
        - We can see there is gradual increase in SalePrice as the values increase.
        - And for extreme high values they decrease.
    - GarageCars
        - It can be seen that the variable has a polynomial relationship with SalePrice.
        - The minimum SalePrice of the houses increases as the value increases.
    - GarageYrBlt
        - Newly built garages are preferred by buyers and are more valuable than older ones.
        - They have higher SalePrice values than the older ones.
    - GrLivArea
        - It can be seen that the variable has a polynomial relationship with SalePrice.
        - SalePrice increases with the value.
    - HalfBath
        - Houses with 1 HalfBath have a higher SalePrice.
        - As the number of HalfBath increases, the minimum SalePrice of the house also increases.
    - KitchenAbvGr
        - Having 1 KitchenAbvGr tends to give a higher SalePrice.
        - However, as the number of KitchenAbvGr increases, the minimum SalePrice of the house also increases.
    - LotArea
        - Having a moderate LotArea offers a high SalePrice.
        - Having too large or too small a lot area offers a low SalePrice.
    - LotFrontage
        - Data is concentrated to one point.
        - It doesn't affect the SalePrice much.
        - Choose to drop the column.
    - LowQualFinSF
        - Most of the data points are zero.
        - The non-zero data points don't have any underlying pattern.
        - Choose to drop the column.
    - MSSubClass
        - The data points show too much uncertainty.
        - For low values of MSSubClass, SalePrice is very low at first and starts increasing, then falls and starts increasing again after some time.
        - The reason may be that it does not contribute to the SalePrice of the houses.
        - Choose to drop the column.
    - MasVnrArea
        - Many of the data points are zero,
        - but we can observe that SalePrice increases with MasVnrArea.
    - MiscVal
        - Many values are zero, yet those houses still manage to get a higher SalePrice.
        - This clearly means that the feature does not contribute to SalePrice.
        - Choose to drop the column.
    - MoSold
        - We can see that the graph is constant for each month.
        - We can't interpret any pattern from the visualization.
        - Choose to drop the column.
    - OpenPorchSF
        - Most of the values are zero.
        - For non-zero values, the minimum SalePrice increases as the value increases.
    - OverallCond
        - The better the condition of the house, the higher its SalePrice.
        - The minimum SalePrice of the house also increases.
    - OverallQual
        - As the quality of the house increases, SalePrice increases to a large extent.
        - It can be an important feature in the dataset.
    - PoolArea
        - 98% of the values are zero, and those houses still have a high SalePrice.
        - This makes it clear that it is not an important predictor.
    - ScreenPorch
        - Most values are zero.
        - For Non-zero values as the value of ScreenPorch increase minimum SalePrice of the house also increases.
    - TotRmsAbvGrd
        - It can be seen that the variable has a polynomial relationship with SalePrice.
        - The minimum SalePrice of the house also increases.
    - TotalBsmtSF
        - An increase in TotalBsmtSF results in an increase in SalePrice.
        - This is due to the polynomial relationship that exists between them.
    - WoodDeckSF
        - Having WoodDeckSF will definitely increase the SalePrice 
        - Also the minimum SalePrice of the house is also increased.
    - YearBuilt
        - The most recently built houses tend to sell at a higher SalePrice.
        - There is also a high number of newly built houses.
    - YearRemodAdd
        - Houses in the 1980 to 2000 range tend to have a high minimum SalePrice.
        - For the others we can see an increasing pattern in SalePrice.
    - YrSold
        - We can observe that it is constant throughout.
        - Conclusively we can say it doesn't contribute in SalePrice.

In [None]:
# Columns excluded from the numerical feature set.
# 'SalePrice' is listed as well so that the target is not kept among the features.
# NOTE(review): the analysis above also describes PoolArea and YrSold as not
# contributing to SalePrice, but they are not listed here; confirm this is intended.
columns_to_drop = [
    '3SsnPorch',
    'BsmtFinSF2',
    'EnclosedPorch',
    'LotFrontage',
    'LowQualFinSF',
    'MSSubClass',
    'MiscVal',
    'MoSold',
    'SalePrice',
]

In [None]:
# Redefine the numerical columns: keep every numerical column that is not in
# columns_to_drop. A list comprehension is used instead of a set difference so
# the columns keep their original order; set iteration order for strings can
# differ between interpreter sessions, which would reorder the feature matrix.
numerical_col = [col for col in numerical_col if col not in columns_to_drop]

In [None]:
def countplot(data,x):
    """
        Draw a count plot (number of rows per category) for column `x`.

        Parameters
        data : DataFrame that holds the column
        x : name of the column whose categories are counted

        Returns
        None (draws on the current matplotlib axes)
    """
    # Plot the frame that was passed in rather than a module-level global,
    # so the function works for any DataFrame the caller supplies.
    sns.countplot(data=data,x=x)
    plt.title('Countplot for '+x)
    # Rotate the tick labels so long category names stay readable
    plt.xticks(rotation=90)
    
def boxplot(data,x,y):
    """
        Draw a box plot of column `y` grouped by the categories of column `x`.

        Parameters
        data : DataFrame that holds both columns
        x : name of the column plotted on the x-axis
        y : name of the column plotted on the y-axis

        Returns
        None (draws on the current matplotlib axes)
    """
    # Plot the frame that was passed in rather than a module-level global,
    # so the function works for any DataFrame the caller supplies.
    sns.boxplot(data=data,x=x,y=y)
    plt.title('Boxplot for '+x)
    # Rotate the tick labels so long category names stay readable
    plt.xticks(rotation=90)

In [None]:
# Group the rare levels of every categorical column:
# any category with fewer than 100 observations is relabelled as 'Others'.

for col in categorical_col:
    # Number of rows per category (rows with a missing value are not counted)
    grouped = housing_data_copy.groupby(col)[col].count()
    rare_values = [value for value in grouped.index if grouped[value] < 100]
    if rare_values:
        # Assign the result back to the frame. Calling
        # Series.replace(..., inplace=True) on a column selection is chained
        # assignment: pandas may apply it to a temporary copy, so the frame
        # itself is not guaranteed to change.
        housing_data_copy[col] = housing_data_copy[col].replace(rare_values, 'Others')

In [None]:
# For every categorical column draw, side by side, a box plot of the sale price
# per category (left) and the number of rows per category (right)
for col in categorical_col:
    plt.figure(figsize=(13, 6))

    plt.subplot(1, 2, 1)
    boxplot(data=housing_data_copy, x=col, y='SalePrice')

    plt.subplot(1, 2, 2)
    countplot(data=housing_data_copy, x=col)

In [None]:
# Add the categorical columns that are not needed to the drop list
# (the list is extended in place)
columns_to_drop += [
    'BsmtFinType2',
    'Condition2',
    'Electrical',
    'Fence',
    'Functional',
    'MiscFeature',
    'PavedDrive',
    'Street',
    'Utilities',
    'Exterior2nd',
    'Exterior1st',
    'CentralAir',
    'Heating',
]

In [None]:
# Redefine the categorical columns: keep every categorical column that is not
# in columns_to_drop. A list comprehension is used instead of a set difference
# so the columns keep their original order; set iteration order for strings can
# differ between interpreter sessions.
categorical_col=[col for col in categorical_col if col not in columns_to_drop]

In [None]:
# Split the categorical columns into the ones that get ordinal (label) encoding
# and the ones that get one-hot encoding, because they are transformed differently.
# NOTE(review): 'Condition1' is presumably not an ordered scale; verify that it
# belongs in the ordinal group.
column_for_label_encoding=['BsmtCond','BsmtExposure','BsmtQual','Condition1','ExterCond','ExterQual','FireplaceQu','GarageQual','HeatingQC','KitchenQual','GarageCond']
# Every remaining categorical column is one-hot encoded. The comprehension keeps
# the order of categorical_col instead of the arbitrary order of a set difference.
column_for_hot_encoding=[col for col in categorical_col if col not in column_for_label_encoding]

In [None]:
# Empty results table: one row is added per model / polynomial degree / alpha run
Scores = pd.DataFrame(
    columns=[
        'Model',
        'Polynomial_degree',
        'Alpha_value',
        'Train_score(MAE)',
        'Test_score(MAE)',
        'R2_score(test_data)',
    ]
)

## Feature Engineering

In [None]:
# Used to label-encode categorical data as integer codes.
# Categories not seen during fit are encoded as -1: a negative value can never
# collide with a real code (codes start at 0), whereas a positive sentinel such
# as 10 is rejected at fit time once a column has more than 10 categories.
# NOTE(review): with the default `categories`, the codes follow the sorted order
# of the labels, not a quality order such as ('Good', 'Bad', 'Worst'); confirm
# whether explicit category orderings should be supplied.
oe=OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)

# Used to one-hot encode the categorical data; categories unseen during fit are
# ignored (encoded as all zeros) instead of raising an error
ohe=OneHotEncoder(handle_unknown='ignore')

# Used to scale the numerical data
ss=StandardScaler()

In [None]:
# The ColumnTransformer lets us apply a different transformation to each group
# of columns within one DataFrame: scaling, ordinal encoding and one-hot encoding
transformer = ColumnTransformer(
    transformers=[
        ('num', ss, numerical_col),
        ('ordinal', oe, column_for_label_encoding),
        ('hotencode', ohe, column_for_hot_encoding),
    ]
)

##  Model Creation

In [None]:
# Cross-validation splitter: 3 shuffled folds with a fixed random_state so the
# same splits are produced on every run. It is used to validate each model
# over all of the available data.
folds=KFold(n_splits=3,shuffle=True,random_state=10)

In [None]:
# All feature columns used for modelling: the kept categorical columns followed
# by the kept numerical columns (both are plain lists at this point)
total_column=categorical_col+numerical_col

In [None]:
# Separate the feature matrix (X) from the target variable (y)
X = housing_data_copy.loc[:, total_column]
y = housing_data_copy['SalePrice']

###  Linear Regression

In [None]:
# Cross-validate a plain linear regression for polynomial degrees 1 to 3 and
# record the fold-averaged scores in `Scores`.
for deg in [1,2,3]:
    #adding polynomial expression to data
    pf=PolynomialFeatures(degree=deg)

    #model creation
    LR=LinearRegression()

    #creation of preprocessor and pipeline
    preprocessor=Pipeline(steps=[('transform',transformer),
                                 ('polynomial',pf)])
    model_pipeline=Pipeline(steps=[('preprocess',preprocessor),
                              ('model',LR)])

    #to collect the per-fold prediction scores
    train_score=[]
    test_score=[]
    r2_scores=[]

    #iterating over the folds
    for train_index,test_index in folds.split(X):
        #train and test data split
        train_X, train_y = X.iloc[train_index], y.iloc[train_index]
        test_X, test_y = X.iloc[test_index], y.iloc[test_index]

        #fit the whole pipeline on the training fold only
        model_pipeline.fit(train_X,train_y)

        #train and test prediction
        pred_test=model_pipeline.predict(test_X)
        pred_train=model_pipeline.predict(train_X)

        #appending the prediction score in terms of MEAN ABSOLUTE ERROR
        train_score.append(mean_absolute_error(train_y,pred_train))
        test_score.append(mean_absolute_error(test_y,pred_test))
        r2_scores.append(r2_score(test_y,pred_test))

    # DataFrame.append was removed in pandas 2.0, so the averaged scores are
    # added by concatenating a one-row frame.
    Scores=pd.concat(
        [Scores,
         pd.DataFrame([
             {'Model':'Linear_regression',
              'Polynomial_degree':deg,
              'Alpha_value':np.nan,
              'Train_score(MAE)': np.mean(train_score),
              'Test_score(MAE)': np.mean(test_score),
              'R2_score(test_data)': np.mean(r2_scores)}])],
        ignore_index=True)

In [None]:
# Regularisation strengths tried for the Lasso and Ridge regression models
alphas = [10, 0.1, 0.001, 1e-5, 1e-9]

### Lasso Regression

In [None]:
# Cross-validate Lasso regression for every alpha and polynomial degree 1 to 2
# and record the fold-averaged scores in `Scores`.
for alpha_val in alphas:
    for deg in [1,2]:
        #adding polynomial expression to data
        pf=PolynomialFeatures(degree=deg)

        #model creation (high max_iter to give the solver room to converge)
        lasso=Lasso(alpha=alpha_val,max_iter=100000,)

        #creation of preprocessor and pipeline
        preprocessor=Pipeline(steps=[('transform',transformer),
                                     ('polynomial',pf)])
        model_pipeline=Pipeline(steps=[('preprocess',preprocessor),
                                       ('linear_model',lasso)])

        #to collect the per-fold prediction scores
        train_score=[]
        test_score=[]
        r2_scores=[]
        #iterating over the folds
        for train_index,test_index in folds.split(X):
            #train and test data split
            train_X, train_y = X.iloc[train_index], y.iloc[train_index]
            test_X, test_y = X.iloc[test_index], y.iloc[test_index]

            #fit the whole pipeline on the training fold only
            model_pipeline.fit(train_X,train_y)

            #train and test prediction
            pred_test=model_pipeline.predict(test_X)
            pred_train=model_pipeline.predict(train_X)

            #appending the prediction score in terms of MEAN ABSOLUTE ERROR
            train_score.append(mean_absolute_error(train_y,pred_train))
            test_score.append(mean_absolute_error(test_y,pred_test))
            r2_scores.append(r2_score(test_y,pred_test))

        # DataFrame.append was removed in pandas 2.0, so the averaged scores are
        # added by concatenating a one-row frame. The mean R2 is stored as well:
        # without it the Lasso rows hold NaN in 'R2_score(test_data)' and cannot
        # be ranked against the other models.
        Scores=pd.concat(
            [Scores,
             pd.DataFrame([
                 {'Model':'Lasso',
                  'Polynomial_degree':deg,
                  'Alpha_value':alpha_val,
                  'Train_score(MAE)': np.mean(train_score),
                  'Test_score(MAE)': np.mean(test_score),
                  'R2_score(test_data)': np.mean(r2_scores)}])],
            ignore_index=True)

### Ridge Regression

In [None]:
# Cross-validate Ridge regression for every alpha and polynomial degree 1 to 3
# and record the fold-averaged scores in `Scores`.
for alpha_val in alphas:
    for deg in [1,2,3]:
        #adding polynomial expression to data
        pf=PolynomialFeatures(degree=deg)

        #model creation
        ridge=Ridge(alpha=alpha_val)

        #creation of preprocessor and pipeline
        preprocessor=Pipeline(steps=[('transform',transformer),
                                     ('polynomial',pf)])
        model_pipeline=Pipeline(steps=[('preprocess',preprocessor),
                                  ('model',ridge)])

        #to collect the per-fold prediction scores
        train_score = []
        test_score = []
        r2_scores = []

        #iterating over the folds
        for train_index,test_index in folds.split(X):
            #train and test data split
            train_X, train_y = X.iloc[train_index], y.iloc[train_index]
            test_X, test_y = X.iloc[test_index], y.iloc[test_index]

            #fit the whole pipeline on the training fold only
            model_pipeline.fit(train_X,train_y)

            #train and test prediction
            pred_test=model_pipeline.predict(test_X)
            pred_train=model_pipeline.predict(train_X)

            #appending the prediction score in terms of MEAN ABSOLUTE ERROR
            train_score.append(mean_absolute_error(train_y,pred_train))
            test_score.append(mean_absolute_error(test_y,pred_test))
            r2_scores.append(r2_score(test_y,pred_test))

        # DataFrame.append was removed in pandas 2.0, so the averaged scores are
        # added by concatenating a one-row frame.
        Scores=pd.concat(
            [Scores,
             pd.DataFrame([
                 {'Model':'Ridge',
                  'Polynomial_degree':deg,
                  'Alpha_value':alpha_val,
                  'Train_score(MAE)': np.mean(train_score),
                  'Test_score(MAE)': np.mean(test_score),
                  'R2_score(test_data)': np.mean(r2_scores)}])],
            ignore_index=True)

## Model Accuracy

In [None]:
# View the scores of the models, ranked to find the best one: highest mean test
# R2 first, ties broken by the lowest test MAE and then the lowest train MAE.
# sort_values returns a new frame (shown as the cell output); Scores itself is
# not reordered.
Scores.sort_values(by=['R2_score(test_data)', 'Test_score(MAE)', 'Train_score(MAE)'], ascending=[False,True,True])

Here we can see that the Ridge model with alpha 10 and polynomial degree 1 performs best.

In this way we can choose the model and parameters that fit our data best.