To avoid running into categorical values at prediction time that were never seen when the encoding was built, I tag each dataset with a 'train' marker column (1 for the training rows, 0 for the test rows), concatenate the two into a single dataframe, encode all of the categorical variables together, and then split the result back into train and test based on the 'train' marker.


In [24]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# from statsmodels.stats.diagnostic import het_breuschpagan
# from statsmodels.stats.diagnostic import het_white
# from statsmodels.formula.api import ols


# Raw train/test tables, read directly from the MacPaw GitHub repository.
train_df = pd.read_csv('https://raw.githubusercontent.com/MacPaw/msi2021-data-science/main/data/train.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/MacPaw/msi2021-data-science/main/data/test.csv')


# Tagged copies (the raw frames stay untouched): 'train' is 1 for training
# rows and 0 for test rows, so the combined frame can be split again later.
train_df_1 = train_df.assign(train=1)
test_df_1 = test_df.assign(train=0)
all_df = pd.concat([train_df_1, test_df_1])



# Overall plan: create a concatenated dataframe, encode all categorical columns,
# split the dataframe back into train and test data,
# build a model on the train data, and then predict on the test data.
def convert_all_categorical(inp_df):
    """
    One-hot encode the categorical columns of ``inp_df``.

    Every missing value in the frame is replaced with 0 first, then each
    column named below is expanded with ``pd.get_dummies``. Because
    ``drop_first=True`` is used, a column holding n distinct categories
    produces n - 1 indicator columns, which avoids multicollinearity among
    the indicators.

    The input frame is not modified; a new dataframe is returned.
    """
    # The order of this list determines the order of the generated columns.
    to_encode = [
        'MSSubClass', 'MSZoning', 'Street', 'Alley',
        'LotShape', 'LandContour', 'Utilities', 'LotConfig',
        'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
        'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
        'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition',
    ]

    without_gaps = inp_df.fillna(0)
    encoded = pd.get_dummies(
        without_gaps,
        columns=to_encode,
        prefix=to_encode,
        drop_first=True,
    )
    return encoded



# I tried to run tests for heteroskedasticity, but the White test made the
# session run out of memory, while the Breusch-Pagan test, although more
# time-efficient, raised an error I could not resolve.


def relevant_variables(inp_df):
    """
    Select the predictors that are statistically significant for the price.

    An OLS model of ``log(SalePrice)`` on every column of ``inp_df`` except
    ``SalePrice`` and ``Id`` (plus an intercept) is fitted, and only the
    columns whose coefficient p-value is below 0.05 are kept. Columns whose
    p-value is NaN are dropped as well, since NaN never compares below 0.05.

    Parameters:
        inp_df: dataframe holding ``SalePrice``, ``Id`` and numeric (already
            encoded) predictor columns.

    Returns:
        A new dataframe with the significant predictor columns of ``inp_df``,
        in their original order and with their original dtypes.

    Raises:
        KeyError: if ``SalePrice`` or ``Id`` is missing from ``inp_df``.
    """
    # Predictors are everything except the target and the row identifier.
    X = inp_df.drop(columns=['SalePrice', 'Id'])
    log_price = np.log(inp_df['SalePrice'].to_numpy(dtype=float))

    # Cast explicitly to float: a frame that mixes boolean dummy columns with
    # numeric ones would otherwise convert to an object array, which OLS
    # cannot work with.
    predictors = X.to_numpy(dtype=float)

    # has_constant='add' guarantees the intercept is always prepended as
    # column 0. With the default ('skip') no intercept is added when a
    # non-zero constant column already exists, which would silently shift
    # the p-value-to-column mapping below by one.
    design = sm.add_constant(predictors, has_constant='add')
    est = sm.OLS(log_price, design).fit()

    # est.pvalues[0] belongs to the intercept; the rest line up one-to-one
    # with the columns of X.
    significant = est.pvalues[1:] < 0.05
    return X.loc[:, significant]


def main():
    """
    Fit the price model on the training rows and predict the test rows.

    Steps:
      1. Encode the combined frame ``all_df`` and split it back into train
         and test rows using the ``train`` marker column.
      2. Keep only the predictors returned by ``relevant_variables``.
      3. Fit ``LinearRegression`` on the raw ``SalePrice`` and compute its
         in-sample RMSE; fit an OLS model on ``log(SalePrice)`` to obtain
         R-squared and adjusted R-squared.
      4. Predict the test rows and write them to ``prediction.csv``.

    Returns:
        A tuple ``(summary, predictions)``: ``summary`` is a string with
        R-squared, adjusted R-squared and RMSE; ``predictions`` is a
        dataframe with the columns ``Id`` and ``SalePrice``.
    """
    converted = convert_all_categorical(all_df)

    # Dropping without inplace=True returns fresh frames instead of mutating
    # a filtered slice, which is what raised pandas' SettingWithCopyWarning.
    train = converted[converted['train'] == 1].drop(columns='train')
    test = converted[converted['train'] == 0].drop(columns='train')

    train_X = relevant_variables(train)
    train_Y = train['SalePrice']

    # Train and test were encoded together, so every column selected on the
    # train rows is guaranteed to exist in the test rows as well.
    test_X = test[list(train_X.columns)]

    model = LinearRegression().fit(train_X, train_Y)

    # Cast to float so that boolean dummy columns (emitted by newer pandas
    # versions) do not turn the design matrix into an object array.
    ols_design = sm.add_constant(train_X.astype(float))
    ols_fit = sm.OLS(np.log(train_Y), ols_design).fit()
    r_squared, adj_r_squared = ols_fit.rsquared, ols_fit.rsquared_adj

    mse = mean_squared_error(train_Y, model.predict(train_X))
    rmse = mse**(1/2)

    predictions = pd.DataFrame({
        'Id': test_df['Id'],
        'SalePrice': model.predict(test_X),
    })
    # NOTE(review): this also writes the dataframe index as an unnamed first
    # column; pass index=False if the consumer expects only Id,SalePrice.
    predictions.to_csv('prediction.csv')

    # In the recorded run the adjusted R-squared was about 0.90 and the RMSE
    # about 25000.
    # NOTE(review): the two figures describe different models - R-squared
    # comes from the OLS fit on log(SalePrice), while the RMSE (and the
    # predictions) come from LinearRegression fitted on the raw SalePrice.
    return f"R-squared = {r_squared}, Adjusted R-squared = {adj_r_squared}, RMSE = {rmse}", predictions


# Run the whole pipeline and show the metrics summary with the predictions.
pipeline_output = main()
print(pipeline_output)











A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


('R-squared = 0.904844106407562, Adjusted R-squared = 0.900833965177595, RMSE = 25269.899769145035',         Id      SalePrice
0     1461  119448.567989
1     1462  161365.060296
2     1463  193900.117320
3     1464  193410.546670
4     1465  231515.137932
...    ...            ...
1454  2915   77986.961149
1455  2916   93990.168819
1456  2917  194192.341003
1457  2918  119261.011146
1458  2919  234014.863018

[1459 rows x 2 columns])
