In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# libraries for training data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
training_data = pd.read_csv("/Users/saishdesai/Documents/Kaggle-Comeptitions/house-prices-advanced-regression-techniques/train.csv")
Y = training_data['SalePrice']
X = training_data.drop(columns = ['SalePrice'])
test_data = pd.read_csv("/Users/saishdesai/Documents/Kaggle-Comeptitions/house-prices-advanced-regression-techniques/test.csv")
X = pd.concat([X,test_data],ignore_index=True,axis=0)


In [3]:
# Dataframe with all numerical data types
data_num = X.select_dtypes(np.number)
print(data_num.columns)
print(len(data_num.columns))

# Dataframe with all object data types
data_obj = X.select_dtypes('object')
print(data_obj.columns)
print(len(data_obj.columns))

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')
37
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC'

In [4]:
# Converting object variables to object variables
bsmt_dict = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,np.nan:0}
ext_dict = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1}
d1,d2,d3,d4,d5,d6,d7,d8,d9,d10 = [],[],[],[],[],[],[],[],[],[]

for i in range(len(data_obj['ExterQual'])):
    d1.append(ext_dict[data_obj['ExterQual'][i]])
    
for i in range(len(data_obj['ExterCond'])):
    d2.append(ext_dict[data_obj['ExterCond'][i]])
    
for i in range(len(data_obj['HeatingQC'])):
    d3.append(ext_dict[data_obj['HeatingQC'][i]])

for i in range(len(data_obj['KitchenQual'])):
    d4.append(bsmt_dict[data_obj['KitchenQual'][i]])
    
for i in range(len(data_obj['BsmtQual'])):
    d5.append(bsmt_dict[data_obj['BsmtQual'][i]])

for i in range(len(data_obj['BsmtCond'])):
    d6.append(bsmt_dict[data_obj['BsmtCond'][i]])
    
for i in range(len(data_obj['FireplaceQu'])):
    d7.append(bsmt_dict[data_obj['FireplaceQu'][i]])
    
for i in range(len(data_obj['GarageQual'])):
    d8.append(bsmt_dict[data_obj['GarageQual'][i]])
    
for i in range(len(data_obj['GarageCond'])):
    d9.append(bsmt_dict[data_obj['GarageCond'][i]])
    
for i in range(len(data_obj['PoolQC'])):
    d10.append(bsmt_dict[data_obj['PoolQC'][i]])

# Adding newly converted rows to numerical dataframe
data_num['ExterQual'] = d1
data_num['ExterCond'] = d2
data_num['HeatingQC'] = d3
data_num['KitchenQual'] = d4
data_num['BsmtQual'] = d5
data_num['BsmtCond'] = d6
data_num['FireplaceQu'] = d7
data_num['GarageQual'] = d8
data_num['GarageCond'] = d9
data_num['PoolQC'] = d10

data_obj.drop(columns = ['ExterQual','ExterCond','HeatingQC','KitchenQual','BsmtQual','BsmtCond','FireplaceQu','GarageQual',
                         'GarageCond','PoolQC'], inplace = True)




In [5]:
# Dataframe with all numerical data types
print(data_num.columns)
print(len(data_num.columns))
# Dataframe with all object data types
print(data_obj.columns)
print(len(data_obj.columns))

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'ExterQual', 'ExterCond', 'HeatingQC',
       'KitchenQual', 'BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual',
       'GarageCond', 'PoolQC'],
      dtype='object')
47
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'Fo

In [6]:
# Dropping columns with missing values
for col in data_num.columns:
    if np.where(pd.isnull(data_num[col]))[0].size > 0:
        data_num.drop(col,inplace = True, axis =1)

In [7]:
validation_data  = data_num[data_num['Id']>1460]
validation_data.reset_index()
model_data = data_num[data_num['Id']<1461]

In [8]:
lm = LinearRegression()
scores = cross_val_score(lm,model_data, Y,scoring='r2', cv=5)
print(scores)
lm.fit(model_data,Y)
yhat = lm.predict(validation_data)

[0.85430614 0.80698323 0.81827171 0.81918398 0.67953342]


In [28]:
yhat = pd.Series(yhat) 
index = list(validation_data["Id"])
data_tuples = list(zip(index,yhat))

In [33]:
submission_dataset = pd.DataFrame(data_tuples, columns=['Id','SalePrice'])
submission_dataset.set_index(['Id'], inplace =True)

In [34]:
submission_dataset.to_csv("submission.csv")