<a href="https://colab.research.google.com/github/songhyunsik/kuiotbigdata/blob/main/pytorch/housePrice_by_Professor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
data = pd.read_csv("/content/train.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [17]:
test = pd.read_csv("/content/test.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [18]:
all_df = pd.concat([data, test], axis=0, sort=False).reset_index(drop=True)
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [19]:
all_df.fillna(0, inplace=True)
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2919 non-null   object 
 3   LotFrontage    2919 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          2919 non-null   object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2919 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [20]:
predictor = ['MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'GarageYrBlt']
columns = ['MSZoning', 'Street']
all_df_oneHot = pd.get_dummies(all_df, columns=columns, drop_first=True)
all_df_oneHot_X =  all_df_oneHot[predictor]
all_df_oneHot_y = all_df_oneHot['SalePrice']
all_df_oneHot_X = pd.DataFrame(StandardScaler().fit_transform(all_df_oneHot_X))

In [21]:
all_df_oneHot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 85 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                2919 non-null   int64  
 1   MSSubClass        2919 non-null   int64  
 2   LotFrontage       2919 non-null   float64
 3   LotArea           2919 non-null   int64  
 4   Alley             2919 non-null   object 
 5   LotShape          2919 non-null   object 
 6   LandContour       2919 non-null   object 
 7   Utilities         2919 non-null   object 
 8   LotConfig         2919 non-null   object 
 9   LandSlope         2919 non-null   object 
 10  Neighborhood      2919 non-null   object 
 11  Condition1        2919 non-null   object 
 12  Condition2        2919 non-null   object 
 13  BldgType          2919 non-null   object 
 14  HouseStyle        2919 non-null   object 
 15  OverallQual       2919 non-null   int64  
 16  OverallCond       2919 non-null   int64  


In [22]:
data_oneHot = all_df_oneHot_X[:len(data)]
test_oneHot = all_df_oneHot_X[len(data):]

In [23]:
X_train = data_oneHot
y_train = all_df_oneHot_y[:len(data)]

In [24]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)


In [25]:
y_pred = model.predict(test_oneHot)

In [26]:
y_pred

array([181418.21512639, 182741.70028079, 228165.8483748 , ...,
       225099.5874625 , 172390.19825827, 215303.41194489])

In [27]:
passengerId = test["Id"]
submission = pd.DataFrame({"Id": passengerId, "SalePrice": y_pred})
submission.to_csv("submission_linearReg.csv", index=False)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,181418.215126
1,1462,182741.700281
2,1463,228165.848375
3,1464,223900.234494
4,1465,191568.655252
