In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render every column of a DataFrame (None removes the column cap).
pd.options.display.max_columns = None


In [None]:
# Load the raw train and test CSVs; the 'Id' column is dropped from both frames.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
file = pd.read_csv('./train.csv')
dfOriginal = file.drop(columns='Id')

fileTest = pd.read_csv('./test.csv')
dfTest = fileTest.drop(columns='Id')

trainRowCount = dfOriginal.shape[0]
testRowCount = dfTest.shape[0]

# Inclusive first/last row labels of each split inside the combined frame below.
# The frames are concatenated with ignore_index=True, so labels run from 0 to
# trainRowCount + testRowCount - 1; these bounds are later used with label-based
# .loc slicing, which includes both endpoints.
INDEXES = {
  'train': {
    'start': 0,
    'end': trainRowCount - 1,
  },
  'test': {
    'start': trainRowCount,
    # Last label of the combined frame. Subtracting 1 a second time here
    # would leave the final test row out of an inclusive slice.
    'end': trainRowCount + testRowCount - 1,
  }
}

# Stack train and test rows so both go through identical preprocessing.
df = result_df = pd.concat([dfOriginal, dfTest], ignore_index=True)
df

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

In [None]:
# Preview the last five rows of the combined frame.
df.tail(n=5)

In [None]:
# Column names grouped by dtype.
# categoryTypeColumns: every column select_dtypes does not class as 'number'.
# numericalColumns: only int64/float64 columns (other numeric widths are not matched).
categoryTypeColumns = df.select_dtypes(exclude=['number']).columns.tolist()
numericalColumns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [None]:
# Replace missing values in each non-numeric column with that column's
# most frequent value (the first entry returned by Series.mode()).
for columnName in categoryTypeColumns:
  df[columnName] = df[columnName].fillna(df[columnName].mode()[0])

In [None]:
def convertCategoryColumns(dataFrame, categoryTypeColumns):
  """One-hot encode the given columns and drop the originals.

  Args:
    dataFrame: pandas DataFrame holding the columns to encode.
    categoryTypeColumns: list of column names in ``dataFrame`` that should be
      replaced by indicator columns.

  Returns:
    A new DataFrame containing the columns not listed in
    ``categoryTypeColumns`` (original order kept), followed by the indicator
    columns produced by ``pd.get_dummies`` for each encoded column, named
    ``<column>_<value>``. The frame passed in is not modified.
  """
  encoded = dataFrame
  for columnName in categoryTypeColumns:
    # Indicator columns are appended to the right; join aligns them on the index.
    indicators = pd.get_dummies(encoded[columnName], prefix=columnName)
    encoded = encoded.join(indicators)

  # Keep everything except the source columns that were just encoded.
  retained = [name for name in encoded.columns.values.tolist() if name not in categoryTypeColumns]
  return encoded[retained]

In [None]:
# One-hot encode every non-numeric column of the combined frame.
dfConverted = convertCategoryColumns(df, categoryTypeColumns)

In [None]:
# Preview the first five rows after one-hot encoding.
dfConverted.head(n=5)

In [None]:
# Preview the last five rows after one-hot encoding.
dfConverted.tail(n=5)

In [None]:
# Impute remaining gaps: NaNs in each numeric column are replaced, in place,
# by that column's mean (computed over all rows of dfConverted).
columnMeans = dfConverted.mean(numeric_only=True)
dfConverted.fillna(value=columnMeans, inplace=True)

In [None]:
# Per-column report: number of missing values alongside each column's dtype.
missingData = dfConverted.isnull().sum()
dataTypeColumns = dfConverted.dtypes

missingDataDf = pd.DataFrame({'Missing Value': missingData, 'Data Type': dataTypeColumns})

# Lift the row cap only while printing; option_context restores the previous
# setting on exit, even if printing raises.
with pd.option_context('display.max_rows', None):
  print(missingDataDf)

# Expect every 'Missing Value' entry to be 0 at this point.

In [None]:
# Separate the combined, preprocessed frame back into train and test rows.
# .loc slices by label and includes both endpoints.
dfTrain = dfConverted.loc[:INDEXES['train']['end']]
# Test rows are everything after the training rows, so slice open-ended from
# the first test label instead of depending on a precomputed end label; this
# guarantees the final row is always included.
dfTest = dfConverted.loc[INDEXES['test']['start']:]

# Features and target for training.
X_train = dfTrain.drop(columns='SalePrice')
y_train = dfTrain['SalePrice']

# Features for prediction (the target column is removed here as well).
X_test = dfTest.drop(columns='SalePrice')

dfTrain

In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# 1000 trees with a fixed seed so repeated runs build the same forest.
modelRFRegressor = RandomForestRegressor(random_state=42, n_estimators=1000)
modelRFRegressor.fit(X=X_train, y=y_train)


In [None]:
# Score the forest on the same rows it was fitted on (a training-set score,
# not a held-out one), then predict sale prices for the test rows.
scoreRFRegressor = modelRFRegressor.score(X_train, y_train)
predictedSalePricesRFR = modelRFRegressor.predict(X_test)

print(f'Score: {scoreRFRegressor}')
print(predictedSalePricesRFR)

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model with default settings, fitted on the training rows.
modelLinearRegression = LinearRegression()
modelLinearRegression.fit(X=X_train, y=y_train)

In [None]:
# Score the linear model on the same rows it was fitted on (a training-set
# score, not a held-out one), then predict sale prices for the test rows.
scoreModelLinearRegression = modelLinearRegression.score(X_train, y_train)
predictedSalePricesLR = modelLinearRegression.predict(X_test)

print(f'Score: {scoreModelLinearRegression}')
print(predictedSalePricesLR)

In [None]:
# Side-by-side table of both models' test-set predictions.
predictionsByModel = {
  'RF': predictedSalePricesRFR,
  'LR': predictedSalePricesLR,
}
dataPredictedSales = pd.DataFrame(predictionsByModel)
dataPredictedSales

In [None]:
# Compare the two models row by row: 'Diff' is the Random Forest prediction
# minus the Linear Regression prediction.
dfCompare = pd.DataFrame({
  'RFR': predictedSalePricesRFR,
  'LR': predictedSalePricesLR,
}).assign(Diff=lambda frame: frame['RFR'] - frame['LR'])

# Full comparison table first, then summary statistics of each column.
print(dfCompare)
print(dfCompare.describe())