## Goals: 

Melakukan Prediksi 'Sale Price' dari data yang diberikan 

In [None]:
## Uncomment the line below to install the required packages
# %pip install numpy pandas seaborn matplotlib scikit-learn

#### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Configure pandas to display every column when rendering a DataFrame
pd.set_option('display.max_columns', None)

#### Load File 

load file train & test

In [None]:
# Read the training CSV and discard its 'Id' column.
file = pd.read_csv('./train.csv')
dfOriginal = file.drop(columns=['Id'])

# Read the test CSV and discard its 'Id' column in the same way.
fileTest = pd.read_csv('./test.csv')
dfTest = fileTest.drop(columns=['Id'])

#### Gabungkan file dan berikan index

In [None]:
# Stack the train rows on top of the test rows with a fresh 0..n-1 index so
# that both sets go through identical preprocessing.
df = result_df = pd.concat([dfOriginal, dfTest], ignore_index=True)

# Inclusive label bounds of each subset inside the combined frame
# (inclusive because they are consumed by `.loc` slicing).
trainRowCount = dfOriginal.shape[0]
testRowCount = dfTest.shape[0]

INDEXES = {
  'train': {
    'start': 0,
    'end': trainRowCount - 1,
  },
  'test': {
    'start': trainRowCount,
    # The last test row sits at (train rows + test rows - 1); subtracting 1
    # twice would leave the final test row out of an inclusive slice.
    'end': trainRowCount + testRowCount - 1,
  }
}

df

#### Mengecek head & tail
memastikan bahwa file yang kita import sudah benar

In [None]:
# Show the first rows of the combined frame (the train rows come first in the concat)
df.head()

In [None]:
# Show the last rows of the combined frame (the test rows come last in the concat)
df.tail()

#### mengambil informasi kolom yang bersifat Kategorikal & Numerikal

In [None]:
# Names of the non-numeric columns (treated as categorical further down).
categoryTypeColumns = df.select_dtypes(exclude=['number']).columns.tolist()
# Names of the int64 / float64 columns.
numericalColumns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

#### Memproses data yang kosong 

Pada setiap kolom kategorikal, nilai `null` diisi dengan nilai yang paling sering muncul (modus) pada kolom tersebut menggunakan method `fillna`


In [None]:
# Replace nulls in every categorical column with that column's most frequent value
for columnName in categoryTypeColumns:
  mostFrequentValue = df[columnName].mode()[0]
  df[columnName] = df[columnName].fillna(mostFrequentValue)

#### Deklarasi functions yang diperlukan 

In [None]:
def processCategoricalColumnsWithOneHotEncoding(dataFrame, categoryTypeColumns):
  """Replace categorical columns with one-hot indicator columns.

  Each column named in `categoryTypeColumns` is expanded with
  `pd.get_dummies` into indicator columns named `<column>_<value>`, and the
  original column is removed. The remaining columns keep their order and come
  first; the indicator columns follow in the order of `categoryTypeColumns`.

  Args:
    dataFrame: input DataFrame; it is not modified.
    categoryTypeColumns: iterable of column names to encode.

  Returns:
    A new DataFrame with the same rows and index as `dataFrame`.

  Raises:
    KeyError: if a requested column is missing from `dataFrame`.
  """
  # De-duplicate while keeping order so a repeated name is encoded only once.
  columnsToEncode = list(dict.fromkeys(categoryTypeColumns))

  encodedFrames = [
    pd.get_dummies(dataFrame[columnName], prefix=columnName)
    for columnName in columnsToEncode
  ]
  remainingColumns = dataFrame.drop(columns=columnsToEncode)

  # One concat over identical indexes instead of repeated joins: a join on a
  # non-unique index multiplies the matching rows, and every join re-copies
  # the whole frame.
  return pd.concat([remainingColumns, *encodedFrames], axis=1)

#### Konversi kolom kategorikal dengan one-hot-encoding
Mengubah kolom yang bersifat kategorikal menjadi nilai numerik yang bisa dibaca oleh komputer dengan metode one-hot-encoding


In [None]:
# One-hot encode every categorical column of the combined frame
dfConverted = processCategoricalColumnsWithOneHotEncoding(df, categoryTypeColumns)

#### Pengecekan Head & Tail file
memastikan bahwa dataframe berhasil di-convert

In [None]:
# Show the first rows of the one-hot encoded frame
dfConverted.head()

In [None]:
# Show the last rows of the one-hot encoded frame
dfConverted.tail()

#### Memproses data yang kosong 

Pada setiap kolom numerikal, nilai `null` diisi dengan nilai rata-rata kolom tersebut menggunakan method `fillna`


In [None]:
# Fill the remaining nulls in each numeric column with that column's mean.
# The result is rebound instead of using `inplace=True`: mutating a frame that
# was produced by column selection in place can make pandas emit
# SettingWithCopyWarning.
# NOTE(review): the means are taken over train and test rows together, and
# 'SalePrice' is filled as well wherever it is null — confirm this is intended.
columnMeans = dfConverted.mean(numeric_only=True)
dfConverted = dfConverted.fillna(columnMeans)

#### Memastikan bahwa tidak ada nilai `null` pada dataframe

In [None]:
# Per-column null count next to the column dtype
missingData = dfConverted.isnull().sum()
dataTypeColumns = dfConverted.dtypes

missingDataDf = pd.DataFrame({'Missing Value': missingData, 'Data Type': dataTypeColumns})

# Lift the row limit only while printing; option_context restores the previous
# setting afterwards, even if printing fails.
with pd.option_context('display.max_rows', None):
  print(missingDataDf)

# Expected: every 'Missing Value' entry is 0

#### Pisahkan dataframe 
Train & Test yang telah digabungkan dipisahkan kembali berdasarkan nomor index yang sudah didefinisikan di awal

In [None]:
# Separate the combined frame back into its train and test parts.
# `.loc` slices are inclusive on both ends; `.copy()` makes each part an
# independent frame.
dfTrain = dfConverted.loc[:INDEXES['train']['end']].copy()
# Open-ended slice: the test rows are everything from the first test label to
# the end of the frame, so the final row cannot be cut off by a wrong end bound.
dfTest = dfConverted.loc[INDEXES['test']['start']:].copy()

# Features and target used for training
X_train = dfTrain.drop(columns=['SalePrice'])
y_train = dfTrain['SalePrice']

# Features of the test rows
X_test = dfTest.drop(columns=['SalePrice'])

dfTrain

#### Modeling dengan Random Forest Regressor

In [16]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# 1000 trees, with a fixed random_state; fitted on the training features and target
modelRFRegressor = RandomForestRegressor(n_estimators=1000, random_state=42)
modelRFRegressor.fit(X_train, y_train)


#### Mengecek score model dengan method `.score()`

In [None]:
# NOTE: this scores the model on the same (X_train, y_train) it was fitted on,
# so it is a training-set score, not an estimate on unseen data.
scoreRFRegressor = modelRFRegressor.score(X_train,y_train)
print(f'Score: {scoreRFRegressor}')

# Predict sale prices for the test features
predictedSalePricesRFR = modelRFRegressor.predict(X_test)
print(predictedSalePricesRFR)

bisa disimpulkan score yang dihasilkan dengan **Random Forest Regressor** 

n_estimators = `1000`

random_state = `42`

mendapatkan score (R²) `0.98` pada data train, yaitu data yang sama dengan yang dipakai untuk melatih model


---

#### Modeling dengan Linear Regression

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Linear regression with default settings, fitted on the training features and target
modelLinearRegression = LinearRegression()
modelLinearRegression.fit(X_train, y_train)

#### Mengecek score model dengan method `.score()`

In [None]:
# NOTE: this scores the model on the same (X_train, y_train) it was fitted on,
# so it is a training-set score, not an estimate on unseen data.
scoreModelLinearRegression = modelLinearRegression.score(X_train, y_train)
print(f'Score: {scoreModelLinearRegression}')

# Predict sale prices for the test features
predictedSalePricesLR = modelLinearRegression.predict(X_test)
print(predictedSalePricesLR)

bisa disimpulkan score yang dihasilkan dengan **Linear Regression** 

mendapatkan score (R²) `0.93` pada data train

lebih rendah dibandingkan dengan menggunakan **RFR**


---

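Dilihat dari score pada data train, model yang lebih baik digunakan yaitu Random Forest Regressor. Karena score tersebut dihitung pada data yang sama dengan data pelatihan, perbandingan ini diperiksa kembali di bawah dengan data validasi hasil split.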

---


In [None]:
# Check the R2 score on held-out data: split the labelled train frame into a
# fitting part (80%) and a validation part (20%).
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the validation metrics are reproducible
# across runs.
# NOTE: this rebinds X_train / X_test / y_train that earlier cells defined.
X_train, X_test, y_train, y_test = train_test_split(
  dfTrain.drop(columns=['SalePrice']),
  dfTrain['SalePrice'],
  test_size=0.2,
  random_state=42,
)

In [None]:
# Refit a Random Forest on the fitting split and predict the validation split
modelRFRegressor = RandomForestRegressor(n_estimators=1000, random_state=42)
modelRFRegressor.fit(X_train, y_train)
y_pred = modelRFRegressor.predict(X_test)

# Actual vs. predicted sale prices for the Random Forest Regressor model
fig, ax = plt.subplots(figsize=(12, 4))
ax.scatter(y_test, y_pred, color='g', alpha=0.5)
# Dashed diagonal marks where prediction equals the actual price
priceRange = [y_test.min(), y_test.max()]
ax.plot(priceRange, priceRange, 'k--', lw=2)
ax.set_xlabel('Actual Sale Prices')
ax.set_ylabel('Prediction')
ax.set_title('Actual vs. Prediction')
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Validation-split metrics for whichever model produced `y_pred` last
# (the Random Forest when the notebook is run top to bottom).
print("mean squared error: ",mean_squared_error(y_test, y_pred))
print("mean absolute error test: ",mean_absolute_error(y_test, y_pred))
print("r2 score test: ",r2_score(y_test, y_pred))


In [None]:
# Refit a Linear Regression on the fitting split and predict the validation split
modelLinearRegression = LinearRegression()
modelLinearRegression.fit(X_train, y_train)
y_pred = modelLinearRegression.predict(X_test)

# Actual vs. predicted sale prices for the Linear Regression model
fig, ax = plt.subplots(figsize=(12, 4))
ax.scatter(y_test, y_pred, color='g', alpha=0.5)
# Dashed diagonal marks where prediction equals the actual price
priceRange = [y_test.min(), y_test.max()]
ax.plot(priceRange, priceRange, 'k--', lw=2)
ax.set_xlabel('Actual Sale Prices')
ax.set_ylabel('Prediction')
ax.set_title('Actual vs. Prediction')
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Validation-split metrics for whichever model produced `y_pred` last
# (the Linear Regression when the notebook is run top to bottom).
print("mean squared error: ",mean_squared_error(y_test, y_pred))
print("mean absolute error test: ",mean_absolute_error(y_test, y_pred))
print("r2 score test: ",r2_score(y_test, y_pred))
