In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data Collection

In [None]:
import requests

# Both exercise CSVs live under the same course directory
_DATA_BASE_URL = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise'
train_data_url = f'{_DATA_BASE_URL}/module3_exercise_train.csv'
test_data_url = f'{_DATA_BASE_URL}/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the bytes are written to. The file is opened
            in ``'wb'`` mode, so an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would hang the whole notebook run.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch the train and test CSVs into the working directory
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load the training data, using the 'id' column as the row index
# (comma is already the default separator for read_csv).
df_train = pd.read_csv("module3_exercise_train.csv", index_col="id")

### Data Preprocessing

In [None]:
def data_prep(df):
    """Clean the garage columns and return the result as a new DataFrame.

    Two fixes are applied:

    * ``GarageCars``: counts spelled out as words ('zero' .. 'four') are
      turned into integers and the column is made numeric.
    * ``GarageYrBlt``: the sentinel ``-1`` is replaced by the most frequent
      non-sentinel year.

    Args:
        df: DataFrame containing ``GarageCars`` and ``GarageYrBlt`` columns.

    Returns:
        A cleaned copy of ``df``; the frame passed in is not modified.

    Raises:
        ValueError: If ``GarageCars`` holds a string that is neither one of
            the known number words nor parseable as a number.
    """
    word_to_count = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4}

    # Work on a copy so re-running a cell never sees a half-modified input.
    df = df.copy()

    # Decode value by value and convert explicitly: relying on `replace` to
    # downcast an object column to numbers is deprecated pandas behaviour.
    decoded = [word_to_count.get(value, value) for value in df['GarageCars']]
    df['GarageCars'] = pd.to_numeric(pd.Series(decoded, index=df.index))

    # mode() is empty when every year is the sentinel (or the frame is empty);
    # in that case there is nothing sensible to substitute, so leave as is.
    year_modes = df.loc[df['GarageYrBlt'] != -1, 'GarageYrBlt'].mode()
    if not year_modes.empty:
        df['GarageYrBlt'] = df['GarageYrBlt'].replace(-1, year_modes.iloc[0])
    return df

In [None]:
# Clean the training frame, then fill the columns that still contain gaps.
# To inspect the affected rows: df_train[df_train.isnull().any(axis=1)]
df_train = data_prep(df_train).assign(
    # GarageArea is around 408 for the affected rows, which typically
    # corresponds to a two-car garage.
    GarageCars=lambda frame: frame['GarageCars'].fillna(2),
    YrSold=lambda frame: frame['YrSold'].fillna(frame['YrSold'].median()),
    TotRmsAbvGrd=lambda frame: frame['TotRmsAbvGrd'].fillna(frame['TotRmsAbvGrd'].median()),
)
remaining_train_nulls = df_train.isnull().sum().sum()
print("The number of missing values of train dataset:", remaining_train_nulls)

In [None]:
# Load the test dataset and apply the same cleaning as for the training data.
df_test = pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')
df_test = data_prep(df_test)
# To inspect the affected rows: df_test[df_test.isnull().any(axis=1)]
# Fill the remaining gaps with each column's median.
df_test['EnclosedPorch'] = df_test['EnclosedPorch'].fillna(df_test['EnclosedPorch'].median())
df_test['Fireplaces'] = df_test['Fireplaces'].fillna(df_test['Fireplaces'].median())
# Count the nulls of df_test itself; this line previously counted df_train,
# so leftover gaps in the test data would have gone unnoticed.
print("The number of missing values of test dataset:", df_test.isnull().sum().sum())

### Model Building and Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Predictor columns used by the model; SalePrice is the regression target.
feature_columns = [
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]
x = df_train[feature_columns]
y = df_train['SalePrice']

In [None]:
# Hold out 20% of the rows as a test split.
X_train_val, X_test, y_train_val, y_test = train_test_split(x, y, test_size=0.2, random_state=66)

# Split the remaining 80% into train and validation; 25% of 80% is 20% of
# the full data, giving a 60/20/20 train/val/test partition.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=66)

# Seed the forest as well: both splits above are seeded, but an unseeded
# random forest would still give different metrics on every run.
model = RandomForestRegressor(random_state=66)
model.fit(X_train, y_train)

In [None]:
def pred_eval(model, X_data, y_target):
    """Predict on ``X_data``, print the mean absolute error, return the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_data: Feature matrix to predict on.
        y_target: True target values matching ``X_data``.

    Returns:
        The predictions produced by ``model.predict(X_data)``.
    """
    predictions = model.predict(X_data)
    print('Mean Absolute Error:\n', mean_absolute_error(y_target, predictions))
    return predictions

In [None]:
# Score the model on the validation split.
# NOTE: despite its name, y_pred_train holds predictions for X_val.
y_pred_train = pred_eval(model, X_val, y_val)

In [None]:
# Score the model on the held-out test split carved out of df_train
# (X_test / y_test), not on the separate df_test submission data.
y_pred_test = pred_eval(model, X_test, y_test)

### Generating Submission File

In [None]:
# The model was judged to perform well on the validation and held-out test
# splits, so refit it on all rows of df_train before predicting the
# submission data.
model.fit(x, y)

In [None]:
# "df_test" is the target dataset. Select exactly the columns the model was
# fitted on (x.columns), in the same order, so that extra or reordered
# columns in the test file cannot break or misalign the prediction.
test_pred = model.predict(df_test[x.columns])

In [None]:
# One row per test id with its predicted sale price
submission = pd.DataFrame({'id': df_test.index}).assign(SalePrice=test_pred)

# Write without the positional index; comma is the default separator
submission.to_csv('submission.csv', index=False)
submission.head()