In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Data Collection

In [None]:
import requests

# Remote locations of the exercise CSVs (training set first, then test set)
train_data_url, test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv',
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv',
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path to write to; opened in ``'wb'`` mode, so an
            existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both CSVs into the working directory, training set first
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load both splits with the 'id' column as the row index
# (comma is already the default separator for read_csv)
csv_options = {"index_col": 'id'}
df_train = pd.read_csv("module3_exercise_train.csv", **csv_options)
df_test = pd.read_csv("module3_exercise_test.csv", **csv_options)

In [None]:
# Preview only the first rows instead of rendering the whole frame;
# the full dimensions are printed in the following cell.
df_train.head()

In [None]:
# Report the frame's dimensions, then let pandas print its per-column summary.
# (A bare `df_train.shape` in the middle of a cell displays nothing, so it was dropped.)
print("shape=", df_train.shape)
print("\nSummary info:")
df_train.info()

In [None]:
# Show per-column cardinality first, then the descriptive statistics table
print("\nSummary Statistics:")
unique_counts = df_train.nunique()
print(f"Number different values per column\n{unique_counts}")
df_train.describe()

### Data Preprocessing

In [None]:
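# Missing values need to be filled, and number words must be converted to integers (one→1, zero→0, ...); this is an ordinal mapping rather than one-hot encoding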

from sklearn.preprocessing import LabelEncoder

def minimal_data_prep(data_df):
    """Return a minimally cleaned copy of ``data_df``.

    Two steps are applied:

    1. Every missing value is replaced with the sentinel ``-1``.
    2. If a ``GarageCars`` column exists, the number words ``"zero"`` to
       ``"four"`` are converted to the integers 0 to 4 and the column is
       given a numeric dtype.

    The conversion is done explicitly with ``pd.to_numeric`` rather than
    relying on ``Series.replace`` to downcast an object column, because that
    implicit downcasting is deprecated in recent pandas releases.

    Args:
        data_df: Input frame. It is not modified; ``fillna`` returns a new frame.

    Returns:
        A new DataFrame with missing values filled and ``GarageCars`` mapped.
        If ``GarageCars`` contains a value that is neither a known number word
        nor parseable as a number, that value is kept unchanged and the column
        stays non-numeric.
    """
    # Fill missing values with the -1 sentinel (returns a new frame)
    prepared = data_df.fillna(-1)

    # Map the GarageCars number words zero/one/two/... to 0/1/2/...
    word_to_count = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4}
    if "GarageCars" in prepared.columns:
        # Values missing from the mapping (e.g. the -1 sentinel) pass through unchanged
        garage = prepared["GarageCars"].map(lambda value: word_to_count.get(value, value))

        # Only adopt the numeric version when nothing was lost in the coercion;
        # otherwise keep the unrecognised labels visible.
        as_number = pd.to_numeric(garage, errors="coerce")
        if as_number.notna().all():
            garage = as_number

        prepared["GarageCars"] = garage

    return prepared


# Run the same preparation on both splits so their columns are treated alike
df_train, df_test = minimal_data_prep(df_train), minimal_data_prep(df_test)

# The prepared test frame is used as-is as the feature matrix for prediction
X_test = df_test

### Model Building and Evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Separate the regression target from the feature columns
y_train = df_train["SalePrice"]
X_train = df_train.drop(columns="SalePrice")

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_tr, y_tr)

# Evaluate on the validation set with mean absolute error
y_val_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_val_pred)
print("MAE on val set:", mae)

In [None]:
# The validation MAE was 32712.383390123217, which is a reasonable error for house-price
# prediction, so the setup is kept and the model is retrained on all of the training data.

# Refit on the full training set and predict the test set in one chain
# (fit returns the fitted estimator, so `model` is updated in place as before)
y_pred = model.fit(X_train, y_train).predict(X_test)

### Generating Submission File

In [None]:
# Pair each test-row id with its predicted sale price
submission_columns = {'id': X_test.index, 'SalePrice': y_pred}
submission = pd.DataFrame(submission_columns)

# Write the file without the positional index (comma is the default separator)
submission.to_csv('submission.csv', index=False)
submission.head()