In [None]:
import pandas as pd
import matplotlib.pyplot as plt

### Data Collection

In [None]:
import requests

# Remote locations of the exercise's training and test CSV files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

def download_file(url, file_name, timeout=30):
    """Download `url` and write the response body to `file_name`.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the raw response bytes are written to
            (overwritten if it already exists).
        timeout: Seconds to wait for the server before giving up. Without
            it, `requests.get` may block indefinitely on a stalled
            connection and hang the notebook.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error
            status code.
        requests.exceptions.Timeout: If the server does not answer within
            `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail before anything is written to disk
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch the training and test CSVs into the working directory
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load both downloaded files, using the `id` column as the row index.
# (A comma is already the default separator for `read_csv`.)
df_train = pd.read_csv("module3_exercise_train.csv", index_col="id")
df_test = pd.read_csv("module3_exercise_test.csv", index_col="id")

### Data Exploration

In [None]:
# Preview the first rows of the training set
df_train.head()

In [None]:
# Column dtypes and non-null counts, to spot mis-typed columns and gaps
df_train.info()

`GarageCars` has dtype `object` (the counts are stored as text), so convert it to numbers.

In [None]:
# List the distinct raw values to see how GarageCars is encoded
df_train["GarageCars"].unique()

In [None]:
# Lookup from the spelled-out values of GarageCars to their numeric value
garage_cars_words = {
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
}

# Convert only while the column is still text. Re-running this cell on an
# already converted (numeric) column would look numbers up in a string-keyed
# dict and silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df_train["GarageCars"]):
    df_train["GarageCars"] = df_train["GarageCars"].map(
        garage_cars_words, na_action="ignore"  # leave missing values as NaN
    )

Let's check for missing values

In [None]:
# Show every row that has at least one missing value
df_train.loc[df_train.isna().any(axis="columns")]

~~Not too many of them, let's just drop them.~~ Let's replace them with -1 instead.

In [None]:
# Use -1 as the placeholder for every missing value
df_train = df_train.fillna(-1)


In [None]:
# Summary statistics of the cleaned training set
df_train.describe()

In [None]:
# Pairwise correlations between all columns of the training set
corr_matrix = df_train.corr()
labels = list(corr_matrix.columns)

# Draw on an explicit Axes: `plt.matshow` opens its own new figure by default,
# which would leave the figure created here empty and ignore its size.
fig, ax = plt.subplots(figsize=(10, 8))

# Correlations lie in [-1, 1]; pinning the colour limits keeps the diverging
# colormap centred on zero.
heatmap = ax.matshow(corr_matrix, cmap="coolwarm", vmin=-1, vmax=1)
fig.colorbar(heatmap, ax=ax)

ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha="left")
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
ax.set_title("Correlation matrix of the training columns")

plt.show()

### Data Preprocessing

Nothing comes to mind beyond the cleaning already done.

In [None]:
# Apply the same cleaning to the test set as was applied to the training set.

# Lookup from the spelled-out values of GarageCars to their numeric value
garage_cars_words = {
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
}

# Convert only while the column is still text. Re-running this cell on an
# already converted (numeric) column would look numbers up in a string-keyed
# dict and silently turn every value into NaN (and then -1 below).
if not pd.api.types.is_numeric_dtype(df_test["GarageCars"]):
    df_test["GarageCars"] = df_test["GarageCars"].map(
        garage_cars_words, na_action="ignore"  # leave missing values as NaN
    )

# Use -1 as the placeholder for every missing value, as for the training set
df_test = df_test.fillna(-1)

### Model Building and Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

~~I'm just going to fit a linear model on the two most correlated variables, `GarageArea` and `TotRmsAbvGrd`.~~ Not good enough (on the test dataset). Brute-forcing the issue by fitting a linear regression on every column.

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_eval, y_train, y_eval = train_test_split(
    df_train.drop(columns="SalePrice"),
    df_train["SalePrice"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on all feature columns of the training split
model = LinearRegression().fit(X_train, y_train)

# Mean absolute error of the predictions on the held-out rows
y_pred = model.predict(X_eval)
loss = mean_absolute_error(y_eval, y_pred)
print(loss)

Better.

### Generating Submission File

In [None]:
# Score the preprocessed test set with the fitted model.
# Note: despite its name, `y_test` holds the model's predictions, not
# ground-truth labels.
X_test = df_test
y_test =  model.predict(X_test)

In [None]:
# Two-column table: the row id and its predicted sale price
submission = pd.DataFrame({'id': X_test.index, 'SalePrice': y_test})

# `id` is already a regular column, so the frame's own index is not written.
# (A comma is already the default separator for `to_csv`.)
submission.to_csv('submission.csv', index=False)
submission.head()