In [None]:
import pandas as pd

## Data Collection

In [None]:
import requests

# Remote locations of the exercise CSVs (both live under the same directory)
_exercise_base_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise'
train_data_url = f'{_exercise_base_url}/module3_exercise_train.csv'
test_data_url = f'{_exercise_base_url}/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Local path the raw response bytes are written to (overwritten if it
        already exists).
    timeout : float or tuple, default 30
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both exercise CSVs into the working directory
for source_url, target_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, target_name)

### Import train & test datasets

In [None]:
# Load the downloaded training CSV (comma is read_csv's default separator)
df_train = pd.read_csv('module3_exercise_train.csv')
df_train

In [None]:
# Load the downloaded test CSV (comma is read_csv's default separator)
df_test = pd.read_csv("module3_exercise_test.csv")
df_test

### Quick check

In [None]:
def quick_check(df, n=5):
    """Print and display a quick overview of a DataFrame.

    Shows, in order: the shape, the first ``n`` rows, ``df.info()``, the
    column dtypes, descriptive statistics (default ``describe()``),
    descriptive statistics of object columns (only when the frame has any),
    the number of missing values per column and the number of unique values
    per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    n : int, default 5
        Number of leading rows to display.

    Returns
    -------
    None
        Everything is emitted through ``print`` / ``display``.
    """
    print("📊 Dimensions :", df.shape)
    print("\n🔎 Aperçu des", n, "premières lignes :")
    display(df.head(n))

    print("\nℹ️ Infos générales :")
    df.info()

    print("\n📐 Types de colonnes :")
    print(df.dtypes)

    print("\n📉 Statistiques descriptives (numériques) :")
    display(df.describe())

    print("\n📉 Statistiques descriptives (catégorielles / objets) :")
    # describe(include="O") raises ValueError when no object column exists
    # (e.g. once every categorical column has been encoded), so guard it.
    if df.select_dtypes(include="O").shape[1] > 0:
        display(df.describe(include="O"))
    else:
        print("(aucune colonne de type objet)")

    print("\n❓ Valeurs manquantes :")
    print(df.isna().sum())

    print("\n🔁 Nb de valeurs uniques par colonne :")
    print(df.nunique())


In [None]:
# Overview of the raw training frame (default: first 5 rows)
quick_check(df_train)


## Data Preprocessing

In [None]:
# Drop the identifier column and replace every missing value with the -1 sentinel.
# errors='ignore' keeps this cell re-runnable: df_train/df_test are overwritten
# here, so on a second run 'id' is already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns='id', errors='ignore').fillna(-1)
df_test = df_test.drop(columns='id', errors='ignore').fillna(-1)


In [None]:
# Frequency of each distinct GarageCars value in the training frame
df_train['GarageCars'].value_counts()

### Encoding GarageCars values

In [None]:
# Lookup from the spelled-out garage capacities to integers.
# The -1 key passes the missing-value sentinel (written by the earlier
# fillna(-1) step) through unchanged.
mapping = {
    "zero": 0,
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
    "five": 5,
    "six": 6,
    -1: -1,
}


def encode_garage_cars(df):
    """Return ``df`` with ``GarageCars`` replaced by a numeric ``GarageCars_num``.

    The input frame is not modified. If ``GarageCars`` is absent the frame is
    returned as-is, so re-running the cell after a first encoding is a no-op
    instead of a KeyError.

    Raises
    ------
    ValueError
        If ``GarageCars`` holds a non-missing value that ``mapping`` does not
        cover. Such values would otherwise silently become NaN and only fail
        later inside the estimator.
    """
    if "GarageCars" not in df.columns:
        return df

    encoded = df["GarageCars"].map(mapping)
    # Values that were present in the source but have no entry in `mapping`
    unmapped_rows = encoded.isna() & df["GarageCars"].notna()
    if unmapped_rows.any():
        unknown = df.loc[unmapped_rows, "GarageCars"].unique().tolist()
        raise ValueError(f"Unmapped GarageCars values: {unknown}")

    # Append the numeric column, then drop the original categorical one
    return df.assign(GarageCars_num=encoded).drop(columns="GarageCars")


# Map and encode both frames with the same lookup
df_train = encode_garage_cars(df_train)
df_test = encode_garage_cars(df_test)

df_train["GarageCars_num"].value_counts()

### Split train dataset into train, val

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns
y = df_train['SalePrice']
X = df_train.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Test features: an independent copy of the preprocessed test frame
X_test = df_test.copy()

## Model Building and Evaluation

### Imports

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

### Model

In [None]:
# Fit a linear regression on the training split
model = LinearRegression()
model.fit(X_train, y_train)

### Apply model to validation set

In [None]:
# Predict the target for the held-out validation rows
y_val_pred = model.predict(X_val)

### Results on val set 

In [None]:
# Mean absolute error between the validation targets and the predictions
mae_val = mean_absolute_error(y_val, y_val_pred)
mae_val

### mae < 36 000

In [None]:
# Predict on the preprocessed test features (X_test is the copy of df_test made in the split cell)
y_test_pred = model.predict(X_test)

## Generating Submission File

In [None]:
# Re-read the raw test CSV with 'id' as the index so the ids are available for the submission.
# NOTE(review): this rebinds X_test to the raw, un-preprocessed frame; re-running the
# test-prediction cell after this one would feed the model different columns than it was
# fitted on — consider a dedicated name for the id-indexed frame.
X_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')

In [None]:
# Pair each test id (the index of X_test) with its predicted sale price
submission = pd.DataFrame({'id': X_test.index, 'SalePrice': y_test_pred})

# Write without the row index; comma is to_csv's default separator
submission.to_csv('submission.csv', index=False)
submission.head()