In [None]:
import pandas as pd

### Data Collection

In [None]:
import requests

# Remote locations of the exercise datasets: (training split, test split)
train_data_url, test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv',
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv',
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download `url` and save the response body to `file_name`.

    Args:
        url: Address of the resource to fetch with an HTTP GET.
        file_name: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up; forwarded to
            `requests.get`. Without a timeout a stalled server would block
            the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail before the file is created on a bad response
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the current working directory
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load the training set (comma-separated, the pandas default) and use the
# `id` column as the row index
df_train = pd.read_csv(
    "module3_exercise_train.csv",
    index_col="id",
)
print(df_train.shape)
df_train.head()

In [None]:
# Print a header, then the DataFrame's own summary via DataFrame.info()
print("\nSummary info:")
df_train.info()

### Data Preprocessing

In [None]:
# Cardinality check: how many distinct values each column holds
print("\nSummary Statistics:")
distinct_counts = df_train.nunique()
print(f"Number different values per column\n{distinct_counts}")

In [None]:
# Show every row that has at least one missing value
has_missing = df_train.isna().any(axis="columns")
df_train[has_missing]

In [None]:
# List every copy of fully duplicated rows, sorted on all columns so that
# identical rows end up next to each other
print("\nDuplicate rows:")
duplicate_mask = df_train.duplicated(keep=False)
all_columns = list(df_train.columns)
df_train.loc[duplicate_mask].sort_values(by=all_columns)

In [None]:
# Drop the GarageCars column because its numbers are written out as words,
# then replace every remaining missing value with the sentinel -1.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError on the
# already-dropped column.
df_train = df_train.drop(columns='GarageCars', errors='ignore').fillna(-1)

### Model Building and Evaluation

In [None]:
# Import necessary libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
# Columns used as model inputs; the target is SalePrice
feature_columns = [
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold',
]
y = df_train["SalePrice"]
X = df_train[feature_columns]

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Fit an ordinary least-squares regressor on the training split;
# fit() returns the estimator itself, so it can be chained
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict sale prices for the held-out split (X_test comes from the
# train_test_split call above, not from the competition test file)
y_pred = model.predict(X_test)

In [None]:
# Mean absolute error on the held-out split. scikit-learn metrics take
# (y_true, y_pred) in that order; MAE happens to be symmetric, but keeping the
# documented order avoids wrong results if the metric is ever swapped for an
# asymmetric one.
mean_absolute_error(y_test, y_pred)

### Generating Submission File

In [None]:
# Load the competition test set and apply the same cleaning as the training
# data: drop GarageCars, then fill missing values with -1
X_test_data = (
    pd.read_csv("module3_exercise_test.csv", sep=",", index_col="id")
    .drop(columns="GarageCars")
    .fillna(-1)
)

In [None]:
# The model was fitted on the feature subset held in X, so restrict the test
# frame to exactly those columns (in the same order) before predicting;
# passing the full frame would hand the model features it was never trained on.
y_pred_data = model.predict(X_test_data[X.columns])

In [None]:
# Pair each test id with its predicted sale price
submission = pd.DataFrame(
    {
        'id': X_test_data.index,
        'SalePrice': y_pred_data,
    }
)

# Write without the positional index so the file holds only the two columns above
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()