In [15]:
import pandas as pd

### Data Collection

In [16]:
import requests

# Source URLs of the exercise's training and test CSV files.
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to
            (opened in binary mode, overwriting any existing file).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block forever on a stalled
            connection, which hangs the whole notebook.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as a CSV.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both exercise files into the working directory.
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [17]:
# Load the training CSV; its `id` column becomes the row index.
df_train = pd.read_csv(
    "module3_exercise_train.csv",
    sep=",",
    index_col='id',
)

### Data Preprocessing

In [18]:
# Drop the GarageCars column and replace every missing value with the sentinel -1.
# Re-assigning the result (rather than mutating with inplace=True) together with
# errors='ignore' keeps this cell re-runnable: running it a second time no longer
# raises KeyError because GarageCars has already been removed.
df_train = df_train.drop(columns='GarageCars', errors='ignore').fillna(-1)

### Model Building and Evaluation

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [38]:
# Features are every column except the target; the target is SalePrice.
# Rows carrying the -1 missing-value sentinel (e.g. in GarageYrBlt) are NOT
# filtered out here. Any row filter must be applied to the frame before BOTH
# X and y are derived from it, otherwise the two would no longer line up.
X = df_train.drop(columns='SalePrice')
y = df_train['SalePrice']

In [39]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Fit a linear-regression baseline on the training split.
model = LinearRegression().fit(X_train, y_train)

# Score it on the held-out validation split with mean absolute error.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"Mean Absolute Error: {mae}")


Mean Absolute Error: 32533.838983537677


### Generating Submission File

In [40]:
# Load the test CSV (indexed by `id`) and apply the same preprocessing as the
# training data: drop GarageCars, then fill missing values with -1.
X_test = (
    pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')
    .drop(columns='GarageCars')
    .fillna(-1)
)

In [41]:
# Predict on the test features with the fitted model.
# Note: this rebinds y_pred, which earlier held the validation-split predictions.
y_pred = model.predict(X_test)

In [42]:
# Pair each test-row id with its predicted sale price.
submission_columns = {'id': X_test.index, 'SalePrice': y_pred}
submission = pd.DataFrame(submission_columns)

# Write the submission without the positional index, then preview the first rows.
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,272019.284918
1,1105,235991.854125
2,413,264733.377225
3,522,226985.240146
4,1036,149959.165645
