In [1]:
import pandas as pd

### Data Collection

In [2]:
import requests

# Remote locations of the exercise's training and test CSV files
train_data_url = 'https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the file, fetched with an HTTP GET request.
        file_name: Local path the response body is written to. The file is
            opened in binary write mode, so an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block forever on an unresponsive host.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk so an error page is never saved as data.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the current working directory
download_file(url=train_data_url, file_name='module3_exercise_train.csv')
download_file(url=test_data_url, file_name='module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_test.csv


In [3]:
# Load the training set, using the `id` column as the row index.
df_train = pd.read_csv("module3_exercise_train.csv", index_col="id")

### Data Preprocessing

In [30]:
# Split the target from the features. GarageCars is removed along with the
# target because it is the qualitative (non-numeric) column.
target = df_train["SalePrice"]
features = df_train.drop(columns=["GarageCars", "SalePrice"])

# For now, simply discard every row that has at least one missing feature.
# The mask is computed once on the unfiltered features and applied to both
# the features and the target so they stay aligned.
complete_rows = features.notna().all(axis=1)
X_train = features.loc[complete_rows]
y_train = target.loc[complete_rows]

### Model Building and Evaluation

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [31]:
# Fit a linear regression on the training rows that have no missing values.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [39]:
from sklearn.metrics import mean_absolute_error

def pred_eval(model, X_data, y_target):
    """Predict on ``X_data``, print the error against ``y_target``, return the predictions.

    The printed figure is labelled "Accuracy" but is the mean absolute error
    between ``y_target`` and the predictions, so lower values are better.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        X_data: Features passed to ``model.predict``.
        y_target: True target values the predictions are compared with.

    Returns:
        Whatever ``model.predict(X_data)`` returned.
    """
    predictions = model.predict(X_data)
    mae = mean_absolute_error(y_target, predictions)
    print("Accuracy of the model:", mae)
    return predictions

In [46]:
# Evaluated on the same rows the model was fitted on, so this is an in-sample error.
y_pred = pred_eval(model=model, X_data=X_train, y_target=y_train)

Accuracy of the model: 32485.113893712973


### Generating Submission File

In [47]:
# Load the test set (indexed by `id`) and display the rows that contain
# at least one missing value.
X_test = pd.read_csv("module3_exercise_test.csv", index_col="id")
X_test.loc[X_test.isna().any(axis=1)]

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
598,2,1,6,,2005.0,two,556,203,47,0.0,0,0,0,0,8,2007
651,4,2,8,1.0,1999.0,two,672,344,0,,0,0,0,0,5,2010


In [49]:
# Select exactly the columns the model was fitted on, in the same order, so
# prediction does not depend on the column layout of the test file.
# Missing test values are replaced with 0 because every test row needs a
# prediction; note that training instead dropped rows with missing values.
X_test_features = X_test[X_train.columns].fillna(0)

submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': model.predict(X_test_features),
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,269839.78068
1,1105,234599.579194
2,413,262127.023157
3,522,223979.785806
4,1036,150636.261203
