In [1]:
#!pip install pandas
import pandas as pd

### Data Collection

In [2]:
#!pip install requests
import requests

# Remote locations of the training and test CSV files for this exercise
train_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice'
    '/module3/exercise/module3_exercise_train.csv'
)
test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice'
    '/module3/exercise/module3_exercise_test.csv'
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    file_name : str
        Path of the local file to write (opened in ``'wb'`` mode, so an
        existing file is overwritten).
    timeout : float or tuple, default 30
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled server would block the cell forever.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk so an error page is never saved as data.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the current working directory
download_file(url=train_data_url, file_name='module3_exercise_train.csv')
download_file(url=test_data_url, file_name='module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [3]:
# Load the training set, using the `id` column as the row index.
df_train = pd.read_csv(
    "module3_exercise_train.csv",
    sep=",",
    index_col="id",
)

### Data Preprocessing

In [4]:
# Print each column's dtype and non-null count to spot missing values
# and columns that were not parsed as numbers.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 254 to 1126
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   BedroomAbvGr   1168 non-null   int64  
 1   KitchenAbvGr   1168 non-null   int64  
 2   TotRmsAbvGrd   1167 non-null   float64
 3   Fireplaces     1168 non-null   int64  
 4   GarageYrBlt    1168 non-null   float64
 5   GarageCars     1167 non-null   object 
 6   GarageArea     1168 non-null   int64  
 7   WoodDeckSF     1168 non-null   int64  
 8   OpenPorchSF    1168 non-null   int64  
 9   EnclosedPorch  1168 non-null   int64  
 10  3SsnPorch      1168 non-null   int64  
 11  ScreenPorch    1168 non-null   int64  
 12  PoolArea       1168 non-null   int64  
 13  MiscVal        1168 non-null   int64  
 14  MoSold         1168 non-null   int64  
 15  YrSold         1167 non-null   float64
 16  SalePrice      1168 non-null   int64  
dtypes: float64(3), int64(13), object(1)
memory usage: 164.2

In [5]:
def pipeline(df, fill_value=-1):
    """Keep only the int64/float64 columns of ``df`` and fill missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is left untouched; a new frame is returned.
    fill_value : scalar, default -1
        Value substituted for every missing entry in the kept columns.

    Returns
    -------
    pandas.DataFrame
        New frame restricted to the int64 and float64 columns of ``df``,
        with missing values replaced by ``fill_value``.
    """
    # Drop every column that is not int64 or float64. Columns of other dtypes
    # (e.g. object) are discarded, not converted.
    numeric_only = df.select_dtypes(include=["int64", "float64"])

    # Replace missing values. One non-in-place fill is enough because
    # select_dtypes already produced a new frame.
    return numeric_only.fillna(fill_value)

# Rebind df_train to its cleaned version (int64/float64 columns only, missing
# values filled). The raw frame is no longer reachable under this name.
df_train=pipeline(df_train)


In [6]:
# NOTE(review): the loop below sits inside a string literal, so it never runs;
# the cell only echoes the string as its output. It spells out a manual way of
# dropping columns whose dtype is not int64/float64, which pipeline() already
# does with select_dtypes. Candidate for removal.
"""
remove = []
for column in df_train.columns:
    if df_train[column].dtype not in ["int64", "float64"]:
        remove.append(column)

df_train.drop(remove, axis=1, inplace=True)
"""

'\nremove = []\nfor column in df_train.columns:\n    if df_train[column].dtype not in ["int64", "float64"]:\n        remove.append(column)\n\ndf_train.drop(remove, axis=1, inplace=True)\n'

### Model Building and Evaluation

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

#### Create a train and a test set

In [8]:
# Fixed random seed, passed to train_test_split so the split is reproducible.
seed = 42

In [9]:
# Separate the predictors from the target column.
X = df_train.drop(columns="SalePrice")
y = df_train["SalePrice"]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # hold out 20% of the rows for evaluation
    random_state=seed,
)

#### Entraînement du modèle

In [11]:
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

#### Prediction

In [12]:
# Predict the target for the held-out rows, to be compared against y_test.
y_pred = model.predict(X_test)

#### Evaluation du modèle

In [13]:
# This is the mean absolute error (MAE), not a mean squared error, so the
# variable is named to match the metric actually computed.
mae = mean_absolute_error(y_test, y_pred)
print(mae)

32533.838983537476


### Generating Submission File

In [14]:
# Load the evaluation set, using the `id` column as the row index.
df_test = pd.read_csv(
    "module3_exercise_test.csv",
    sep=",",
    index_col="id",
)

In [15]:
# Apply the same cleaning used on the training data, so the evaluation set is
# preprocessed identically before prediction.
df_test = pipeline(df_test)

In [16]:
# Predict sale prices for the evaluation set.
# NOTE(review): assumes df_test ends up with the same feature columns the
# model was fitted on (no SalePrice, same dtypes kept by pipeline) — confirm.
y_pred_eval = model.predict(df_test)

In [17]:
# Pair every evaluation-set id with its predicted sale price.
submission = pd.DataFrame(
    {"id": df_test.index, "SalePrice": y_pred_eval}
)

# Write without the positional index so the file has exactly two columns.
submission.to_csv("submission.csv", sep=",", index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,272019.284918
1,1105,235991.854125
2,413,264733.377225
3,522,226985.240146
4,1036,149959.165645
