In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Data Collection

In [2]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [3]:
df_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')

### Data Preprocessing

In [4]:
df_train[df_train.isnull().any(axis=1)]

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1189,3,1,6.0,0,1966.0,,408,0,0,0,0,0,0,0,12,2008.0,109900
135,2,1,6.0,0,1981.0,one,308,0,176,0,0,0,0,0,6,,120000
321,2,1,,1,1948.0,one,240,0,130,0,0,0,0,0,11,2007.0,119200


In [5]:
def data_prep(df):
    df['GarageCars'] = df['GarageCars'].replace({
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4
    })
    garage_yr_mode = df.loc[df['GarageYrBlt'] != -1, 'GarageYrBlt'].mode()[0]
    df['GarageYrBlt'] = df['GarageYrBlt'].replace(-1, garage_yr_mode)
    return df

In [6]:
df_train = data_prep(df_train)
# df_train[df_train.isnull().any(axis=1)]
df_train['GarageCars'] = df_train['GarageCars'].fillna(2)  #Because the GarageArea value is around 408, properties generally have two garages.
df_train['YrSold'] = df_train['YrSold'].fillna(df_train['YrSold'].median())
df_train['TotRmsAbvGrd'] = df_train['TotRmsAbvGrd'].fillna(df_train['TotRmsAbvGrd'].median())
print("The number of missing values of train dataset:", df_train.isnull().sum().sum())

The number of missing values of train dataset: 0


  df['GarageCars'] = df['GarageCars'].replace({


### Model Building and Evaluation

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [8]:
y = df_train['SalePrice']
X = df_train[['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
             'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
             'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
             'MoSold', 'YrSold']]

In [9]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = LinearRegression()
model.fit(X_train, y_train)

In [11]:
# Evaluate the model

def pred_eval(model, X_data, y_target):
    y_pred = model.predict(X_data)

    mse = mean_squared_error(y_target, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_target, y_pred)
    r2 = r2_score(y_target, y_pred)

    print("Evaluation of the regression model:")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}")
    print(f"  R-squared (R²): {r2:.4f}")
    return y_pred

In [12]:
# Predict and eval on the train data
y_pred = pred_eval(model, X_train, y_train)

Evaluation of the regression model:
  Mean Squared Error (MSE): 2044225476.1991
  Root Mean Squared Error (RMSE): 45213.1118
  Mean Absolute Error (MAE): 30940.4365
  R-squared (R²): 0.6627


## Predict test.csv

### Data Collection

In [13]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [14]:
f_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')
X_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')

### Data Preprocessing

In [15]:
df_train = data_prep(df_train)
# df_train[df_train.isnull().any(axis=1)]
df_train['GarageCars'] = df_train['GarageCars'].fillna(2)  #Because the GarageArea value is around 408, properties generally have two garages.
df_train['YrSold'] = df_train['YrSold'].fillna(df_train['YrSold'].median())
df_train['TotRmsAbvGrd'] = df_train['TotRmsAbvGrd'].fillna(df_train['TotRmsAbvGrd'].median())
print("The number of missing values of train dataset:", df_train.isnull().sum().sum())

The number of missing values of train dataset: 0


In [16]:
X_test = pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')
X_test = data_prep(X_test)
# df_test[df_test.isnull().any(axis=1)]
X_test['EnclosedPorch'] = X_test['EnclosedPorch'].fillna(X_test['EnclosedPorch'].median())
X_test['Fireplaces'] = X_test['Fireplaces'].fillna(X_test['Fireplaces'].median())
print("The number of missing values of test dataset:", X_test.isnull().sum().sum())

The number of missing values of test dataset: 0


  df['GarageCars'] = df['GarageCars'].replace({


### Model Training and Prediction

In [17]:
X_train = df_train[['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
             'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
             'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
             'MoSold', 'YrSold']]
y_train = df_train['SalePrice']

In [18]:
# Train on all information you have
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

### Generating Submission File

In [19]:
submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': y_pred # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
print(submission.head())
print("Saved submission.csv")

     id      SalePrice
0   892  279485.852306
1  1105  226818.631545
2   413  241866.156771
3   522  236504.164510
4  1036  167231.980058
Saved submission.csv
