In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Data Collection

In [2]:
import requests

# Remote locations of the exercise datasets (training split, then test split)
train_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
)
test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` with an HTTP GET and save the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path to write; opened in binary mode, so an existing
            file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the working directory
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [3]:
# Load both splits, using the `id` column as the row index
# (comma is read_csv's default separator, so it is not spelled out)
df_train = pd.read_csv("module3_exercise_train.csv", index_col="id")
df_test = pd.read_csv("module3_exercise_test.csv", index_col="id")

In [4]:
# Show the training frame as the cell output for a first visual check
df_train

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [5]:
# Report the frame's dimensions, then the per-column dtype / non-null summary.
# (A bare `df_train.shape` in the middle of a cell is never displayed, so it was dropped.)
print("shape=", df_train.shape)
print("\nSummary info:")
df_train.info()

shape= (1168, 17)

Summary info:
<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 254 to 1126
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   BedroomAbvGr   1168 non-null   int64  
 1   KitchenAbvGr   1168 non-null   int64  
 2   TotRmsAbvGrd   1167 non-null   float64
 3   Fireplaces     1168 non-null   int64  
 4   GarageYrBlt    1168 non-null   float64
 5   GarageCars     1167 non-null   object 
 6   GarageArea     1168 non-null   int64  
 7   WoodDeckSF     1168 non-null   int64  
 8   OpenPorchSF    1168 non-null   int64  
 9   EnclosedPorch  1168 non-null   int64  
 10  3SsnPorch      1168 non-null   int64  
 11  ScreenPorch    1168 non-null   int64  
 12  PoolArea       1168 non-null   int64  
 13  MiscVal        1168 non-null   int64  
 14  MoSold         1168 non-null   int64  
 15  YrSold         1167 non-null   float64
 16  SalePrice      1168 non-null   int64  
dtypes: float64(3), int64(1

In [6]:
# Count distinct values per column, print them, then show the describe() table
distinct_counts = df_train.nunique()
print("\nSummary Statistics:")
print(f"Number different values per column\n{distinct_counts}")
df_train.describe()


Summary Statistics:
Number different values per column
BedroomAbvGr       8
KitchenAbvGr       3
TotRmsAbvGrd      12
Fireplaces         4
GarageYrBlt       98
GarageCars         5
GarageArea       387
WoodDeckSF       242
OpenPorchSF      184
EnclosedPorch    105
3SsnPorch         15
ScreenPorch       66
PoolArea           6
MiscVal           19
MoSold            12
YrSold             5
SalePrice        571
dtype: int64


Unnamed: 0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1168.0,1168.0,1167.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1167.0,1168.0
mean,2.872432,1.043664,6.497858,0.607877,1867.833904,469.120719,91.976027,46.300514,23.607021,2.782534,15.701199,2.378425,39.72089,6.336473,2007.802057,179163.84589
std,0.805929,0.212653,1.61833,0.630833,454.504382,210.626791,123.448275,67.180017,62.307174,23.908902,57.516696,36.47796,489.113625,2.692267,1.327656,76848.682649
min,0.0,1.0,2.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,2.0,1.0,5.0,0.0,1957.0,324.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,130000.0
50%,3.0,1.0,6.0,1.0,1977.0,478.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,161625.0
75%,3.0,1.0,7.0,1.0,2000.0,576.0,168.0,66.25,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,212250.0
max,8.0,3.0,14.0,3.0,2010.0,1418.0,736.0,547.0,386.0,320.0,480.0,648.0,15500.0,12.0,2010.0,745000.0


### Data Preprocessing

In [7]:
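# Missing values need to be filled, and spelled-out counts converted to integers (one → 1, zero → 0, ...); this is an ordinal mapping, not one-hot encoding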

from sklearn.preprocessing import LabelEncoder

def minimal_data_prep(data_df):
    """Return a model-ready copy of ``data_df``; the input frame is not modified.

    Two steps are applied:

    1. If a ``GarageCars`` column exists, its spelled-out values
       ("zero" ... "four") are decoded to the integers 0-4 and the column is
       converted to a numeric dtype. Missing entries become -1; values that
       are already numeric are kept.
    2. Every remaining missing value in the frame is replaced by -1.

    Args:
        data_df: Frame to prepare.

    Returns:
        A new DataFrame with the same columns in the same order.

    Raises:
        ValueError: If ``GarageCars`` holds a label that is neither a known
            word nor numeric, so the problem surfaces here instead of later
            inside the model.
    """
    # Manual word -> count lookup for GarageCars
    mapping = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4}

    def _decode_garage_cars(value):
        # Missing entries get the same -1 sentinel used for the rest of the frame
        if pd.isna(value):
            return -1
        return mapping.get(value, value)

    if "GarageCars" in data_df.columns:
        # Decode before the frame-wide fillna so a text column never receives
        # a numeric fill value, and convert explicitly rather than relying on
        # implicit object -> int downcasting.
        garage_cars = pd.to_numeric(data_df["GarageCars"].map(_decode_garage_cars))
        # assign() works on a copy, leaving the caller's frame untouched
        data_df = data_df.assign(GarageCars=garage_cars)

    # Fill the remaining missing values; fillna also returns a new frame
    return data_df.fillna(-1)


# Run both splits through the same preparation step
df_train, df_test = (minimal_data_prep(split) for split in (df_train, df_test))

# Expose the prepared test frame under the feature-matrix name
X_test = df_test

### Model Building and Evaluation

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Separate the target column from the predictors
y_train = df_train["SalePrice"]
X_train = df_train.drop(columns="SalePrice")

# Hold out 20% of the rows for validation (fixed seed keeps the split reproducible)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion (fit returns the estimator)
model = LinearRegression().fit(X_tr, y_tr)

# Evaluate on the validation set with mean absolute error
y_val_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_val_pred)
print("MAE on val set:", mae)

MAE on val set: 32712.383390123217


In [9]:
# The validation MAE (32712.383390123217) is in a reasonable range for house-price
# prediction, so the setup is kept as is and the model is retrained on all of the
# training data.

# Refit on the full training set and predict the test set in one chained call
# (fit returns the fitted estimator)
y_pred = model.fit(X_train, y_train).predict(X_test)

### Generating Submission File

In [10]:
# Build the two-column table (id, SalePrice): predictions are labelled by the
# test frame's index, which is then moved out into a regular `id` column
predicted_prices = pd.Series(y_pred, index=X_test.index, name='SalePrice')
submission = predicted_prices.rename_axis('id').reset_index()

# Write without the positional index (comma is to_csv's default separator)
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,280753.754672
1,1105,233058.303867
2,413,241446.432441
3,522,232055.131898
4,1036,157121.361077
