In [1]:
import pandas as pd

### Data Collection

In [2]:
import requests

# Both exercise CSVs are served from the same directory; only the file name differs.
_BASE_URL = 'https://www.raphaelcousin.com/modules/module3/exercise'
train_data_url = f'{_BASE_URL}/module3_exercise_train.csv'
test_data_url = f'{_BASE_URL}/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address fetched with an HTTP GET request.
        file_name: Local path to write. The file is opened in binary write
            mode, so an existing file with that name is overwritten.
        timeout: Seconds ``requests`` waits for the server (connecting and
            between received bytes) before giving up. Without it a stalled
            server would block the notebook cell indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly before anything is written, so an error page is never
    # saved to disk under the CSV's name.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch the training and test CSVs into the notebook's working directory.
download_file(url=train_data_url, file_name='module3_exercise_train.csv')
download_file(url=test_data_url, file_name='module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_test.csv


In [37]:
# Read the downloaded training data, using the "id" column as the row index
# (comma is already read_csv's default separator).
df_train = pd.read_csv("module3_exercise_train.csv", index_col="id")
df_train

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


### Data Preprocessing

In [38]:
# Remove the GarageCars column, then replace every remaining missing value
# with the sentinel -1.
df_train = df_train.drop(columns=["GarageCars"]).fillna(-1)

In [39]:
# True for rows whose GarageYrBlt is a real value rather than the -1 sentinel.
has_garage_year = df_train['GarageYrBlt'].ne(-1)
# Number of rows about to be discarded.
count_minus_one = (~has_garage_year).sum()
df_train = df_train.loc[has_garage_year]

### Model Building and Evaluation

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [56]:
# Predictor columns fed to the regression; SalePrice is the target.
feature_columns = [
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold',
]
X = df_train[feature_columns]
y = df_train['SalePrice']

# Hold out 20% of the rows for local evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

In [57]:
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [58]:
def pred_eval(model, X_data, y_target):
    """Predict with ``model`` on ``X_data`` and report the error against ``y_target``.

    Prints the mean absolute error between ``y_target`` and the predictions
    (an error measure, so lower is better) and returns the predictions
    exactly as ``model.predict`` produced them.
    """
    predictions = model.predict(X_data)
    mae = mean_absolute_error(y_target, predictions)
    print(f"Mean Absolute Error: {mae}")
    return predictions
# Score the fitted model on the 20% hold-out split.
y_pred = pred_eval(model, X_data=X_test, y_target=y_test)


Mean Absolute Error: 29630.840931564046


#### Final Model

In [59]:
# Refit on every available training row (no hold-out) for the submission.
model_final = LinearRegression().fit(X, y)
model_final

### Generating Submission File

In [62]:
# Load the competition test set. NOTE: this rebinds X_test, which until this
# cell held the hold-out split used for local evaluation.
X_test = pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')
X_test = X_test.drop("GarageCars", axis=1)
X_test = X_test.fillna(-1)

# Every test id needs a prediction, so rows must not be dropped here the way
# they were for training. The model was fitted only on rows whose GarageYrBlt
# is not -1, so the sentinel is replaced with the training median to keep the
# linear model from extrapolating to a "year" of -1.
garage_year_fill = df_train['GarageYrBlt'].median()
X_test['GarageYrBlt'] = X_test['GarageYrBlt'].mask(
    X_test['GarageYrBlt'] == -1, garage_year_fill
)

# Feed exactly the columns the model was fitted on, in the same order.
X_test = X_test[X.columns]
y_pred_final = model_final.predict(X_test)

In [63]:
# One row per test id together with its predicted sale price.
submission = pd.DataFrame({'id': X_test.index, 'SalePrice': y_pred_final})

# Comma is to_csv's default separator; the positional index is not written.
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,273998.945427
1,1105,227777.630998
2,413,257474.059114
3,522,226495.489456
4,1036,162221.497942
