In [1]:
import pandas as pd

### Data Collection

In [2]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [3]:
df_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')
df_train = df_train.dropna()

### Data Preprocessing

In [4]:
df_train

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [5]:
df_train.drop(df_train.columns[0],axis = 1)
df_train.fillna(-1)

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [6]:
def namer (x):
    if x == 'two':
        return 2
    elif x == 'one':
        return 1
    elif x == 'zero':
        return 0
    else:
        return 3

df_train['GarageCars'] = df_train['GarageCars'].map(namer)

### Model Building and Evaluation

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

### Generating Submission File

In [8]:
X_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')


In [9]:
import torch
from torch import nn
from torch import optim
import numpy as np
X,target = df_train[df_train.columns[:-1]].values , df_train[[df_train.columns[-1]]].values
n,p = X.shape
q = target.shape[1]
Xt = torch.tensor(X, dtype = torch.float32, requires_grad = True)
yt = torch.tensor(target, dtype = torch.float32)
model = nn.Sequential(nn.Linear(p,q))
crit = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr = 0.001)
n_it = 100
for _ in range(n_it):
    optimizer.zero_grad()
    y_pred = model(Xt)
    loss = crit(yt,y_pred)
    loss.backward()
    optimizer.step()



In [10]:
X_test['GarageCars'] = X_test['GarageCars'].map(namer)
def replace(x,mean):
  if pd.isna(x):
    return mean
  else:
    return x
cols = X_test.columns

for col in cols:
    m = X_test[col].mean()
    X_test[col] = X_test[col].apply(lambda x: replace(x, m))

In [11]:
y_pred  = model(torch.tensor(X_test.values,dtype = torch.float32))
y_pred

tensor([[ 185.0317],
        [ 169.6192],
        [ 194.8410],
        [ 232.4527],
        [ 163.4118],
        [ 198.1998],
        [ 196.7027],
        [ 168.8469],
        [ 167.1377],
        [ 157.5826],
        [ 174.0674],
        [ 430.6003],
        [ 185.5203],
        [ 197.7826],
        [ 193.4583],
        [ 408.5291],
        [ 167.4694],
        [ 184.7210],
        [ 183.8329],
        [ 180.8546],
        [ 169.5303],
        [ 160.0351],
        [ 183.6339],
        [ 167.4267],
        [ 164.6653],
        [ 152.9154],
        [ 168.7778],
        [ 173.2085],
        [ 182.9542],
        [ 184.7781],
        [ 196.7059],
        [ 405.1178],
        [ 165.0204],
        [ 165.9274],
        [ 200.0683],
        [ 172.9138],
        [ 220.1917],
        [ 165.0251],
        [ 178.0437],
        [ 152.7399],
        [ 185.6968],
        [ 199.4767],
        [ 169.4483],
        [ 150.2643],
        [ 185.8936],
        [ 209.2257],
        [ 181.0294],
        [ 407

In [12]:
submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': y_pred.detach().numpy().squeeze()
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,185.031723
1,1105,169.619202
2,413,194.840973
3,522,232.452682
4,1036,163.411804
