In [196]:
import pandas as pd
import requests
from scipy import stats

In [198]:

# Remote locations of the exercise datasets (training split and test split)
train_data_url = (
    'https://www.raphaelcousin.com/modules/module3/exercise/'
    'module3_exercise_train.csv'
)
test_data_url = (
    'https://www.raphaelcousin.com/modules/module3/exercise/'
    'module3_exercise_test.csv'
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    file_name : str
        Local path the raw response bytes are written to (overwritten if it
        already exists).
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        server would block the notebook forever, so a finite default is used.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly on bad responses before touching the disk
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both CSV files into the working directory
for source_url, local_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(source_url, local_name)

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_test.csv


In [200]:
# Read the downloaded training CSV, using its `id` column as the row index
df_train = pd.read_csv("module3_exercise_train.csv", index_col='id')

In [202]:
import numpy as np

# Opt in to the future pandas behaviour so `replace` does not silently
# downcast the columns it touches.
pd.set_option('future.no_silent_downcasting', True)

# Spelled-out numbers are mapped to their integer value wherever they occur.
# A single mapping-based `replace` does this in one pass over the frame and
# avoids leaving a throwaway alias of the training frame in the namespace.
word_to_int = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4}
df_train = df_train.replace(word_to_int)

In [204]:
# Force every column to a numeric dtype (entries that cannot be parsed become
# NaN), then mark every missing entry with the sentinel value -1.
df_train = df_train.apply(pd.to_numeric, errors='coerce').fillna(-1)

# Show any rows that still contain a missing value
df_train[df_train.isna().any(axis=1)]

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [208]:
# Most frequent GarageYrBlt among the rows that do not carry the -1 sentinel
garage_year = df_train['GarageYrBlt']
mode_value = stats.mode(garage_year[garage_year != -1])

# Swap the sentinel for that most frequent value
df_train['GarageYrBlt'] = garage_year.replace(-1.0, float(mode_value[0]))

ModeResult(mode=2005.0, count=117)


In [212]:
# Same sentinel -> mode substitution for the two remaining columns
for column in ('GarageCars', 'YrSold'):
    observed = df_train[column][df_train[column] != -1]
    mode_value = stats.mode(observed)
    df_train[column] = df_train[column].replace(-1.0, float(mode_value[0]))

In [214]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

In [216]:
# Preparing the data
y = df_train['SalePrice']
X = df_train[['BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','MoSold','YrSold']]

In [218]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [220]:
# Fit an ordinary least-squares model on the training split
# (`fit` returns the estimator itself, so construction and fitting can be chained)
linear_reg = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out split
y_pred = linear_reg.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 31774.017157925024


In [222]:
# Refit the same estimator on all labelled rows (X, y); from here on
# `linear_reg` holds the model fitted on the full data, not only on X_train.
linear_reg.fit(X, y)

In [230]:
# Load the test features and run them through the same cleaning steps that
# were applied to the training frame, so the model receives inputs encoded
# the way it was fitted.
pd.set_option('future.no_silent_downcasting', True)
word_to_int = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4}
X_teste = (
    pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')
    .replace(word_to_int)                    # spelled-out numbers -> integers
    .apply(pd.to_numeric, errors='coerce')   # anything unparseable -> NaN
)

# Impute the three mode-filled columns with the mode of the TRAINING data
# (ignoring any -1 sentinel left there). Statistics taken from the test set
# would make the test preprocessing differ from what the model was fitted on.
for column in ('GarageYrBlt', 'GarageCars', 'YrSold'):
    train_values = df_train.loc[df_train[column] != -1, column]
    mode_value = stats.mode(train_values)
    X_teste[column] = X_teste[column].fillna(float(mode_value[0]))

# Every other missing entry gets the -1 sentinel, as in the training frame
X_teste = X_teste.fillna(-1)

# Sanity check: this should display an empty frame (no missing values left)
X_teste[X_teste.isnull().any(axis=1)]

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [234]:
# Predict prices for the test rows with the fitted model
y_Pred = linear_reg.predict(X_teste)

# Two-column submission table: the row id and its predicted price
submission = pd.DataFrame({'id': X_teste.index}).assign(SalePrice=y_Pred)

# Write it without the positional index, then preview the first rows
submission.to_csv('submission.csv', sep=',', index=False)
submission.head()

Unnamed: 0,id,SalePrice
0,892,279449.864438
1,1105,226593.452505
2,413,242112.27243
3,522,235619.190721
4,1036,167234.185301
