In [317]:
import pandas as pd

### Data Collection

In [318]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/module3/exercise/module3_exercise_test.csv


In [319]:
df_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')
df_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')

### Data Preprocessing

In [320]:
df_train

Unnamed: 0_level_0,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254,2,1,4.0,1,1977.0,two,440,0,55,0,0,165,0,0,7,2010.0,127500
1066,1,1,5.0,1,1983.0,two,612,349,40,0,0,0,0,0,9,2009.0,316600
638,4,1,10.0,1,1998.0,two,420,144,123,0,0,0,0,0,7,2006.0,258000
799,3,1,8.0,0,1916.0,one,180,0,0,0,140,0,0,0,8,2009.0,135000
380,2,1,5.0,0,2005.0,two,438,108,0,0,0,0,0,0,3,2006.0,167240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4,1,7.0,0,-1.0,zero,0,0,192,0,0,0,0,0,11,2008.0,130000
1130,3,1,6.0,0,1964.0,two,504,0,0,0,0,0,0,0,7,2008.0,145000
1294,3,1,7.0,1,1996.0,three,889,220,0,0,0,0,0,0,7,2009.0,265000
860,3,1,6.0,1,1966.0,two,453,188,108,0,0,0,0,0,7,2006.0,155000


In [321]:
#Delete missing values
df_train = df_train.dropna()
df_test = df_test.dropna()
# Delete duplicates
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()
#Delete string features
df_train = df_train.drop(columns=["GarageCars"])
df_test = df_test.drop(columns=["GarageCars"])

In [322]:
#Delete the -1 values
df_train = df_train[df_train['GarageYrBlt'] != -1]
df_test = df_test[df_test['GarageYrBlt'] != -1]

### Model Building and Evaluation

In [323]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

### Generating Submission File

In [324]:
y = df_train["SalePrice"]
X = df_train.copy()
X = X.drop(columns=["SalePrice"])

In [325]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [326]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)

In [327]:
def pred_eval(model, X_data, y_target):
    y_pred = model.predict(X_data)
    accuracy = mean_absolute_error(y_target, y_pred)
    # conf_matrix = confusion_matrix(y_target, y_pred)
    # class_report = classification_report(y_target, y_pred)
    print("Accuracy of the model:", accuracy)
    # print("Confusion Matrix:\n", conf_matrix)
    # print("Classification Report:\n", class_report)
    return y_pred

pred_eval(model, X_train, y_train)
pred_eval(model, X_test, y_test)

Accuracy of the model: 33035.09502496538
Accuracy of the model: 35965.65221985154


array([147032.16393443, 137857.43171806, 209758.87850467, 183246.7654321 ,
       183562.84435798, 261721.1125    , 304982.32727273, 354290.22997416,
       195178.08227848, 131361.30188679, 220206.38297872, 185608.36619718,
       217932.65648855, 127196.20720721, 225616.86184211, 382418.96      ,
       145705.96825397, 162401.94117647, 378039.33928571, 211056.04958678,
       137857.43171806, 139248.47524752, 245587.08560311, 190961.39416058,
       317328.31151832, 162401.94117647, 201228.86842105, 131361.30188679,
       213108.73333333, 209467.39269406, 205150.        , 233006.28082192,
       192263.01785714, 138786.68831169, 209758.87850467, 186585.60784314,
       135857.52688172, 304594.84756098, 205150.        , 127196.20720721,
       242282.59745763, 127196.20720721, 277990.93678161, 178458.86      ,
       195367.04255319, 273935.34482759, 279099.53691275, 192263.01785714,
       125173.53439153, 139248.47524752, 148518.42857143, 209414.1589404 ,
       183562.84435798, 1

In [328]:
model = AdaBoostRegressor()
model.fit(X, y)

In [329]:
submission = pd.DataFrame({
    'id': df_test.index,
    'SalePrice': model.predict(df_test)
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,317218.45122
1,1105,201908.744318
2,413,209403.354167
3,522,180086.256579
4,1036,164999.379592
