In [60]:
import pandas as pd

### Data Collection

In [61]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv'

# Function to download a file
def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module3_exercise_train.csv')
download_file(test_data_url, 'module3_exercise_test.csv')

Downloaded module3_exercise_train.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv
Downloaded module3_exercise_test.csv from https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv


In [62]:
df_train =  pd.read_csv("module3_exercise_train.csv", sep=",", index_col='id')

### Data Preprocessing

In [63]:

df_train.drop('GarageCars', axis=1, inplace=True)
df_train.fillna(-1, inplace=True)

In [64]:
print(df_train.columns)

Index(['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')


### Model Building and Evaluation

In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define features (X) and target (y)
X = df_train.drop('SalePrice', axis=1)
y = df_train['SalePrice']

# Split data into training (70%) and the rest (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # Split 70% train, 30% test

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (817, 15)
Shape of y_train: (817,)
Shape of X_test: (351, 15)
Shape of y_test: (351,)


In [66]:
# Initialize and train the Logistic Regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [67]:
from sklearn.metrics import mean_absolute_error

def pred_eval(model, X_data, y_target):
    y_pred = model.predict(X_data)
    mae = mean_absolute_error(y_target, y_pred)
    print("Mean Absolute Error of the model:", mae)
    return y_pred



In [68]:
# Predict and eval on the train data
y_pred_train = pred_eval(model, X_train, y_train)

# Predict and eval on the test data
y_pred_test = pred_eval(model, X_test, y_test)



Mean Absolute Error of the model: 31877.58785533803
Mean Absolute Error of the model: 33924.26183391982


In [69]:
display(y_pred_test)


array([177762.16369213, 157294.80102779,  93589.76313637, 297185.59860724,
       239695.73530549,  79052.95065653, 170958.9294867 , 207042.49363154,
       137146.73732001, 195574.00076577, 170010.31732039, 180497.74127409,
       180325.68672682, 143463.3952158 , 161922.73689485,  76286.17707133,
       192875.51255535, 207377.85519877, 232544.29869276, 194143.12095234,
       123641.37172422, 218520.37643301, 156722.02843355, 128047.35333517,
       293929.6231152 , 193112.4693629 , 246050.75082426, 188898.01215956,
       143066.13413992, 154832.89409935, 207722.33897574, 102020.89972448,
       182178.08937625,  89758.90130897, 186963.48778313, 188987.97250394,
       198156.8565514 , 143697.63905731, 138630.29235777, 169933.36605945,
       131270.86912812, 285298.39638917, 208366.2509761 , 132213.19369327,
       118581.73679791, 124663.54277267, 285723.11962051, 315726.16740466,
       159807.78012901, 245526.81670453, 102590.82516666, 172479.91438745,
       205372.70430841,  

### Generating Submission File

In [70]:
X_test =  pd.read_csv("module3_exercise_test.csv", sep=",", index_col='id')

In [71]:

submission = pd.DataFrame({
    'id': X_test.index,
    'SalePrice': 0 # your_prediction
})

submission.to_csv('submission.csv', index=False, sep=',')
submission.head()

Unnamed: 0,id,SalePrice
0,892,0
1,1105,0
2,413,0
3,522,0
4,1036,0
