In [None]:
import pandas as pd

### Data Collection

In [None]:
import requests

# Remote locations of the exercise data: training CSV first, test CSV second
train_data_url, test_data_url = (
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_train.csv',
    'https://www.raphaelcousin.com/modules/data-science-practice/module3/exercise/module3_exercise_test.csv',
)

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download `url` and save the response body to `file_name`.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the bytes are written to (opened in 'wb' mode,
            so an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely on a stalled
            connection and hang the notebook.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch each remote CSV into the working directory (train first, then test)
for url, file_name in (
    (train_data_url, 'module3_exercise_train.csv'),
    (test_data_url, 'module3_exercise_test.csv'),
):
    download_file(url, file_name)

In [None]:
# Load the downloaded training CSV; the 'id' column becomes the row index
# (comma is read_csv's default separator, so it is not spelled out).
df_train = pd.read_csv("module3_exercise_train.csv", index_col="id")

### Data Preprocessing

In [None]:

# Remove the GarageCars column and replace every missing value with the -1
# sentinel. Written as a non-mutating chain with errors='ignore' so the cell
# can be re-run safely: the previous in-place drop raised KeyError on a second
# execution because the column was already gone.
df_train = df_train.drop(columns='GarageCars', errors='ignore').fillna(-1)

In [None]:
# List the columns currently present in df_train (run after preprocessing)
print(df_train.columns)

### Model Building and Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Separate the SalePrice target from the remaining predictor columns
y = df_train['SalePrice']
X = df_train.drop(columns='SalePrice')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the size of each partition
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)


In [None]:
# Initialize and train the linear regression model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

def pred_eval(model, X_data, y_target):
    """Predict on `X_data`, print the mean absolute error, return the predictions.

    Args:
        model: Fitted estimator exposing a `predict` method.
        X_data: Feature rows passed to `model.predict`.
        y_target: True target values the predictions are scored against.

    Returns:
        The output of `model.predict(X_data)`.
    """
    predictions = model.predict(X_data)
    print(
        "Mean Absolute Error of the model:",
        mean_absolute_error(y_target, predictions),
    )
    return predictions



In [None]:
# Predict and eval on the train data
y_pred_train = pred_eval(model, X_train, y_train)

# Predict and eval on the test data
y_pred_test = pred_eval(model, X_test, y_test)



### Generating Submission File

In [None]:
# Load the downloaded test CSV, indexed by its 'id' column.
# NOTE: this rebinds X_test, which previously held the validation split
# produced by train_test_split; y_test no longer matches it after this cell.
X_test = pd.read_csv("module3_exercise_test.csv", index_col="id")

In [None]:
# Fill gaps with the same -1 sentinel used on the training frame, then align
# the columns to the training features: columns the model never saw are
# dropped and any that are absent are created filled with -1.
X_test = X_test.fillna(-1).reindex(columns=X_train.columns, fill_value=-1)
y_pred = model.predict(X_test)

In [None]:
# One row per test id paired with its predicted SalePrice
submission = pd.DataFrame({'id': X_test.index, 'SalePrice': y_pred})

# Write only the id and SalePrice columns (no row index), comma-separated by default
submission.to_csv('submission.csv', index=False)
display(submission.head())