# Import Section

In [2]:
import sys
sys.path.append('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan')

In [3]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from house_prices import CATEGORICAL_COLUMNS
from house_prices import MODEL_PATH
from house_prices import ENCODER_PATH
from house_prices import NUMERICAL_COLUMNS
from house_prices import SCALER_PATH
from house_prices import TARGET_COLUMNS

  from pandas.core import (


# Model Building

## Model Training

In [4]:
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore')


def compute_rmsle(y_test: np.ndarray,
                  y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)


def scale_numerical_features(dataset, flag):
    if flag:
        scaler_set = scaler.fit(dataset[NUMERICAL_COLUMNS])
        joblib.dump(scaler_set, SCALER_PATH)
        scaler_set = scaler.transform(dataset[NUMERICAL_COLUMNS])
    else:
        scaler_set = joblib.load(SCALER_PATH).transform(dataset[NUMERICAL_COLUMNS])
    return scaler_set


def encode_categorical_features(dataset, flag):
    if flag:
        encoded_set = encoder.fit(dataset[CATEGORICAL_COLUMNS])
        joblib.dump(encoded_set, ENCODER_PATH)
        encoded_set = encoder.transform(dataset[CATEGORICAL_COLUMNS]).toarray()
    else:
        encoded_set = joblib.load(ENCODER_PATH).transform(dataset[CATEGORICAL_COLUMNS]).toarray()
    return encoded_set


def preprocessor(dataset, flag):
    numerical_features = scale_numerical_features(dataset, flag)
    categorical_features = encode_categorical_features(dataset, flag)
    processed_feature = np.hstack([numerical_features, categorical_features])
    return processed_feature


def transform_target(y_train):
    y_train_transformed = np.log(y_train + 1)
    return y_train_transformed


def train_model(X_train_prepared, y_train_transformed):
    model = LinearRegression()
    model.fit(X_train_prepared, y_train_transformed)
    joblib.dump(model, MODEL_PATH)
    return model


def model_predict(model, X_test_prepared):
    y_pred_raw = model.predict(X_test_prepared)
    y_pred = np.exp(y_pred_raw) - 1
    return y_pred


# Model Evaluation

In [5]:
def model_predict(model, X_test_prepared):
    y_pred_raw = model.predict(X_test_prepared)
    y_pred = np.exp(y_pred_raw) - 1  
    return y_pred

In [12]:


training_data_df = pd.read_csv('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan/data/train.csv')

def build_model(data: pd.DataFrame) -> dict[str, str]:
    feature, target = data[NUMERICAL_COLUMNS + CATEGORICAL_COLUMNS] , data[TARGET_COLUMNS]
    X_train, X_test, y_train, y_test =  train_test_split(feature, target, test_size=0.2, random_state=42) 
    processed_feature_train = preprocessor(X_train, True)
    processed_feature_test = preprocessor(X_test, True)
    y_train_transformed = transform_target(y_train)
    model = train_model(processed_feature_train, y_train_transformed)
    y_pred = model_predict(model, processed_feature_test)
    rmsle_score = compute_rmsle(y_test, y_pred)
    result = {'RMSLE': str(rmsle_score)}
    return result


# Model Inference

### Loading test.csv file

In [7]:
test_data = pd.read_csv('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan/data/test.csv')

### Preprocessing and feature engineering the test set

In [9]:
loaded_model = joblib.load('..\models\model.joblib')
loaded_encoder = joblib.load('..\models\encoder.joblib')


In [10]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    feature = input_data[NUMERICAL_COLUMNS + CATEGORICAL_COLUMNS]
    numerical_features = scale_numerical_features(feature, False)
    categorical_features = encode_categorical_features(feature, False)
    processed_test_feature = np.hstack([numerical_features,
                                        categorical_features])
    acutal_prediction = joblib.load(MODEL_PATH).predict(processed_test_feature)
    submission_df = pd.DataFrame({'Id': input_data['Id'],
                                  'SalePrice': np.exp(acutal_prediction)})
    return submission_df



In [11]:
predictions = make_predictions(test_data)
predictions

Unnamed: 0,Id,SalePrice
0,1461,95883.155677
1,1462,165828.339970
2,1463,190985.985944
3,1464,186314.255659
4,1465,157070.014211
...,...,...
1454,2915,108379.339111
1455,2916,108364.128095
1456,2917,160800.535924
1457,2918,138019.363092
