# Import Section

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
import joblib

# Model Building

## Model Training

In [8]:
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore')
#RMSLE function
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)


def select_feature(train_data, numerical_cols, categorical_cols) -> pd.DataFrame:
    features = train_data[numerical_cols + categorical_cols]
    return features


def select_target(train_data, target_col) -> pd.Series:
    target = train_data[target_col]
    return target

def split_dataset(features, target):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

def scale_numerical_features(dataset, numerical_cols, flag):
    if flag:
        scaler_set =  scaler.fit(dataset[numerical_cols])
    scaler_set =  scaler.transform(dataset[numerical_cols])
    return scaler_set

def encode_categorical_features(dataset, categorical_cols, flag):
    if flag:
        encoded_set = encoder.fit(dataset[categorical_cols])
    encoded_set = encoder.transform(dataset[categorical_cols]).toarray() 
    return encoded_set

def combining_features(dataset, numerical_cols, categorical_cols, flag):
    numerical_features = scale_numerical_features(dataset, numerical_cols, flag)
    categorical_features = encode_categorical_features(dataset, categorical_cols, flag)
    processed_feature = np.hstack([numerical_features, categorical_features])
    return processed_feature

def transform_target(y_train):
    y_train_transformed = np.log(y_train + 1)
    return y_train_transformed

def train_model(X_train_prepared, y_train_transformed):
    model = LinearRegression()
    model.fit(X_train_prepared, y_train_transformed)
    #joblib.dump for model
    model_path = 'C:\Pradeepa\SEMESTER2_EPITA\Data Science in production\models\model.joblib'
    #joblib.dump for encoder
    encoder_path = 'C:\Pradeepa\SEMESTER2_EPITA\Data Science in production\models\encoder.joblib'
    joblib.dump(model, model_path)
    joblib.dump(encoder, encoder_path)
    return model


# Model Evaluation

In [31]:

def model_predict(model, X_test_prepared):
    y_pred_raw = model.predict(X_test_prepared)
    y_pred = np.exp(y_pred_raw) - 1  
    return y_pred

def evaluate_model(y_test, y_pred):
    rmsle_score = compute_rmsle(y_test, y_pred)
    print(f'RMSLE: {rmsle_score}')

In [32]:
training_data_df = pd.read_csv('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan/data/train.csv')

numerical_cols = ['LotArea', 'GrLivArea'] 
categorical_cols = ['MSZoning', 'Street'] 
target_col = 'SalePrice'


def model_training(train_data):
    feature, target = select_feature(train_data, numerical_cols, categorical_cols), select_target(train_data, target_col)
    X_train, X_test, y_train, y_test = split_dataset(feature, target)
    scalar_values = scale_numerical_features(X_train, numerical_cols, True)
    categorical_values = encode_categorical_features(X_train, categorical_cols, True)
    processed_feature_train = combining_features(X_train , numerical_cols, categorical_cols, True)
    processed_feature_test = combining_features(X_test , numerical_cols, categorical_cols, True)
    y_train_transformed = transform_target(y_train)
    model = train_model(processed_feature_train, y_train_transformed)
    y_pred = model_predict(model, processed_feature_test)
    evaluate_model(y_test, y_pred)
    return model, processed_feature_test, y_test

def build_model(data: pd.DataFrame) -> dict[str, str]:
    model_training(data)
    pass

model_performance_dict = build_model(training_data_df)


RMSLE: 0.25


# Model Inference

### Loading test.csv file

In [27]:
test_data = pd.read_csv('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/dsp-pradeepa-kujulva-arjunan/data/test.csv')

### Preprocessing and feature engineering the test set

In [28]:
# Load the trained model and encoder
loaded_model = joblib.load('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/models/model.joblib')
loaded_encoder = joblib.load('C:/Pradeepa/SEMESTER2_EPITA/Data Science in production/models/encoder.joblib')

numerical_cols = ['LotArea', 'GrLivArea'] 
categorical_cols = ['MSZoning', 'Street'] 

In [33]:
test_feature = select_feature(test_data, numerical_cols, categorical_cols)
test_numerical_features = scale_numerical_features(test_feature, numerical_cols, False)
test_categorical_features = encode_categorical_features(test_feature, categorical_cols, False)
processed_test_feature = np.hstack([test_numerical_features, test_categorical_features])

# Make predictions using the loaded model
acutal_prediction = loaded_model.predict(processed_test_feature)
submission_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': acutal_prediction})

### Actual Prediction

In [34]:
submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,11.470886
1,1462,12.018708
2,1463,12.159955
3,1464,12.13519
4,1465,11.964447
