In [None]:
import numpy as np
import pandas as pd

%run preprocess_functions_03.ipynb

In [None]:
#######################
### Preparing Data
#######################

def encode_train_test(df_train, df_test):
    """One-hot encode train and test together so both share the same columns.

    The two frames are stacked, ``f_brand`` is cast to ``object`` so that
    ``pd.get_dummies`` treats it as categorical, and the encoded result is
    split back into its train and test parts.  The float column
    ``f_pricy_brands_mean_log_price`` is held out while encoding and
    re-attached afterwards as the last column.

    Args:
        df_train: Preprocessed training frame.
        df_test: Preprocessed test frame.

    Returns:
        Tuple ``(df_train, df_test)`` of encoded frames with identical
        columns and the row indices of the inputs.  The inputs themselves
        are not modified.

    Raises:
        KeyError: If ``f_brand``, ``f_pricy_brands_mean_log_price`` or any of
            the dropped ``*_null`` columns is missing.
    """
    # Temporary marker used to split the stacked frame apart again.  `assign`
    # returns copies, so the callers' frames never gain a "train" column.
    combined = pd.concat([df_train.assign(train=1), df_test.assign(train=0)])
    combined["f_brand"] = combined["f_brand"].astype(object)

    # Hold the float column out while the dummies are built.
    float_col = "f_pricy_brands_mean_log_price"
    float_values = combined.pop(float_col).to_numpy()

    combined_dummies = pd.get_dummies(combined)

    # Drop the correlated *_null indicators; only f_returns_null is kept.
    correlated_cols = [
        "f_longtime_member_null",
        "f_fast_safe_shipping_null",
        "f_same_day_shipping_null",
        "f_free_shipping_null",
    ]
    combined_dummies = combined_dummies.drop(columns=correlated_cols)

    # Re-attach by position: the row order has not changed since the pop, and
    # positional assignment does not depend on index labels, which may repeat
    # after the concat.
    combined_dummies[float_col] = float_values

    # Split again and discard the marker column.
    train_rows = combined_dummies["train"] == 1
    test_rows = combined_dummies["train"] == 0
    encoded = combined_dummies.drop(columns=["train"])

    return encoded[train_rows], encoded[test_rows]


def split_train_to_X_y(df_train):
    """Separate the ``log_price`` target from the remaining feature columns.

    Args:
        df_train: Training frame that contains a ``log_price`` column.

    Returns:
        Tuple ``(X_train, y_train)``: the frame without ``log_price`` and the
        ``log_price`` column itself.  ``df_train`` is left unchanged.
    """
    target = "log_price"
    features = df_train.drop(columns=target)
    return features, df_train[target]


def prepare_train_test_data(df, test_size=0.20, random_state=None):
    """Split ``df`` into train/test parts and preprocess and encode both.

    Side effect: a ``log_price`` column (natural log of ``price``) is added
    to ``df`` in place.

    Args:
        df: Raw data containing a ``price`` column.
        test_size: Share of rows held out for the test set, forwarded to
            ``train_test_split``.  Defaults to 0.20.
        random_state: Seed forwarded to ``train_test_split``; pass an int for
            a reproducible split.  ``None`` leaves the split unseeded.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the ``X`` frames
        are preprocessed and one-hot encoded and the ``y`` series hold
        ``log_price``.
    """
    # Add the log_price target column.
    df["log_price"] = np.log(df["price"])

    # Split the data into train and test sets.
    # NOTE(review): train_test_split is not imported in this file; presumably
    # it is scikit-learn's, brought into scope by the %run'd notebook -- confirm.
    # NOTE(review): the raw `price` column is still among the features here;
    # presumably process_df_for_model_3 removes it -- confirm, otherwise the
    # target leaks into X.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns="log_price"),
        df["log_price"],
        test_size=test_size,
        random_state=random_state,
    )

    # Run once, on the training rows only.  The return values are discarded,
    # so these helpers are presumably called for their side effects -- confirm.
    # Each helper gets a freshly built frame so in-place edits made by one
    # helper cannot leak into the next.
    fit_steps = (
        preprocess_country_region_of_manufacture,
        write_all_seen_brands_to_file,
        write_brands_mean_prices_to_file,
        map_brand,
    )
    for fit_step in fit_steps:
        fit_step(pd.concat([X_train, y_train], axis=1))

    # Preprocess train and test data (separately!).
    X_train = process_df_for_model_3(X_train)
    X_test = process_df_for_model_3(X_test)

    # Encode categorical features.
    X_train, X_test = encode_train_test(X_train, X_test)

    return X_train, X_test, y_train, y_test



def prepare_data_for_testing(df_train, df_test):
    """Preprocess and encode a full training set and a separate test set.

    Side effect: a ``log_price`` column (natural log of ``price``) is added
    to the caller's ``df_train`` in place.

    Args:
        df_train: Raw training data containing a ``price`` column.
        df_test: Raw test data.

    Returns:
        Tuple ``(X_train, df_test, y_train)``: encoded training features, the
        encoded test frame, and the training ``log_price`` target.
    """
    target = "log_price"

    # Add the log_price target to the training data.
    df_train[target] = np.log(df_train["price"])

    # Run once, on the whole training data.  The return values are discarded,
    # so these helpers are presumably called for their side effects -- confirm.
    # All of them receive the same df_train object, in this order.
    fit_steps = (
        preprocess_country_region_of_manufacture,
        write_all_seen_brands_to_file,
        write_brands_mean_prices_to_file,
        map_brand,
    )
    for fit_step in fit_steps:
        fit_step(df_train)

    # Preprocess train and test data (separately!).
    processed_train = process_df_for_model_3(df_train)
    processed_test = process_df_for_model_3(df_test)

    # NOTE(review): encode_train_test stacks both frames, so if the test frame
    # has no log_price column going in, the returned test frame presumably
    # carries an all-NaN log_price column -- confirm that callers drop it.
    encoded_train, encoded_test = encode_train_test(processed_train, processed_test)

    y_train = encoded_train[target]
    X_train = encoded_train.drop(columns=[target])
    return X_train, encoded_test, y_train