In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from utils import *
from visualization import *
from feature_selectors.categorical_feature_selectors import MutualInformationSelector
from feature_selectors.correlation_feature_selector import CorrelationFeatureSelector
from pathlib import Path

## Train data

In [None]:
# Training-data location: the file name, and the "data" directory resolved
# to an absolute path (relative to the current working directory).
filename = "train.csv"
root_dir = Path("data").resolve()

In [None]:
# Load the training CSV through the project helper and discard the "Id"
# column before any analysis.
df = (
    read_data_from_csv(root_dir, filename)
    .drop(columns=["Id"])
)

# Keep a separate handle on the "SalePrice" column, used as the target below.
target = df["SalePrice"]

## Numerical Features Analysis

In [None]:
# Subset df to whatever get_numeric_columns returns (presumably the labels of
# the numeric-dtype columns; the helper arrives via a star import — confirm).
# NOTE(review): a later cell drops "SalePrice" from n_df, so the target is
# presumably among these columns at this point.
n_df = df[get_numeric_columns(df)]


In [None]:
# Display summary statistics for the numeric subset.
n_df.describe()

## Impute the NaN Values of Numerical Features with the `mean` Value of the Feature Column

In [None]:
# Walk the index of whatever columns_with_nans reports (presumably one entry
# per column that has missing values — confirm) and fill each of those
# columns with its own mean through the project helper.
for col in columns_with_nans(n_df).index:
    _fill_value = n_df[col].mean()
    n_df = feature_fill_nan_with_value(n_df, col, _fill_value)

## Correlation Map of Numerical Features

In [None]:
# Heat map of the numeric columns, requesting the Spearman method from the
# project plotting helper.
correlation_heat_map(
    n_df,
    method="spearman",
    title="Numerical Feature",
)

## Scatter Plot for Numerical Features

In [None]:
# Pass the numeric frame and the target to the project scatter-plot helper
# (presumably one scatter plot per column against the target — confirm).
scatter_plots_against_target(n_df, target)

## Feature Selection of Numerical Features

In [None]:
# Replace n_df with the output of the project's correlation-based selector,
# configured with the Spearman method and a 0.2 threshold.
# NOTE(review): presumably keeps columns whose correlation with the target
# clears the threshold — confirm against CorrelationFeatureSelector.
n_df = CorrelationFeatureSelector(
    method="spearman",
    threshold=0.2,
).fit_transform(n_df, target)

In [None]:
# Same heat map as before, now drawn from the frame returned by the selector.
correlation_heat_map(
    n_df,
    method="spearman",
    title="Correlation Map of Reduced Features",
)

## Scatter Plots of Reduced Features

In [None]:
# Repeat the scatter-plot helper call on the frame returned by the selector.
scatter_plots_against_target(n_df, target)

In [None]:
# Remove the target column from the numeric feature frame.
# errors="ignore" keeps this cell idempotent: re-running it after
# "SalePrice" has already been dropped is a no-op instead of a KeyError.
n_df = n_df.drop(columns="SalePrice", errors="ignore")

## Analysis of Categorical Features

In [None]:
# Subset df to whatever get_categorical_columns returns (presumably the
# labels of the non-numeric columns — confirm against the helper).
c_df = df[get_categorical_columns(df)]

In [None]:
# Display summary statistics for the categorical subset.
c_df.describe()

In [None]:
# Display what columns_with_nans reports for the categorical frame
# (presumably per-column missing-value information — confirm).
columns_with_nans(c_df)

## Fill Missing Values of Categorical Features with 'NA' Instead of NaN

In [None]:
# Pass the literal string "NA" to the fill_nan helper and keep its result
# (presumably missing entries become an explicit "NA" category — confirm).
c_df = fill_nan(c_df, "NA")

## Boxplots of Categorical Features vs. Target

In [None]:
# Pass the categorical frame and the target to the project plotting helper
# (the heading above describes the output as box plots).
categorical_vs_target_plot(c_df, target)

## Feature Selection of Categorical Features

In [None]:
# Replace c_df with the output of the project's mutual-information selector,
# configured with mi_threshold=0.1.
# NOTE(review): presumably keeps columns whose mutual information with the
# target clears the threshold — confirm against MutualInformationSelector.
c_df = (
    MutualInformationSelector(mi_threshold=0.1)
    .fit_transform(c_df, target)
)

In [None]:
# Repeat the categorical-vs-target plot on the frame returned by the selector.
categorical_vs_target_plot(c_df, target)

## Target Variable Distribution Plot

In [None]:
# One figure with two side-by-side panels: the target on its original scale
# and after a natural-log transform.
# NOTE(review): `plt` is not imported explicitly in the import cell;
# presumably it arrives via one of the star imports — confirm.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: sale prices as recorded.
plot_target_distribution(
    target.to_numpy(), "Original Sale Price Distribution",
    xlabel="Sale Price", ylabel="Frequency", ax=axes[0],
)

# Right panel: natural log of the same values.
plot_target_distribution(
    np.log(target.to_numpy()), "Log Sale Price Distribution",
    xlabel="Log(Sale Price)", ylabel="Frequency", ax=axes[1],
)

plt.tight_layout()
plt.show()