# EDA, Handling Categorical Values and Handling Missing Values

## Imports & Load Data:

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Project root directory. Defaults to the author's local checkout; set the
# HOUSE_PRICES_PROJECT_DIR environment variable to run the notebook from another machine.
PROJECT_DIR = os.environ.get(
    "HOUSE_PRICES_PROJECT_DIR",
    "/media/roovedot/common/VSrootWorkspace/House-Price-Predictions-with-Random-Forest-Regression-Model",
)

# The relative "data/..." paths used in this notebook are resolved from the project root
os.chdir(PROJECT_DIR)

# Load the training data
housing = pd.read_csv("data/train.csv")
housing.head()  # Preview the data to check that it loaded correctly


## Getting to know the Data:


### Basic Info

In [None]:

# Print a summary of the loaded DataFrame (columns, non-null counts and dtypes)
housing.info()

### Distributions for every Column

In [None]:
# Draw one histogram per numeric (int64/float64) column.
for col in housing.select_dtypes(include=['int64', 'float64']).columns:
    # Create the figure and axes explicitly so the requested size is actually used.
    # sns.displot is a figure-level function that always builds its own figure, so
    # pairing it with plt.figure() leaves an extra, empty figure on every iteration.
    fig, ax = plt.subplots(figsize=(10, 6))

    # Axes-level equivalent of displot's default (histogram) kind
    sns.histplot(housing[col], ax=ax)

    # Set plot title and labels
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

    # Display the plot, then release it so figures do not pile up in memory
    plt.show()
    plt.close(fig)

## Initial Correlations:

In [None]:
# Restrict the frame to its float64/int64 columns
hous_num = housing.select_dtypes(include=['float64', 'int64'])

# Correlation of every numeric feature with 'SalePrice', sorted from highest to lowest
hous_num_corr = (
    hous_num.corr()
    .loc[:, 'SalePrice']
    .sort_values(ascending=False)
)

# One "<feature>: <correlation>" line per feature, with the value formatted to 3 decimals
for var, corr in hous_num_corr.items():
    print(var, format(corr, '.3f'), sep=': ')

## Categorical Values:

### Encoding Categorical Values (with One-hot Encoding)

In [None]:
# Categorical features are the columns stored with the 'object' dtype
house_cat = housing.select_dtypes(include=['object'])

# One-hot encode them (dummy_na=True adds a column flagging missing values for each feature)
# and attach the 'SalePrice' target as the last column, aligned on the row index
house_cat_encoded = pd.get_dummies(house_cat, dummy_na=True).assign(
    SalePrice=housing["SalePrice"]
)

# Correlation of every encoded category with the target, sorted from highest to lowest
house_cat_encoded_corr = (
    house_cat_encoded.corr()
    .loc[:, 'SalePrice']
    .sort_values(ascending=False)
)

# One "<category>: <correlation>" line per column, with the value formatted to 3 decimals
for var, corr in house_cat_encoded_corr.items():
    print(var, format(corr, '.3f'), sep=': ')


In [None]:
# Show the names of all encoded columns as a plain Python list
list(house_cat_encoded.columns)

### Keeping only Valuable Categories:

In [None]:
# Minimum absolute correlation with 'SalePrice' an encoded category needs in order to be kept
MIN_ABS_CORR = 0.4

# Categories whose correlation reaches the threshold.
# NaN correlations compare as False here, so they are discarded as well.
strong_corr = house_cat_encoded_corr[house_cat_encoded_corr.abs() >= MIN_ABS_CORR]

# Select the columns to keep instead of dropping the others in place: the cell can then be
# re-run without raising KeyError on columns that were already removed.
# 'SalePrice' is left out because it must not be duplicated when merging with the original DataFrame.
# Iterating over house_cat_encoded.columns preserves the original column order.
cat_cols_to_keep = [
    col
    for col in house_cat_encoded.columns
    if col in strong_corr.index and col != 'SalePrice'
]

# Convert the kept indicator columns to integer type for compatibility
house_cat_encoded = house_cat_encoded[cat_cols_to_keep].astype(int)

house_cat_encoded.info()

### Merge Encoded Data:

In [None]:
# Replace the original categorical columns with the selected encoded ones.
# The raw categorical columns are dropped to avoid redundancy; encoded columns left over from
# a previous run of this cell are dropped too, so re-running the cell neither duplicates
# columns nor fails. errors="ignore" skips any of those columns that are not present.
housing = pd.concat(
    [
        housing.drop(
            columns=[*house_cat.columns, *house_cat_encoded.columns],
            errors="ignore",
        ),
        house_cat_encoded,
    ],
    axis=1,  # Append the encoded data as columns, not rows
)

housing.info()

## Missing Values:

Most of the missing values were in categorical columns, which have already been handled by the one-hot encoding. The missing-value indicators even turned out to carry useful information: for example, FireplaceQu_nan has a correlation of -0.472 with SalePrice.

In [None]:
# 20 columns with the most missing values
# (head() without an argument would only return the first 5 rows)
housing.isnull().sum().sort_values(ascending=False).head(20)

The missing values in LotFrontage, GarageYrBlt and MasVnrArea probably mean that the house does not have these features.  

Since I think the fact that these values are missing is valuable information in itself, I will create a 0/1 indicator column for each feature marking whether its value was missing. The missing values themselves will be set to 0, except for GarageYrBlt, where they will be replaced with the column mean.

### Handling Missing Values:

In [None]:
# This cell is safe to re-run: an indicator column is only created the first time,
# while the feature still holds its original missing values.

housing_na = housing[["LotFrontage", "GarageYrBlt", "MasVnrArea"]]

for column in housing_na.columns:
    # Create a 0/1 column indicating whether the value was missing.
    # Skipped when it already exists, otherwise a second run would overwrite it with zeros.
    indicator = f'{column}_nan'
    if indicator not in housing.columns:
        housing[indicator] = housing[column].isnull().astype(int)

    # GarageYrBlt is imputed with the column mean; the other features are imputed with 0
    fill_value = housing[column].mean() if column == "GarageYrBlt" else 0

    # Assign the result back instead of calling fillna(inplace=True) on housing[column]:
    # the chained in-place form is deprecated and does not modify `housing` under
    # pandas Copy-on-Write.
    housing[column] = housing[column].fillna(fill_value)

housing.info()


## Updated Correlations:

In [None]:
housing.info() # Check that the data was properly transformed before recomputing the correlations

In [None]:
# Correlation of every column of the transformed frame with 'SalePrice',
# sorted from highest to lowest
housing_corr = (
    housing.corr()
    .loc[:, 'SalePrice']
    .sort_values(ascending=False)
)

# One "<feature>: <correlation>" line per column, with the value formatted to 3 decimals
for var, corr in housing_corr.items():
    print(var, format(corr, '.3f'), sep=': ')

## Saving File for the Next Step of Cleaning: 

In [8]:
# Write the transformed frame to disk for the next cleaning step.
# index=True is the pandas default, spelled out here: the row index is written as the first column.
housing.to_csv('data/train_catH_naH.csv', index=True)