In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# /kaggle/input/house-prices-advanced-regression-techniques/train.csv

In [None]:
# Read the training split of the Kaggle house-prices competition and preview it.
TRAIN_CSV_PATH = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_train.head()


In [None]:
# Find missing data
# Delete columns where missing data > 50%
# Replace missing values with the median, 0 or "NA" for the other columns
# Check categorical variables
# Transform them: convert to string, add ordinal encoding, add dummy variables / one-hot encoding
# Check skewness for numerical variables
# Remove outliers


In [None]:
# Number of missing entries per column, largest first.
null_counts = df_train.isnull().sum()
total = null_counts.sort_values(ascending=False)

# Keep only the columns that actually have gaps and show them.
missing_cols = total.loc[total > 0]
print(missing_cols)

In [None]:
# Drop every column that has more than 600 missing entries.
# NOTE(review): the plan cell talks about ">50%" missing, while 600 is an
# absolute count -- confirm it corresponds to the intended share of rows.
too_sparse = total > 600
null_cols = total.loc[too_sparse].index
df_train = df_train.drop(columns=null_cols)



In [None]:
# Recount the gaps now that the sparsest columns are gone.
null_counts = df_train.isnull().sum()
total = null_counts.sort_values(ascending=False)

# Columns that still need imputation.
missing = total.loc[total > 0]
print(missing)

In [None]:
# Peek at the columns that still contain missing values.
df_train.loc[:, missing.index].head()

In [None]:
# Handle the remaining missing values in the garage / basement descriptor
# columns by replacing them with the literal string "NA".
# NOTE(review): presumably a gap here means the house has no garage/basement --
# verify against the dataset's data description.
na_fill_cols = [
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
]

for col in na_fill_cols:
    df_train.loc[:, col] = df_train.loc[:, col].fillna("NA")


In [None]:

# Numeric columns whose missing entries are imputed with the column median.
num_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Assign the filled column back to the frame. Calling
# `df_train[col].fillna(..., inplace=True)` operates on an intermediate object
# selected from the frame; under pandas Copy-on-Write that never updates
# `df_train`, and the warning about it is hidden by the global
# `warnings.filterwarnings('ignore')`.
for col in num_cols:
    df_train[col] = df_train[col].fillna(df_train[col].median())
    

In [None]:
# Discard the rows that have no value for Electrical (row labels are kept as-is).
df_train = df_train.dropna(subset=['Electrical'])


In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_frame = df_train.select_dtypes(include=['object'])
numerical_frame = df_train.select_dtypes(exclude=['object'])

cat_cols = categorical_frame.columns.to_list()
num_cols = numerical_frame.columns.to_list()

In [None]:

from scipy.stats import skew

# Sample skewness of every numerical column, printed from most negative to
# most positive.
skewness = df_train[num_cols].apply(skew)
skewness_sorted = skewness.sort_values()
print(skewness_sorted)

In [None]:
# Columns whose absolute skewness exceeds this value are treated as skewed.
threshold = 0.75

abs_skewness = skewness.abs()
skewed_cols = abs_skewness[abs_skewness > threshold].index.to_list()
non_skewed_cols = abs_skewness[abs_skewness <= threshold].index.to_list()

# Separate frames for the two groups of columns.
df_skewed = df_train[skewed_cols]
df_non_skewed = df_train[non_skewed_cols]


In [None]:
# Summary statistics for the skewed columns, one row per column.
df_skewed.describe().transpose()

In [None]:
# log1p (= log(1 + x)) of every skewed column, kept in a separate frame so the
# raw values stay available for comparison.
df_skewed_transformed = df_skewed.apply(np.log1p)

### Analyzing skewed columns: 
We need to check how applying a log transform to all skewed columns affects the data. We will therefore compare the skewness before and after the transformation, take a closer look at the data, and only apply the transformation where necessary.

In [None]:

# Skewness of each skewed column before and after log1p, both ordered by
# column name so that the two series line up.
skewness_2 = df_skewed.apply(skew).sort_index()
skewness_3 = df_skewed_transformed.apply(skew).sort_index()

# Side-by-side table of the two measurements.
before_after = {
    'Skewness_Before': skewness_2,
    'Skewness_After': skewness_3,
}
skew_df = pd.DataFrame(before_after)

# Show it rounded to 3 decimals, most skewed raw column first.
skew_report = skew_df.round(3).sort_values(by='Skewness_Before', ascending=False)
print(skew_report)


#### *******1. MiscVal*******

In [None]:
original_column = df_skewed['MiscVal']
transformed_column = df_skewed_transformed['MiscVal']

# One histogram (with KDE overlay) per version of the column, each drawn as
# its own figure.
for values, title in (
    (original_column, "Before Transformation"),
    (transformed_column, "After log1p Transformation"),
):
    sns.histplot(values, kde=True)
    plt.title(title)
    plt.show()

In [None]:
# How many rows carry a positive MiscVal, and how strongly does the column
# correlate with the sale price?
misc_val = df_train['MiscVal']
misc_nonzero_rows = misc_val.gt(0).sum()
misc_target_corr = misc_val.corr(df_train['SalePrice'])

print("Non-zero MiscVal rows:", misc_nonzero_rows)
print("Correlation with target:", misc_target_corr)


MiscVal captures the dollar value of a miscellaneous feature (such as an elevator, second garage, tennis court or shed) if one is present in the house; otherwise it is 0. 
In our dataset only 52 observations have a positive MiscVal, and its correlation with the house price is very low, so it is safe to drop the column completely, since the presence of such a feature is already captured in the MiscFeature column. 

In [None]:
# Dropping MiscVal.
# Reassign instead of dropping in place, and tolerate the column already being
# gone, so that re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns=['MiscVal'], errors='ignore')


#### *******2. PoolArea*******

In [None]:
# How many rows have a positive PoolArea, and how strongly does the column
# correlate with the sale price?
pool_area = df_train['PoolArea']
pool_nonzero_rows = pool_area.gt(0).sum()
pool_target_corr = pool_area.corr(df_train['SalePrice'])

print("Non-zero PoolArea rows:", pool_nonzero_rows)
print("Correlation with target:", pool_target_corr)


In [None]:
# Dropping PoolArea.
# Reassign instead of dropping in place, and tolerate the column already being
# gone, so that re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns=['PoolArea'], errors='ignore')


#### *******3. LotArea*******

In [None]:
original_column = df_skewed['LotArea']
transformed_column = df_skewed_transformed['LotArea']

# Raw distribution first, then the log1p version; each gets its own figure
# with a KDE overlay.
panels = [
    ("Before Transformation", original_column),
    ("After log1p Transformation", transformed_column),
]
for title, values in panels:
    sns.histplot(values, kde=True)
    plt.title(title)
    plt.show()

Apply log transformation to Lot Area as it reduces skewness

### Check other variables

In [None]:
# Variables with reduced skewness but still high skewness after transformation
list = ['3SsnPorch','LowQualFinSF','KitchenAbvGr','BsmtFinSF2','ScreenPorch',
        'BsmtHalfBath','EnclosedPorch','BsmtFinSF1','TotalBsmtSF','BsmtUnfSF'] 
            
for i in list:
    print("Non-zero ", i, " rows:", (df_train[i] > 0).sum())
    print("Correlation with target:", df_train[i].corr(df_train['SalePrice']))


### Appply Box Cox

In [None]:
# Applying Box to all features where log transfirmation reduces skewness slightly 
from scipy.stats import boxcox
boxcox_data = {}
boxcox_lambdas = {}

for col in list:
    
    if (df_train[col] <= 0).any():
        shift = abs(df_train[col].min()) + 1
        transformed_col, fitted_lambda = boxcox(df_train[col] + shift)
    else:
        shift = 0
        transformed_col, fitted_lambda = boxcox(df_train[col])
    boxcox_data[col] = transformed_col
    boxcox_lambdas[col] = {'lambda': fitted_lambda, 'shift': shift}




In [None]:
# Frame of the Box-Cox transformed columns and the skewness of each of them.
df_boxcox = pd.DataFrame(boxcox_data)
skewness_4 = df_boxcox.apply(skew).sort_index()

# Raw vs. log1p vs. Box-Cox skewness in one table, aligned on column name.
skew_columns = {
    'Skewness_raw': skewness_2,
    'Skewness_After_log': skewness_3,
    'Skewness_After_boxcox': skewness_4,
}
skew_df = pd.DataFrame(skew_columns)

# Rounded to 3 decimals, most skewed raw column first.
skew_comparison = skew_df.round(3).sort_values(by='Skewness_raw', ascending=False)
print(skew_comparison)


Based on the above we will apply **Box Cox** to these 3 columns as skew is lower vs the log transformation: 
1. **BsmtFinSF1**
2. **TotalBsmtSF**
3. **BsmtUnfSF**

In [None]:
# Rows of the comparison table whose absolute skewness after log1p is below 0.75.
log_skew_abs = skew_df['Skewness_After_log'].abs()
x = skew_df[log_skew_abs < 0.75]


In [None]:
# Rows that are still skewed (|skew| > 0.75) after both log1p and Box-Cox,
# restricted to those whose |log1p skew| is below 10.
log_skew_abs = skew_df['Skewness_After_log'].abs()
boxcox_skew_abs = skew_df['Skewness_After_boxcox'].abs()

still_skewed = log_skew_abs.gt(0.75) & log_skew_abs.lt(10) & boxcox_skew_abs.gt(0.75)
y = skew_df[still_skewed]
y

In [None]:
# Apply final transformations and check for skewness
# NOTE: this cell overwrites df_train, so it is meant to be run once per
# fresh kernel.

# --- Remove outliers ---
# This has to happen BEFORE the log transform below: the 4000 / 300000
# thresholds are in raw units, and once GrLivArea and SalePrice hold log1p
# values no row can satisfy `GrLivArea > 4000`, so the filter would silently
# remove nothing.
outlier_mask = (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)
df_train = df_train.loc[~outlier_mask]

# --- Drop highly skewed, sparse columns ---
# (MiscVal and PoolArea were already dropped in earlier cells.)
drop_cols = ['3SsnPorch', 'LowQualFinSF']
df_train = df_train.drop(columns=drop_cols)

# --- Log transform columns ---
log_cols = ['1stFlrSF', '2ndFlrSF', 'GrLivArea', 'LotArea', 'MSSubClass',
            'MasVnrArea', 'OpenPorchSF', 'SalePrice', 'WoodDeckSF',
            'BsmtFinSF2', 'BsmtHalfBath', 'EnclosedPorch', 'ScreenPorch']
for col in log_cols:
    df_train[col] = np.log1p(df_train[col])  # log1p = log(x + 1)

# --- Box-Cox transformation ---
# The fitted lambdas are kept so the same transform can be reproduced later
# instead of being thrown away.
boxcox_cols = ['BsmtFinSF1', 'TotalBsmtSF', 'BsmtUnfSF']
final_boxcox_lambdas = {}
for col in boxcox_cols:
    # +1 shift: Box-Cox needs strictly positive input and zeros may be present.
    transformed_values, fitted_lambda = boxcox(df_train[col] + 1)
    df_train[col] = transformed_values
    final_boxcox_lambdas[col] = fitted_lambda

# --- Convert to categorical ---
df_train['KitchenAbvGr'] = df_train['KitchenAbvGr'].astype(str)


In [None]:
# Recompute the skewness of all non-object columns after the final
# transformations and print it in ascending order.
numerical_frame = df_train.select_dtypes(exclude=['object'])
num_cols = numerical_frame.columns.to_list()

skewness = df_train[num_cols].apply(skew)
print(skewness.sort_values())


### Key Takeaways: 

1. Dropped cols:
   - MiscVal,
   - PoolArea,
   - 3SsnPorch,
   - LowQualFinSF
2. Log Transformed columns :
   - 1stFlrSF,
   - 2ndFlrSF,
   - GrLivArea,
   - LotArea,
   - MSSubClass,
   - MasVnrArea,
   - OpenPorchSF,
   - SalePrice,
   - WoodDeckSF
3. Reduced but still high skewness:
   - BsmtFinSF2,
   - BsmtHalfBath,
   - EnclosedPorch,
   - ScreenPorch	
4. Other transformations: BoxCox
   - BsmtFinSF1,
   - TotalBsmtSF ,
   - BsmtUnfSF
  
5. Categorical: KitchenAbvGr
   