In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns

import re

# Select seaborn's 'whitegrid' plot style (a global seaborn setting).
sns.set_style('whitegrid')
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

In [2]:
from imblearn.over_sampling import SMOTE

In [3]:
# Read the train and test splits from the local competition data directory.
df_train, df_test = map(
    pd.read_csv,
    ("data/competition/train.csv", "data/competition/test.csv"),
)

# Stack both splits row-wise (train rows first) into a single frame.
# NOTE(review): ignore_index is not set, so each split keeps its own row
# labels and the combined index contains repeated labels.
df_combine = pd.concat((df_train, df_test))

In [4]:
# Print a summary of the combined frame: index range, per-column non-null counts and dtypes.
df_combine.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallCond  

In [5]:
# Descriptive statistics of the combined frame, transposed so that each
# described column becomes one row of the displayed table.
df_combine.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,2919.0,1460.0,842.787043,1.0,730.5,1460.0,2189.5,2919.0
MSSubClass,2919.0,57.137718,42.517628,20.0,20.0,50.0,70.0,190.0
LotFrontage,2433.0,69.305795,23.344905,21.0,59.0,68.0,80.0,313.0
LotArea,2919.0,10168.11408,7886.996359,1300.0,7478.0,9453.0,11570.0,215245.0
OverallQual,2919.0,6.089072,1.409947,1.0,5.0,6.0,7.0,10.0
OverallCond,2919.0,5.564577,1.113131,1.0,5.0,5.0,6.0,9.0
YearBuilt,2919.0,1971.312778,30.291442,1872.0,1953.5,1973.0,2001.0,2010.0
YearRemodAdd,2919.0,1984.264474,20.894344,1950.0,1965.0,1993.0,2004.0,2010.0
MasVnrArea,2896.0,102.201312,179.334253,0.0,0.0,0.0,164.0,1600.0
BsmtFinSF1,2918.0,441.423235,455.610826,0.0,0.0,368.5,733.0,5644.0


In [6]:
# Row count of the combined frame; used as the denominator below
total_values = df_combine.shape[0]

# Number of missing entries in every column
null_count = df_combine.isna().sum()

# Share of missing entries per column, expressed in percent
null_percentage = null_count.div(total_values).mul(100)

# Keep only the columns that have at least one missing entry and display
# their percentages rounded to two decimals
filtered_null_percentage = null_percentage.loc[null_percentage.gt(0)]
filtered_null_percentage.round(2)

MSZoning         0.14
LotFrontage     16.65
Alley           93.22
Utilities        0.07
Exterior1st      0.03
Exterior2nd      0.03
MasVnrType      60.50
MasVnrArea       0.79
BsmtQual         2.77
BsmtCond         2.81
BsmtExposure     2.81
BsmtFinType1     2.71
BsmtFinSF1       0.03
BsmtFinType2     2.74
BsmtFinSF2       0.03
BsmtUnfSF        0.03
TotalBsmtSF      0.03
Electrical       0.03
BsmtFullBath     0.07
BsmtHalfBath     0.07
KitchenQual      0.03
Functional       0.07
FireplaceQu     48.65
GarageType       5.38
GarageYrBlt      5.45
GarageFinish     5.45
GarageCars       0.03
GarageArea       0.03
GarageQual       5.45
GarageCond       5.45
PoolQC          99.66
Fence           80.44
MiscFeature     96.40
SaleType         0.03
SalePrice       49.98
dtype: float64

In [7]:
# Narrow the view to columns with more than 40% missing entries, then display
# the rounded percentages together with the bare list of column names.
# NOTE(review): SalePrice passes this filter as well — presumably only because
# the rows coming from the test split carry no SalePrice; confirm before using
# this list to decide which columns to drop.
filtered_null_percentage = null_percentage.loc[null_percentage.gt(40)]
(filtered_null_percentage.round(2), list(filtered_null_percentage.index))

(Alley          93.22
 MasVnrType     60.50
 FireplaceQu    48.65
 PoolQC         99.66
 Fence          80.44
 MiscFeature    96.40
 SalePrice      49.98
 dtype: float64,
 ['Alley',
  'MasVnrType',
  'FireplaceQu',
  'PoolQC',
  'Fence',
  'MiscFeature',
  'SalePrice'])

In [8]:
# Split the columns of the combined frame into a numeric and a non-numeric group.

# Frame restricted to the numeric columns. 'number' is the selector pandas
# documents for picking all numeric dtypes; the narrower ['int', 'float'] pair
# can miss numeric columns stored with another width or kind.
numeric_columns = df_combine.select_dtypes(include='number')

# Names of the numeric columns, in their original column order
numeric_column_names = numeric_columns.columns.tolist()

# Names of every remaining column, in their original column order. Using the
# complementary selector keeps the two lists an exact partition of the columns.
non_numeric_column_names = df_combine.select_dtypes(exclude='number').columns.tolist()

In [9]:
# filtered_values = null_percentage[(null_percentage > 0) & (null_percentage < 40)]
# filtered_values

In [10]:
import plotly.graph_objects as go

# Numeric columns only; 'Id' and 'SalePrice' are dropped when they are present
df_numeric = df_combine[numeric_column_names].drop(
    columns=['Id', 'SalePrice'],
    errors='ignore',
)

# One box trace per remaining column, all placed on the same figure
fig = go.Figure(
    data=[
        go.Box(y=df_numeric[name], name=name)
        for name in df_numeric.columns
    ]
)

# Title and axis labels, automatic sizing, and margins reduced to zero except
# for a 40-unit strip at the top
fig.update_layout(
    title='Boxplot of Numeric Variables',
    xaxis_title='Variables',
    yaxis_title='Values',
    autosize=True,
    margin={'l': 0, 'r': 0, 't': 40, 'b': 0},
)

# Render the figure
fig.show()