In [1]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!python3 -m pip install --upgrade nni
#!pip3 install autofeat
# !pip3 install multimodal-transformers

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import nni
from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector
from sklearn.model_selection import train_test_split
#from autofeat import AutoFeatRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error
sns.set_theme(style="darkgrid")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


In [1]:
# Load the training and test tables, using the 'Id' column as the row index.
df = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv', index_col='Id')
df_test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv', index_col='Id')
# Summary statistics of the training frame (rendered as the cell output).
df.describe()

In [1]:
# Preview the first rows of the training frame.
df.head()

In [1]:
def get_cat_num_cols(df : pd.DataFrame, verbose: bool = True) -> tuple:
    """Split the columns of a dataframe into categorical and numerical groups.

    A column is treated as categorical when its dtype is object, string or
    pandas ``category``; every other column is treated as numerical. Both
    lists keep the column order of ``df``.

    Parameters
    ----------
        df : pd.DataFrame
            input dataframe

        verbose : bool
            when True, print how many columns fell into each group.
    Returns
    -------
        tuple : (cat_cols, num_cols)
            two lists of column labels.
    """

    cat_cols, num_cols = [], []
    # A single pass over the dtypes avoids the quadratic `col not in cat_cols`
    # scan and does not rely on `df[col]` returning a Series.
    for col, dtype in df.dtypes.items():
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_categorical:
            cat_cols.append(col)
        else:
            num_cols.append(col)
    if verbose:
        print(f"categorical columns: {len(cat_cols)}, numerical_cols: {len(num_cols)}")
    return cat_cols, num_cols

In [1]:
## separate target column from features
# Guarded so that re-running this cell after 'SalePrice' has already been removed
# neither fails nor loses the previously extracted target.
if 'SalePrice' in df.columns:
    y = df['SalePrice']
    df = df.drop(columns=['SalePrice'])
cat_cols, num_cols = get_cat_num_cols(df)

#### Handling missing values

In [1]:
## Drop columns in which more than half of the rows are missing
MISSING_FRACTION_LIMIT = 0.50  # a column is dropped when its missing share exceeds this
missing_cols_info = df.isnull().sum()
bad_cols = missing_cols_info[missing_cols_info > MISSING_FRACTION_LIMIT * len(df)].index.tolist()
print(bad_cols)

## drop bad columns
cleaned_df = df.drop(columns=bad_cols)
cat_cols, num_cols = get_cat_num_cols(cleaned_df)

# The same columns are removed from the test frame. Rebinding with
# errors="ignore" lets this cell be re-run without a KeyError.
df_test = df_test.drop(columns=bad_cols, errors="ignore")

In [1]:
## remove rows with all missing values
cleaned_df.dropna(how = "all", inplace = True)
# Keep the target aligned with the surviving training rows; otherwise a dropped
# row would leave `y` longer than the feature frame when the data is split.
y = y.loc[cleaned_df.index]
# NOTE(review): a test row removed here gets no prediction in the submission file;
# confirm that dropping test rows is intended.
df_test.dropna(how = "all", inplace = True)
print(len(cleaned_df), len(df_test))

In [1]:
# function for comparing different approaches
def score_dataset(X_train : np.ndarray, X_valid : np.ndarray, y_train : np.ndarray, y_valid : np.ndarray):
    """Fit a random forest on the training data and score it on the validation data.

    Used to compare different feature engineering approaches with a fixed model
    (100 trees, random_state=0).

    Parameters
    ----------
        X_train : np.ndarray
            training features
        X_valid : np.ndarray
            validation features
        y_train : np.ndarray
            target training values
        y_valid : np.ndarray
            target validation values

    Returns
    -------
        tuple : (rmse, r2)
            root mean squared error of the validation predictions and the
            model's R^2 score on the validation data.
    """
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    # Take the square root explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    r2 = model.score(X_valid, y_valid)
    return rmse, r2

### Select Best Numerical Features

In [1]:
## separate numerical cols
# Take an explicit copy so that later assignments into num_df do not trigger
# pandas' chained-assignment (SettingWithCopy) warning.
num_df = cleaned_df[num_cols].copy()
print(len(num_df.columns))

#### Filling missing values in numerical columns with mean values

In [1]:
print(f"# missing values before: {num_df.isnull().sum().values.sum()}")
# Rebind the filled frame instead of assigning into a slice of cleaned_df, which
# avoids pandas' chained-assignment warning. num_df only holds num_cols, so
# filling the whole frame with its column means is equivalent.
# NOTE(review): the means are computed before the train/validation split, so the
# validation rows contribute to the fill values.
num_df = num_df.fillna(num_df.mean())
print(f"# missing values after: {num_df.isnull().sum().values.sum()}")

In [1]:
## prepare training and validation data
# Hold out 20% of the rows for validation. The same test_size and random_state are
# reused for the categorical split later in the notebook, and the numeric and
# categorical feature frames are concatenated column-wise afterwards, so both
# calls need to keep identical arguments.
X_train, X_valid, y_train, y_valid = train_test_split(num_df, y, test_size = 0.2, random_state = 37)
print(X_train.shape, X_valid.shape)

In [1]:
## Standardizing numerical features
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_norm_train = scaler.transform(X_train)
X_norm_val = scaler.transform(X_valid)
print(X_norm_train.shape, X_norm_val.shape)

##### Use NNI to select top 20 features

In [1]:
# n_features sets how many features are kept; change it according to your choice
num_fgs = FeatureGradientSelector(n_epochs=1, n_features=20)
# fit data
num_fgs.fit(X_norm_train, y_train)
# get important features: the selector returns the positions of the selected columns
feature_idx = num_fgs.get_selected_features()
print(f"Feature indices: {feature_idx}")

# Rebuild labelled frames that contain only the selected columns and carry the
# row index of the split they came from.
selected_num_cols = np.array(num_cols)[feature_idx]
num_X_final_train = pd.DataFrame(
    X_norm_train[:, feature_idx], columns=selected_num_cols, index=X_train.index
)
num_X_final_val = pd.DataFrame(
    X_norm_val[:, feature_idx], columns=selected_num_cols, index=X_valid.index
)
print(num_X_final_train.shape, num_X_final_val.shape)

## Compare 2 Models
* One with all numerical features
* Second with top-20 selected features

In [1]:
# score_dataset returns the root mean squared error (not the MSE) and the R^2
# score, so the values are named and labelled as RMSE.
rmse1, r21 = score_dataset(X_norm_train, X_norm_val, y_train, y_valid)
rmse2, r22 = score_dataset(num_X_final_train, num_X_final_val, y_train, y_valid)

print(f"RMSE \nWith All Numerical Features: {rmse1 :.2f} \
        With Top-20 Features: {rmse2 :.2f}")

print(f"R2 \nWith All Numerical Features: {r21 :.2f} \
        With Top-20 Features: {r22 :.2f}")

In [1]:
## apply changes to test data
# Impute the test frame with the column means of the training frame, so the test
# data gets the same fill values as the training data, in line with the scaler,
# which is also fitted on training data only. num_df was filled with its own
# column means, which leaves those means unchanged.
train_num_means = num_df[num_cols].mean()
df_test[num_cols] = df_test[num_cols].fillna(train_num_means)
df_test_num = df_test[num_cols]
final_test_num = scaler.transform(df_test_num)
# keep only the features chosen by the selector, in the same order as for training
final_test_num = pd.DataFrame(final_test_num[:, feature_idx], columns = np.array(num_cols)[feature_idx])
final_test_num.index = df_test_num.index
final_test_num.shape

In [1]:
# Display the scaled, feature-selected numeric test frame.
final_test_num

### Selecting Categorical Features

In [1]:
# Keep only the categorical columns of the cleaned training frame.
cat_df = cleaned_df[cat_cols]
cat_df.shape

In [1]:
def cat_cols_eda(df : pd.DataFrame, cat_cols):
    """Print the value counts of each listed categorical column.

    Parameters
    ----------
        df : pd.DataFrame
            input dataframe
        cat_cols : iterable
            labels of the columns to report on
    """

    for name in cat_cols:
        header = f"{name}: Unique Values"
        print(header)
        frequencies = df[name].value_counts()
        print(frequencies)
        print("\n")

In [1]:
# Print the value counts of every categorical column.
cat_cols_eda(cat_df, cat_cols)

#### Filling missing values in categorical columns with the most frequent value (mode)

In [1]:
## Categorical Columns
# Report only the categorical columns that contain at least one missing value.
missing_per_col = cat_df[cat_cols].isnull().sum()
for col, missing_values in missing_per_col[missing_per_col > 0].items():
    print(f"{col} : {missing_values}")

In [1]:
## Categorical Columns
# Fill each column's missing entries with the first row of DataFrame.mode(),
# i.e. one most-frequent value per column.
cat_df = cat_df.fillna(cat_df.mode().iloc[0])
# Total number of missing values left (displayed as the cell output).
cat_df.isnull().sum().values.sum()

In [1]:
# Split the categorical frame with the same test_size and random_state as the
# numeric split. Note that this rebinds X_train / X_valid / y_train / y_valid,
# which previously held the numeric split.
X_train, X_valid, y_train, y_valid = train_test_split(cat_df, y, test_size = 0.2, random_state = 37)
print(X_train.shape, X_valid.shape)

In [1]:
## find high cardinality categorical columns
# Columns with more than 10 distinct values are treated as high cardinality.
high_card_cols = [col for col in cat_cols if cat_df[col].nunique() > 10]

# Build the complement with a list comprehension instead of a set difference:
# set iteration order for strings can change between kernel restarts, which would
# change the order of the one-hot encoded features from run to run.
high_card_set = set(high_card_cols)
low_card_cols = [col for col in cat_cols if col not in high_card_set]

print(f"High Cardinality Columns: {high_card_cols}")
print(f"Low Cardinality Columns: {low_card_cols}")

In [1]:
# Work on copies so the encoded values do not overwrite X_train / X_valid.
encoded_X_train, encoded_X_valid = X_train.copy(), X_valid.copy()
print(encoded_X_train.shape, encoded_X_valid.shape)

In [1]:
## Apply Ordinal Encoder to high cardinality columns
# Categories not seen during fit are encoded as -1 at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
# The encoder is fitted on the training split only and reused for validation.
encoded_X_train[high_card_cols] = ordinal_encoder.fit_transform(X_train[high_card_cols])
encoded_X_valid[high_card_cols] = ordinal_encoder.transform(X_valid[high_card_cols])
print(encoded_X_train.shape, encoded_X_valid.shape)

In [1]:
# Remove the low-cardinality columns from the ordinal-encoded frames.
encoded_X_train = encoded_X_train.drop(columns=low_card_cols)
encoded_X_valid = encoded_X_valid.drop(columns=low_card_cols)

In [1]:
## Apply One Hot Encoding to low cardinality columns
# scikit-learn renamed the `sparse` argument to `sparse_output` and later removed
# the old name. Try the new keyword first and fall back for older releases, so a
# dense array is returned either way.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only; categories unseen at transform time are ignored.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_card_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_card_cols]))

# Building a DataFrame from the encoded array resets the row labels; restore them.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
print(OH_cols_train.shape, OH_cols_valid.shape)

In [1]:
## concat One Hot and Ordinal Encoded Dataframes
# Column-wise concat; pandas aligns the two frames on their row index, which was
# set to the index of the corresponding split when the one-hot frames were built.
final_cat_X_train = pd.concat([encoded_X_train, OH_cols_train], axis = 1)
final_cat_X_valid = pd.concat([encoded_X_valid, OH_cols_valid], axis = 1)
print(final_cat_X_train.shape, final_cat_X_valid.shape)

In [1]:
## apply transformations to test data
# Fill missing categories with the modes of the training frame, so the test data
# is imputed with the same values as the training data rather than with
# statistics of the test set. cat_df was filled with its own modes, which leaves
# those modes unchanged.
train_cat_modes = cat_df[cat_cols].mode().iloc[0]
df_test[cat_cols] = df_test[cat_cols].fillna(train_cat_modes)

# Reuse the encoders fitted on the training split.
encoded_df_test = df_test[cat_cols].copy()
encoded_df_test[high_card_cols] = ordinal_encoder.transform(df_test[high_card_cols])
encoded_df_test = encoded_df_test.drop(columns=low_card_cols)
OH_df_test_cat = pd.DataFrame(OH_encoder.transform(df_test[low_card_cols]))
OH_df_test_cat.index = df_test.index

final_cat_test = pd.concat([encoded_df_test, OH_df_test_cat], axis = 1)
print(final_cat_test.shape)

### Combine final numeric and categorical datasets

In [1]:
## combine training data
final_X_train = pd.concat([num_X_final_train, final_cat_X_train], axis = 1)

## combine validation data
final_X_val = pd.concat([num_X_final_val, final_cat_X_valid], axis = 1)

## combine test data
final_test_data = pd.concat([final_test_num, final_cat_test], axis = 1)

# The one-hot frames were built without column names, so they carry integer
# labels while every other column has a string label. Recent scikit-learn
# releases refuse frames whose column labels mix types, so cast all to str.
for frame in (final_X_train, final_X_val, final_test_data):
    frame.columns = frame.columns.astype(str)

# The model is fitted on the training frame and applied to the other two, so all
# three must expose the same features in the same order.
assert list(final_X_train.columns) == list(final_X_val.columns) == list(final_test_data.columns)

print(final_X_train.shape, final_X_val.shape, final_test_data.shape)

## Train the final model

In [1]:
from sklearn.model_selection import cross_val_score
# Fit the final random forest on the combined training features and evaluate it
# on the combined validation features.
model = RandomForestRegressor(n_estimators=500, random_state=0)
model.fit(final_X_train, y_train)
preds = model.predict(final_X_val)
# Root mean squared error, computed with an explicit square root instead of
# `squared=False`, which newer scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_valid, preds))
r2 = model.score(final_X_val, y_valid)

rmse, r2

### getting predictions for test data

In [1]:
# Predict sale prices for the prepared test features.
test_preds = model.predict(final_test_data)
# Save test predictions to file
output = pd.DataFrame({'Id': df_test.index,
                       'SalePrice': test_preds})
# Written to the current working directory, without the DataFrame's own index.
output.to_csv('submission.csv', index=False)