I already created some machine learning models in this repo but I haven't say anything about what columns are important. That is a very important aspects that we need to address, when we understand what column are important we can:
+ trust the result of the model
+ take inform decision
+ in case we need to collect new data (Generally expensive), we know what variables play a key role and thus we need to collect.

So this is the purpose of this notebook and the next in this folder!

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

### How we are going to train this data

Previously we got the best model with n_estimators=369 when we use the **whole training data using cross validation**.
We could improve the model even further but instead of doing that our focus is in getting the modest important variables.

We are going to keep this hyperparameter but we are going to train a model with less data, because we are going to separate in the training set in training and validation.


In [2]:
# Read the data
# This data you can find here: https://www.kaggle.com/c/home-data-for-ml-course/data

X_full = pd.read_csv('train.csv', index_col='Id')

# SalePrice is the target, if there is no target eliminate row associated with it
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X = X_full.copy()
X.drop(['SalePrice'], axis=1, inplace=True)


The next cell just uses the same function that we saw in the "(1) Predicting_a_numerical_value"

In [3]:
def cleaning_data(X,delete_over=10,col_to_change_to_null=0.1):
    
    categorical_variables = [col for col in  X.columns if str(X[col].dtypes)=='object']
    #numerical_variables = [col for col in X.columns if str(X[col].dtypes)!='object']
    cardinalidad = {}
    for col in categorical_variables:
        cardinalidad[col] = len(list(X[col].unique()))
    # For now we delete categories with more values than..
    delete_over = delete_over
    columns_to_delete = [col for col in categorical_variables if len(list(X[col].unique()))>delete_over ]
    X.drop(columns=columns_to_delete,inplace = True, axis=1)
    
    # We are going to change columns with too many null.
    # We are not gonna delete them, will give them the chance to be important.
    # that means that having or not having the value is what is really important.
    col_to_change_to_null = col_to_change_to_null
    columnas_modificar_por_1 = [col for col in X.columns if X[col].isnull().sum()>int(X.shape[0] * col_to_change_to_null) ]

    for col in columnas_modificar_por_1:
        X[col +str('_is_null')] = 0
        X.loc[(X[col].isnull()),col +str('_is_null')] = 1

    new_columns_null = [str(f"{col}_is_null") for col in columnas_modificar_por_1 ]    
    X.drop(columns=columnas_modificar_por_1, axis=1,inplace=True)
    
    return X

In [4]:
X = cleaning_data(X)
X, X_val, y, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                random_state=0)  

In [5]:
def transforming_modeling_scoring(X,y, n_estimators=369,n_jobs=-1):
      
    numerical_col = [col for col in X.columns if str(X[col].dtypes)!='object' ]
    numerical_col_imputed = [col for col in numerical_col if X[col].isnull().any()==True]

    categorical_col = [col for col in X.columns if str(X[col].dtypes)=='object' ]
    categorical_col_imputed = [col for col in categorical_col if X[col].isnull().any()==True]

    numerical_transformer = SimpleImputer(strategy='mean')

    categorical_transformer =  Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=
        [("numerical_transformer", numerical_transformer, numerical_col_imputed),
        ("categorical_transformer", categorical_transformer, categorical_col)],remainder='passthrough')

    # Define model
    model = RandomForestRegressor(n_estimators=n_estimators,random_state=0,n_jobs=n_jobs)

    # Bundle preprocessing and modeling code in a pipeline
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)
                         ])

    model = pipe.fit(X,y)
    
    return model

In [6]:
model = transforming_modeling_scoring(X,y, n_estimators=369)

In [7]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(model, random_state=0).fit(X_val, y_val)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

ImportError: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (C:\Users\Rober\anaconda3\Lib\site-packages\sklearn\utils\metaestimators.py)

In [None]:
#pip install eli5

In [None]:
#conda install -c conda-forge eli5

In [None]:
#pip install scikit-learn