## [Leave One Feature Out Importance](https://github.com/aerdem4/lofo-importance)

Calculating feature importances with traditional methods is difficult for such high-dimensional data. Thanks to LOFO, we can group features and get a single importance value for the whole group. In this notebook, each loading feature is treated as a separate feature, while the fnc features are treated as one group. We then calculate the feature importances for each target using a linear-kernel NuSVR model within cross-validation.


In [None]:
!pip install lofo-importance

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import *
from sklearn.model_selection import KFold
from sklearn.linear_model import *
from sklearn.svm import SVR, LinearSVR, NuSVR
from lofo import LOFOImportance, Dataset, plot_importance

# Read the two feature tables; they are joined on their shared "Id" column.
fnc_df = pd.read_csv("../input/trends-assessment-prediction/fnc.csv")
loading_df = pd.read_csv("../input/trends-assessment-prediction/loading.csv")

# Feature names are every column except the first one of each table
# (presumably the "Id" key -- verify against the CSV headers).
fnc_features = list(fnc_df.columns[1:])
loading_features = list(loading_df.columns[1:])
df = pd.merge(fnc_df, loading_df, on="Id")

# Tag every row of the score table so it can be told apart after the merge.
labels_df = pd.read_csv(
    "../input/trends-assessment-prediction/train_scores.csv"
).assign(is_train=True)

# Left merge: ids missing from the score table keep a NaN "is_train".
df = pd.merge(df, labels_df, how="left", on="Id")

# Split on the flag; the test split must be taken before `df` is narrowed.
test_df = df[df["is_train"].ne(True)].copy()
df = df[df["is_train"].eq(True)].copy()

(df.shape, test_df.shape)

In [None]:
def pca_trans(train, test, n_components=350):
    """Standardize ``train``/``test`` and project both onto train's principal components.

    The scaler and the PCA are fitted on ``train`` only and then applied to
    ``test``, so both outputs live in the same feature space. (Re-fitting on
    ``test`` would produce components unrelated to the ones learned on
    ``train``.)

    A cumulative explained-variance curve of a full PCA on the standardized
    training data is plotted first, as a visual aid for choosing
    ``n_components``.

    Args:
        train: 2-D numeric data used to fit the scaler and the PCA.
        test: 2-D numeric data with the same columns as ``train``.
        n_components: Number of principal components to keep (default 350,
            the value previously hard-coded).

    Returns:
        Tuple ``(train, test)`` of DataFrames holding the projected data, with
        default integer row and column labels.
    """
    # Imported locally because the notebook's import cell provides neither name.
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train)
    # Reuse train's mean/variance; never re-fit on test.
    test_scaled = scaler.transform(test)

    # Diagnostic curve, computed on the same standardized data that the
    # final projection uses.
    full_pca = PCA().fit(train_scaled)
    plt.figure(figsize=(10, 7))
    plt.plot(np.cumsum(full_pca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()

    pca = PCA(n_components=n_components)
    train_proj = pd.DataFrame(pca.fit_transform(train_scaled))
    # Project test onto the components learned from train.
    test_proj = pd.DataFrame(pca.transform(test_scaled))

    print(train_proj.shape, test_proj.shape)
    return train_proj, test_proj

def scale_data(train, test, convertor=None):
    """Scale ``train`` and ``test`` with a single scaler fitted on ``train`` only.

    Args:
        train: 2-D numeric data used to fit the scaler.
        test: 2-D numeric data with the same columns as ``train``; it is
            transformed with the statistics learned from ``train``.
        convertor: Scaler object exposing ``fit_transform`` and ``transform``.
            When ``None`` (the default) a fresh ``StandardScaler`` is created
            for this call.

    Returns:
        Tuple ``(train, test)`` of DataFrames holding the scaled data, with
        default integer row and column labels.
    """
    # A `StandardScaler()` default in the signature would be built once at
    # definition time and shared by every call, so create it lazily instead.
    scaler = StandardScaler() if convertor is None else convertor

    train_scaled = scaler.fit_transform(train)
    # Re-fitting on test would scale it with its own statistics and make the
    # two outputs incomparable; apply the train-fitted scaler instead.
    test_scaled = scaler.transform(test)

    print(train_scaled.shape, test_scaled.shape)
    print(scaler)
    return pd.DataFrame(train_scaled), pd.DataFrame(test_scaled)

In [None]:
#df.isnull().sum()

In [None]:

#df , test_df = scale_data(df, test_df)

In [None]:

#from sklearn.isotonic import IsotonicRegression

def get_lofo_importance(target):
    """Compute LOFO importances for one target column of the global ``df``.

    Only rows whose ``target`` value is not null are used. Each entry of
    ``loading_features`` is evaluated as an individual feature, while all
    ``fnc_features`` columns are passed as a single group named ``"fnc"``
    (their values divided by 500). Scoring is negative mean absolute error
    over a shuffled 5-fold split with a linear-kernel ``NuSVR``.

    Args:
        target: Name of the target column in ``df``.

    Returns:
        The result of ``LOFOImportance.get_importance()``, which the notebook
        passes directly to ``plot_importance``.
    """
    labelled = df[df[target].notnull()]
    fnc_block = labelled[fnc_features].values / 500

    dataset = Dataset(
        df=labelled,
        target=target,
        features=loading_features,
        feature_groups={"fnc": fnc_block},
    )

    folds = KFold(n_splits=5, shuffle=True, random_state=0)
    regressor = NuSVR(nu=0.5, max_iter=-1, kernel='linear', C=0.1, verbose=True)

    importance = LOFOImportance(
        dataset,
        cv=folds,
        scoring="neg_mean_absolute_error",
        model=regressor,
    )
    return importance.get_importance()

In [None]:
#plot_importance(get_lofo_importance(target="age"), figsize=(16, 8))

In [None]:
#plot_importance(get_lofo_importance(target="domain1_var1"), figsize=(16, 8))

In [None]:
#plot_importance(get_lofo_importance(target="domain1_var2"), figsize=(16, 8))

In [None]:
# Compute the LOFO importances for the "domain2_var1" target and plot them.
plot_importance(get_lofo_importance(target="domain2_var1"), figsize=(16, 8))

In [None]:
# Compute the LOFO importances for the "domain2_var2" target and plot them.
plot_importance(get_lofo_importance(target="domain2_var2"), figsize=(16, 8))