# Fast and Accurate Feature Importance using RAPIDS + LOFO

In [None]:
!pip install lofo-importance

In [None]:
import cudf
import cuml


# Read the training features with cuDF.
df = cudf.read_csv("/kaggle/input/lish-moa/train_features.csv")

# Label-encode the two treatment descriptor columns in place.
features = ["cp_time", "cp_dose"]
for f in features:
    encoder = cuml.LabelEncoder()
    df[f] = encoder.fit_transform(df[f])

# Keep only the rows whose cp_type equals "trt_cp".
is_trt_cp = df["cp_type"] == "trt_cp"
df = df[is_trt_cp]

# Read the scored targets; every column except the "sig_id" key is a target.
target_df = cudf.read_csv("/kaggle/input/lish-moa/train_targets_scored.csv")
targets = [name for name in target_df.columns if name != "sig_id"]

# Attach the target columns to the feature rows via the shared "sig_id" key.
df = df.merge(target_df, on="sig_id")
df.shape

In [None]:
# cuDF support for LOFO is still being worked on, so as a temporary solution
# the cuDF frame is converted to a pandas DataFrame before it is used below.
df = df.to_pandas()

In [None]:
from sklearn.metrics import make_scorer
import numpy as np

def score_func(y_true, y_pred):
    """Return the mean log-likelihood of the labels under the predictions.

    The value is the negative of the binary cross-entropy averaged over
    every (sample, target) entry, so it is always <= 0 and larger is better.

    Args:
        y_true: Indexable whose element ``0`` holds the 0/1 label array.
            NOTE(review): only ``y_true[0]`` is used -- confirm this matches
            what the caller actually passes in.
        y_pred: Iterable with one 2-D array per target; column 0 of each
            array is read. Presumably these are per-target
            ``predict_proba`` outputs with column 0 being the probability
            of class 0 -- TODO confirm.

    Returns:
        The mean of ``y*log(p) + (1 - y)*log(1 - p)`` where ``p`` is one
        minus column 0 of each prediction array, clipped to
        ``[0.0005, 0.9995]``.
    """
    labels = y_true[0]

    # One row per target after stacking; transpose so targets become columns.
    per_target = [(1 - target_proba[:, 0]) for target_proba in y_pred]
    proba = np.stack(per_target).T

    # Clip away exact 0/1 so both logarithms stay finite.
    proba = np.clip(proba, 0.0005, 0.9995)

    log_likelihood = (labels * np.log(proba)) + (1 - labels) * np.log(1 - proba)
    return log_likelihood.mean()


# Wrap score_func as a scorer object: it is flagged as "greater is better"
# and is configured to receive probability outputs rather than thresholds.
scorer = make_scorer(
    score_func,
    greater_is_better=True,
    needs_proba=True,
    needs_threshold=False,
)

In [None]:
from lofo import LOFOImportance, Dataset, plot_importance
import cuml



# Candidate features: every column whose name starts with "g-" or "c-".
gene_features = [name for name in df.columns if name.startswith("g-")]
cell_features = [name for name in df.columns if name.startswith("c-")]
features = gene_features + cell_features

# Bundle the frame, the target columns and the candidate features for LOFO.
dataset = Dataset(df=df, target=targets, features=features)

# cuML k-nearest-neighbours classifier used as the model inside LOFO.
knn_model = cuml.neighbors.KNeighborsClassifier(n_neighbors=1000)

lofo_imp = LOFOImportance(
    dataset,
    cv=4,
    scoring=scorer,
    model=knn_model,
)

importance_df = lofo_imp.get_importance()

In [None]:
# Write the importance table to "feature_importance.csv" without the index.
importance_df.to_csv("feature_importance.csv", index=False)

# Final expression of the cell: previews the first rows of the table.
importance_df.head()

In [None]:
# Box plot for the first 32 rows of the importance table.
# NOTE(review): presumably these are the highest-ranked features, assuming
# get_importance() returns rows sorted by importance -- verify.
plot_importance(
    importance_df.head(32),
    figsize=(8, 12),
    kind="box",
)

In [None]:
# Default-style plot for the last 32 rows of the importance table.
# NOTE(review): presumably these are the lowest-ranked features, assuming
# get_importance() returns rows sorted by importance -- verify.
plot_importance(
    importance_df.tail(32),
    figsize=(8, 12),
    kind="default",
)