In [2]:
# Numerical / tabular stack.
import numpy as np 
import pandas as pd 
# Static plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns
# Cross-validation splitter used by the training cell.
from sklearn.model_selection import KFold
# Standard-library utilities.
import warnings
import gc
import time
import sys
import datetime
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
# Gradient-boosting library used to fit the model.
import lightgbm as lgb
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics
# Plotly library
# NOTE(review): `plotly.plotly` is the hosted/online plotting module; newer plotly
# releases ship it separately as `chart_studio.plotly` — confirm the installed version.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the HCC table from the working directory.
df = pd.read_csv('HCC.csv')

# String-typed copy of the RecS1 column, consumed by the histogram cell.
RecS1 = df['RecS1'].astype(str)

# DataFrame.shape is (n_rows, n_columns): each row is one sample and each
# column one variable.  The original assignment had the two swapped, which
# made every later "80% of num_sample" computation use the column count.
num_sample, num_var = df.shape

# Last expression of the cell: display the column index.
df.columns

Index(['ID', 'Age', 'Gender', 'ASJRPD', 'Etiology', 'HCV', 'HBSAG', 'HBSAB',
       'HBEAG', 'HBEAB', 'HBCAG', 'AFP', 'HBVDNA', 'WBC', 'PLT', 'ALB', 'TBIL',
       'GGT', 'ALP', 'ALBI2F', 'RM', 'RT', 'OBL', 'TD', 'TN', 'MVI', 'EG',
       'TC2F', 'SN', 'LC', 'AVT', 'TACE', 'RecS6F', 'RecS1'],
      dtype='object')

In [6]:
# Histogram of the RecS1 labels: bins run from -0.5 to 3.5 in steps of 0.5.
recs1_bins = dict(start=-0.5, end=3.5, size=0.5)
hist_rec = [go.Histogram(x=RecS1, xbins=recs1_bins, opacity=0.8)]

# Axis configuration: explicit tick positions on x, a titled y axis.
recs1_xaxis = go.layout.XAxis(tickmode='array',
                              tick0=-0.5,
                              tickvals=[0, 1, 2, 3])
recs1_yaxis = dict(title='number')
layout = go.Layout(title='RecS1', xaxis=recs1_xaxis, yaxis=recs1_yaxis)

# Assemble the figure and render it through plotly's notebook helper.
fig = go.Figure(data=hist_rec, layout=layout)
py.iplot(fig, filename='hist_recs1')


Consider using IPython.display.IFrame instead



In [11]:
from sklearn.model_selection import train_test_split

# Feature matrix = every column except the last; label vector = the last
# column (RecS1 according to the displayed column index).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Random 80/20 hold-out split.  A fixed seed makes the split identical on
# every "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15)

# Ordered 80/20 split by ROW, used by the cross-validation cell.
# The cut point is derived from len(df) (the row count) rather than from
# `num_sample`, which held the column count in the original notebook, and the
# test slice starts exactly where the train slice ends so no row is dropped.
n_train_rows = int(0.8 * len(df))
train = df.iloc[:n_train_rows, :]
test = df.iloc[n_train_rows:, :]
target = train['RecS1']

In [12]:
# ---------------------------------------------------------------------------
# K-fold cross-validated LightGBM training on the RecS1 target.
#
# Produces:
#   oof                   - out-of-fold predictions for every row of `train`
#   predictions           - fold-averaged predictions for every row of `test`
#   feature_importance_df - gain importance per feature and fold
#   score                 - per-fold validation AUC
# ---------------------------------------------------------------------------

# Decide the learning task from the training labels.  A 'multiclass'
# objective needs `num_class` and a multiclass metric, and yields one
# probability column per class; a two-class target uses the binary objective
# with the 'auc' metric and 1-D predictions.
# NOTE(review): the multiclass branch assumes labels are integer-coded
# 0..n_classes-1, as LightGBM requires — confirm against the data.
n_classes = int(target.nunique())
is_multiclass = n_classes > 2

param = {'num_leaves': 60,
         'min_data_in_leaf': 10,
         'objective': 'multiclass' if is_multiclass else 'binary',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8,
         "bagging_seed": 11,
         "metric": 'multi_logloss' if is_multiclass else 'auc',
         "lambda_l1": 0.1,
         "random_state": 133,
         "verbosity": -1}
if is_multiclass:
    param['num_class'] = n_classes

all_features = df.columns.drop(['ID'])
categorical_columns = ['Gender', 'ASJRPD', 'Etiology', 'HCV', 'HBSAG', 'HBSAB',
       'HBEAG', 'HBEAB', 'HBCAG', 'ALBI2F', 'RM', 'RT', 'OBL', 'TN', 'MVI', 'EG',
       'TC2F', 'SN', 'LC', 'AVT', 'TACE', 'RecS6F']

# Model inputs: everything except the target and the row identifier.  The
# original list kept 'ID', which lets the model split on an arbitrary
# identifier.
features = [c for c in train.columns if c not in ('ID', 'RecS1')]

folds = KFold(n_splits=5, shuffle=True, random_state=15)
# Number of folds to actually run.  This name was referenced but never
# defined in the original cell (NameError); lower it to stop after fewer folds.
max_iter = folds.n_splits

# Prediction buffers: 1-D for binary, (rows, n_classes) for multiclass.
pred_tail = (n_classes,) if is_multiclass else ()
oof = np.zeros((len(train),) + pred_tail)
predictions = np.zeros((len(test),) + pred_tail)

start = start_time = time.time()
score = [0 for _ in range(folds.n_splits)]
fold_importances = []  # one frame per fold, concatenated once after the loop


def cv_auc(y_true, y_score):
    """Return the ROC AUC of `y_score` against `y_true`.

    Binary targets use the plain AUC on the 1-D scores; multiclass targets
    use the one-vs-rest macro AUC on the per-class probability matrix
    (needs a scikit-learn version whose roc_auc_score accepts `multi_class`).
    """
    if is_multiclass:
        return metrics.roc_auc_score(y_true, y_score, multi_class='ovr',
                                     labels=list(range(n_classes)))
    return metrics.roc_auc_score(y_true, y_score)


for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_columns)
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_columns)

    num_round = 10000
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect the
    # equivalent callbacks instead — confirm against the installed version.
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)

    # Out-of-fold predictions for the rows held out in this fold.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features],
                               num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(importance_type='gain'),
        "fold": fold_ + 1,
    }))

    # Each fold contributes an equal share of the averaged test prediction.
    fold_pred = clf.predict(test[features], num_iteration=clf.best_iteration)
    predictions += fold_pred / min(folds.n_splits, max_iter)

    # Elapsed wall-clock time in seconds (the original divided by 3600 yet
    # labelled the value "s").
    print("time elapsed: {:<5.2f}s".format(time.time() - start_time))
    score[fold_] = cv_auc(target.iloc[val_idx], oof[val_idx])
    if fold_ == max_iter - 1:
        break

feature_importance_df = pd.concat(fold_importances, axis=0)

if folds.n_splits == max_iter:
    # Every row has an out-of-fold prediction: score them all at once.
    print("CV score: {:<8.5f}".format(cv_auc(target, oof)))
else:
    # Only some folds ran: report the mean of the per-fold scores.
    print("CV score: {:<8.5f}".format(sum(score) / max_iter))

fold n°0


AttributeError: module 'lightgbm' has no attribute 'Dataset'