In [4]:
import os
import sys

from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from typing import Dict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import model_functions as mf 
# import lightgbm

from tqdm.auto import tqdm

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import warnings
import gc
import os
import re
import sys
from collections import Counter
import subprocess
from sklearn.utils import shuffle
# from pandas import shuffle

# Widen pandas' display limits so large frames are shown without truncation.
for display_option in ("display.max_rows", "display.width", "display.max_columns"):
    pd.set_option(display_option, 500)

# Login name taken from the environment; None when USER is not set.
current_user = os.environ.get('USER')

# Silence all warnings, fix numpy's global seed, and register tqdm's
# pandas integration (tqdm.pandas()).
warnings.filterwarnings("ignore")
np.random.seed(42)
tqdm.pandas()

In [5]:
# import yaml

# with open('/configs/config.yaml', 'r') as f:
#     config = yaml.safe_load(f)

## Load data

In [6]:
# NOTE(review): the chdir is relative to the current working directory, so
# re-running this cell in the same kernel climbs one more directory level.
os.chdir('..')

# Main symptom table: read it, then shuffle the rows reproducibly.
df = shuffle(pd.read_csv('dataset.csv'), random_state=42)

# Auxiliary tables, read in the same order as before
# (description file, precaution file, severity file).
sym_des, sym_pre, df1 = (
    pd.read_csv(csv_name)
    for csv_name in ('symptom_Description.csv', 'symptom_precaution.csv', 'Symptom-severity.csv')
)

## Processing data

### Main table

In [7]:
# Replace underscores with spaces in every column of the main table.
# Relies on each column supporting the `.str` accessor.
df = df.apply(lambda column: column.str.replace('_',' '))

In [8]:
# Strip surrounding whitespace from every cell by flattening the table into
# one long Series, stripping it, and folding it back into the original shape.
cols = df.columns
data = df[cols].to_numpy().flatten()

s = pd.Series(data).str.strip().to_numpy().reshape(df.shape)

# Rebuilding from a bare array also replaces the (shuffled) row index with a
# fresh 0..n-1 index.
df = pd.DataFrame(data=s, columns=df.columns)

In [9]:
# Missing cells become 0.
df = df.fillna(value=0)

In [10]:
# Same underscore -> space normalisation for the severity table's symptom
# names, so they can be compared with the cells of the main table.
df1 = df1.assign(Symptom=df1['Symptom'].str.replace('_',' '))

In [11]:
# Encode every symptom cell of the main table as its severity weight.
#
# The previous version edited `df.values` in place (which can silently modify
# `df` itself) and rescanned the whole table once per symptom. Here a single
# lookup table is built and applied to a copy of the data.
symptoms = df1['Symptom'].unique()

# symptom name -> weight; when a symptom is listed more than once the first
# row wins, matching the earlier `.values[0]` lookup.
symptom_weights = (
    df1.drop_duplicates(subset='Symptom', keep='first')
       .set_index('Symptom')['weight']
       .to_dict()
)

# These three spellings are mapped to 0 by hand, exactly as before; a weight
# coming from the severity table (if one exists) still takes precedence.
# NOTE(review): presumably these names do not match the severity table because
# of the doubled/missing spaces -- confirm whether their real weights should
# be used instead of 0.
for unmatched_name in ('dischromic  patches', 'spotting  urination', 'foul smell of urine'):
    symptom_weights.setdefault(unmatched_name, 0)

# Cells without an entry (the 0 fill value, labels in the first column, ...)
# are passed through unchanged.
_encode_cell = np.vectorize(lambda cell: symptom_weights.get(cell, cell), otypes=[object])

df = pd.DataFrame(_encode_cell(df.to_numpy(dtype=object, copy=True)), columns=df.columns)

## Train LightGBM

### Model hyperparameters

In [19]:
# LightGBM hyperparameters, passed to LGBMClassifier as keyword arguments.
# Objective: "multiclassova" (one-vs-all) was chosen here over "multiclass".
lgb_params = {
        "objective": "multiclassova",
        # "metric": "cross_entropy",
        "max_bin": 5,
        "max_depth" : 2,
        "num_leaves": 5,
        "min_data_in_leaf" : 5,
        "learning_rate": 0.2,
        "bagging_fraction": 0.7,
        # LightGBM only performs bagging when bagging_freq is non-zero; without
        # it, bagging_fraction and bagging_seed above/below have no effect.
        "bagging_freq": 1,
        "feature_fraction": 0.5,
        "bagging_seed": 2018,
        "verbosity": -1
    }

### Train

In [13]:
# Features: every column except the first one, as a numpy array.
# Labels: the 'Disease' column.
# NOTE(review): this assumes 'Disease' is the first column of df -- confirm.
labels = df['Disease'].to_numpy()
data = df.iloc[:, 1:].to_numpy()

In [14]:
# 80/20 hold-out split with a fixed seed; the four shapes are printed as a
# quick sanity check.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    train_size = 0.8,
    random_state=42,
)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(3936, 17) (984, 17) (3936,) (984,)


In [20]:
import pandas as pd
import numpy as np
from typing import Dict
from dataclasses import dataclass

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import lightgbm
from lightgbm import LGBMClassifier



@dataclass
class TrainTestData:
    """Container for the four pieces of a train/validation split.

    Built by ``LGBModel._split_train_test`` from the output of
    ``train_test_split``. The annotations name pandas types, but nothing is
    enforced at runtime; this notebook passes numpy arrays.
    """
    # Feature rows used for fitting.
    X_train: pd.DataFrame
    # Feature rows held out for validation.
    X_valid: pd.DataFrame
    # Targets matching X_train.
    y_train: pd.Series
    # Targets matching X_valid.
    y_valid: pd.Series


class LGBModel:
    """LGBMClassifier wrapper that makes its own train/validation split.

    ``fit`` / ``fit_with_grid_search`` split the given data, train a model on
    the training part, and keep the validation targets and predictions so
    that ``get_prediction_table`` and the feature-importance helpers can be
    called afterwards.
    """

    def __init__(self, lgb_params: Dict, test_size: float=.33):
        """
        Args:
            lgb_params: keyword arguments forwarded to ``LGBMClassifier`` in ``fit``.
            test_size: share of the data held out for validation.
        """
        self.test_size = test_size
        self.lgb_params = lgb_params

        self.model = None
        # Filled in by fit() / fit_with_grid_search().
        self._y_valid = None
        self._pr_valid = None
        self._columns = None

    @staticmethod
    def _feature_names(X):
        """Return ``X.columns`` when X has them, else positional indices 0..n_features-1."""
        columns = getattr(X, "columns", None)
        if columns is not None:
            return columns
        # Plain arrays (as passed in this notebook) carry no column names.
        return pd.RangeIndex(np.shape(X)[1])

    def _split_train_test(self, X: pd.DataFrame, y: pd.Series) -> "TrainTestData":
        """Split X/y into train and validation parts (fixed random_state=42)."""
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=self.test_size, random_state=42
        )

        return TrainTestData(X_train, X_valid, y_train, y_valid)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Return predicted probabilities for X.

        Returns:
            For a two-column ``predict_proba`` result (binary target): the 1-D
            array of second-column probabilities, as before.
            Otherwise (multiclass): the full ``(n_samples, n_classes)`` matrix,
            which is what ``roc_auc_score(..., multi_class="ovr")`` requires.
            The earlier code always returned column 1, which silently dropped
            every other class.
        """
        proba = self.model.predict_proba(X)
        if proba.shape[1] == 2:
            return proba[:, 1]
        return proba

    def calc_auc(self, X: pd.DataFrame, y: pd.Series) -> float:
        """ROC AUC of the model's predictions on X against y (one-vs-rest, micro average)."""
        predictions = self.predict(X)
        return roc_auc_score(y, predictions, multi_class="ovr", average='micro')

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """Split X/y, train an ``LGBMClassifier`` and store validation results.

        ``n_estimators`` defaults to 10000 but can now be overridden through
        ``lgb_params`` (previously that raised a duplicate-keyword TypeError).
        No early-stopping callback is passed, so all rounds are trained.
        """
        data = self._split_train_test(X, y)

        params = {"n_estimators": 10000, **self.lgb_params}
        model = LGBMClassifier(**params)
        model.fit(
            data.X_train, data.y_train,
            eval_set=[(data.X_train, data.y_train), (data.X_valid, data.y_valid)],
        )

        self.model = model

        self._y_valid = data.y_valid
        self._pr_valid = self.predict(data.X_valid)
        # Needed by the feature-importance helpers; was never set here before.
        self._columns = self._feature_names(X)

    def fit_with_grid_search(self, X: pd.DataFrame, y: pd.Series, param_grid: dict, rc_params: dict):
        """Tune an ``LGBMClassifier`` with ``RandomizedSearchCV`` and keep the best one.

        Args:
            X, y: full data; split internally like in ``fit``.
            param_grid: passed as ``param_distributions``.
            rc_params: extra ``RandomizedSearchCV`` keyword arguments. They
                override the defaults below (scoring, n_iter, n_jobs, verbose)
                instead of clashing with them. For a multiclass target pass
                e.g. ``scoring="roc_auc_ovr"``.

        NOTE(review): the search starts from a default ``LGBMClassifier()``;
        ``self.lgb_params`` is not used here.
        """
        data = self._split_train_test(X, y)

        # verbose=0 is scikit-learn's quiet level; the former -1 is LightGBM's
        # convention and not a documented value for RandomizedSearchCV.
        search_kwargs = {"scoring": 'roc_auc', "n_iter": 20, "n_jobs": 20, "verbose": 0}
        search_kwargs.update(rc_params or {})

        # RandomizedSearchCV, GridSearchCV
        grid_search = RandomizedSearchCV(
            estimator=LGBMClassifier(),
            param_distributions=param_grid,
            **search_kwargs
        )

        grid_search.fit(
            data.X_train, data.y_train
        )

        self.model = grid_search.best_estimator_

        self._y_valid = data.y_valid
        self._pr_valid = self.predict(data.X_valid)
        self._columns = self._feature_names(X)

        print(f'Best AUC: {grid_search.best_score_}')
        print('Best params:')
        print(grid_search.best_params_)

        auc_train = self.calc_auc(data.X_train, data.y_train)
        auc_test = self.calc_auc(data.X_valid, data.y_valid)

        print(f"\n\nauc_train = {auc_train:.3f}\nauc_test = {auc_test:.3f}\n")

    def get_prediction_table(self, num_buck: int=10) -> pd.DataFrame:
        """Bucket the validation predictions by score.

        Rows are sorted by predicted score (highest first) and cut into
        ``num_buck`` equal-sized buckets; bucket 0 holds the highest scores.

        Returns:
            One row per bucket with columns ``max_pr`` (highest score in the
            bucket), ``av_target`` (mean target) and ``bucket_size``.

        Raises:
            RuntimeError: if the model has not been fitted yet.
            ValueError: if the stored predictions are not 1-D (multiclass).
        """
        if self._pr_valid is None:
            raise RuntimeError("call fit() or fit_with_grid_search() before get_prediction_table()")
        if np.ndim(self._pr_valid) != 1:
            raise ValueError("get_prediction_table() needs 1-D scores (binary target)")

        buck_df = pd.DataFrame({'y':self._y_valid, 'pr': self._pr_valid})
        buck_df = buck_df.sort_values('pr', ascending=False).reset_index(drop=True)
        # Row position -> bucket number in [0, num_buck).
        buck_df['buck'] = buck_df.index * num_buck // len(buck_df)
        buck_df = buck_df.groupby('buck').agg({'pr': ['max'], 'y': ['mean', 'size']})
        buck_df.columns = ['max_pr', 'av_target', 'bucket_size']

        return buck_df

    def get_feature_importance(self) -> pd.DataFrame:
        """Return feature importances, most important first.

        Returns:
            DataFrame with columns ``fi`` (the model's ``feature_importances_``)
            and ``col`` (column name, or positional index for array input).

        Raises:
            RuntimeError: if the model has not been fitted yet.
        """
        if self.model is None or self._columns is None:
            raise RuntimeError("call fit() or fit_with_grid_search() before requesting feature importances")

        fi = pd.DataFrame({'fi': self.model.feature_importances_, 'col': self._columns}) \
            .sort_values('fi', ascending=False)

        return fi

    # Original (misspelled) name kept so existing callers continue to work.
    get_feature_inportance = get_feature_importance

In [21]:
# Train on the full feature matrix; LGBModel holds out 20% for validation
# internally (test_size=0.2).
full_lgb_model = LGBModel(test_size=0.2, lgb_params=lgb_params)
full_lgb_model.fit(X=data, y=labels)





# rnd_forest = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators= 500, max_depth=13)
# rnd_forest.fit(x_train,y_train)
# preds=rnd_forest.predict(x_test)
# print(x_test[0])
# print(preds[0])
# conf_mat = confusion_matrix(y_test, preds)
# df_cm = pd.DataFrame(conf_mat, index=df['Disease'].unique(), columns=df['Disease'].unique())
# print('F1-score% =', f1_score(y_test, preds, average='macro')*100, '|', 'Accuracy% =', accuracy_score(y_test, preds)*100)