In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import pandas_profiling
from sklearn.model_selection import train_test_split, KFold
import xgboost as xb
import lightgbm as lbm
from catboost import Pool, CatBoostClassifier, CatBoost
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from collections import defaultdict
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [5]:
# type_a = train[train['Hospital_code']==4]
# type_a.nunique()

In [6]:
train.nunique()

case_id                              318438
Hospital_code                            32
Hospital_type_code                        7
City_Code_Hospital                       11
Hospital_region_code                      3
Available Extra Rooms in Hospital        18
Department                                5
Ward_Type                                 6
Ward_Facility_Code                        6
Bed Grade                                 4
patientid                             92017
City_Code_Patient                        37
Type of Admission                         3
Severity of Illness                       3
Visitors with Patient                    28
Age                                      10
Admission_Deposit                      7300
Stay                                     11
dtype: int64

In [7]:
train_x = train.drop('Stay', axis=1)
train_y = train['Stay']
test_x = test
train_x.shape, train_y.shape, test_x.shape

((318438, 17), (318438,), (137057, 17))

In [8]:
le = LabelEncoder()
train_y = le.fit_transform(train_y)

In [9]:
df = train_x.append(test_x)

In [10]:
df["Bed Grade"] = imputer.fit_transform(df[["Bed Grade"]]).ravel()
df["City_Code_Patient"] = imputer.fit_transform(df[["City_Code_Patient"]]).ravel()

In [11]:
df['grouped'] = df['Hospital_code'].astype(str) + df['Hospital_type_code'] + df['City_Code_Hospital'].astype(str)\
                     + df['Hospital_region_code'] + df['Ward_Facility_Code']

In [12]:
df.drop(['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 
        'Ward_Facility_Code', 'case_id', 'patientid'], axis=1, inplace=True)

In [13]:
categorical_features_names = ['Available Extra Rooms in Hospital', 'Department', 'Ward_Type',
       'Bed Grade', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age', 'grouped']
# df[categorical_features_names] = df[categorical_features_names].astype(str)

In [14]:
le2 = LabelEncoder()
for col in categorical_features_names:
    df[col] = le2.fit_transform(df[col])

In [15]:
transformer = RobustScaler(quantile_range=(25, 75))
df[['Available Extra Rooms in Hospital', 'Admission_Deposit']] =  \
    transformer.fit_transform(df[['Available Extra Rooms in Hospital', 'Admission_Deposit']])

In [16]:
df.tail()

Unnamed: 0,Available Extra Rooms in Hospital,Department,Ward_Type,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,grouped
137052,0.5,1,1,2,2,0,1,4,4,1.289926,1
137053,-0.5,3,2,3,6,0,2,2,0,-1.005733,16
137054,-0.5,1,2,3,11,2,1,2,0,2.00819,22
137055,-0.5,1,2,3,9,1,1,2,4,0.570844,27
137056,0.0,2,1,3,2,1,0,5,5,-0.029484,28


In [17]:
train_df = df.iloc[:318438, :]
test_df = df.iloc[318438:, :]

In [18]:
train_df['Stay'] = train_y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
cat_features = [1,2,3,4,5,6,7,8,10]

In [20]:
# from sklearn.utils import class_weight
# class_weights = class_weight.compute_class_weight('balanced',
#                                                  np.unique(train_df['Stay']),
#                                                  train_df['Stay'])
# class_weights

In [21]:
model = CatBoostClassifier(loss_function="MultiClass",
                           eval_metric="Accuracy",
                           task_type="GPU",
                           learning_rate=0.01,
                           iterations=20000,
                           l2_leaf_reg=50,
                           random_seed=432013,
                           od_type="Iter",
                           depth=8,
                           early_stopping_rounds=15000,
                           border_count=100, 
                           one_hot_max_size=50 
#                            class_weights = class_weights
                           #has_time= True 
                          )

In [22]:
n_split = 10
kf = KFold(n_splits=n_split, random_state=432013, shuffle=True)

In [23]:
train_df.head()

Unnamed: 0,Available Extra Rooms in Hospital,Department,Ward_Type,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,grouped,Stay
0,0.0,3,2,1,6,0,0,2,5,0.141687,30,0
1,-0.5,3,3,1,6,1,0,2,5,0.995905,21,4
2,-0.5,1,3,1,6,1,0,2,5,0.005733,0,3
3,-0.5,3,2,1,6,1,0,2,5,2.075348,17,4
4,-0.5,3,3,1,6,1,0,2,5,0.671581,17,4


In [None]:
for idx, (train_index, valid_index) in enumerate(kf.split(train_df)):
    y_train, y_valid = train_df.Stay.iloc[train_index], train_df.Stay.iloc[valid_index]
    X_train, X_valid = train_df.drop('Stay', 1).iloc[train_index,:], train_df.drop('Stay', 1).iloc[valid_index,:]
    _train = Pool(X_train, label=y_train, cat_features=cat_features)
    _valid = Pool(X_valid, label=y_valid, cat_features=cat_features)
    print( "\nFold ", idx)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=1000,
                         )

In [None]:
model.get_best_score()

In [None]:
test_dataset = Pool(test_df, cat_features=cat_features)
y_pred = model.predict(test_dataset)

In [None]:
classes = le.inverse_transform(y_pred)

In [None]:
np.unique(classes)

In [None]:
output = pd.DataFrame(test['case_id'].values,columns=['case_id'])
output['Stay'] = classes

In [None]:
output.head()

In [None]:
output.to_csv('Catboost_cv.csv',index=False)

In [None]:
y_probabilites = model.predict_proba(test_dataset)

In [None]:
class_prob = pd.DataFrame(y_probabilites)

In [None]:
class_prob

In [None]:
class_prob.to_csv('Class_prob_catboost1.csv', index=False)