<h1> Loading libraries </h1>

In [1]:
#basic libraries
import numpy as np
import pandas as pd 

#for viewing directories
import os

#some models to try out
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
import tensorflow_decision_forests as tfdf

# Reading files

In [2]:
#viewing directories
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s3e22/sample_submission.csv
/kaggle/input/playground-series-s3e22/train.csv
/kaggle/input/playground-series-s3e22/test.csv


In [3]:
#reading csv files to Pandas dataframes
train = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s3e22/sample_submission.csv')

# Preparing data

In [4]:
#the 'id' column is unnecessary for predictions
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [5]:
#first, the two data files are combined
combined = pd.concat([train, test])

In [6]:
combined.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
outcome                   object
dtype: object

In [7]:
combined.select_dtypes('object').nunique()

surgery                  2
age                      2
temp_of_extremities      4
peripheral_pulse         4
mucous_membrane          6
capillary_refill_time    3
pain                     7
peristalsis              5
abdominal_distention     4
nasogastric_tube         3
nasogastric_reflux       4
rectal_exam_feces        5
abdomen                  5
abdomo_appearance        3
surgical_lesion          2
cp_data                  2
outcome                  3
dtype: int64

In [8]:
for col in combined.select_dtypes('object'):
    print(col, train[col].unique())

surgery ['yes' 'no']
age ['adult' 'young']
temp_of_extremities ['cool' 'cold' 'normal' 'warm' nan]
peripheral_pulse ['reduced' 'normal' nan 'absent' 'increased']
mucous_membrane ['dark_cyanotic' 'pale_cyanotic' 'pale_pink' 'normal_pink' 'bright_pink'
 'bright_red' nan]
capillary_refill_time ['more_3_sec' 'less_3_sec' nan '3']
pain ['depressed' 'mild_pain' 'extreme_pain' 'alert' 'severe_pain' nan 'slight']
peristalsis ['absent' 'hypomotile' 'normal' 'hypermotile' nan 'distend_small']
abdominal_distention ['slight' 'moderate' 'none' 'severe' nan]
nasogastric_tube ['slight' 'none' 'significant' nan]
nasogastric_reflux ['less_1_liter' 'more_1_liter' 'none' nan 'slight']
rectal_exam_feces ['decreased' 'absent' nan 'normal' 'increased' 'serosanguious']
abdomen ['distend_small' 'distend_large' 'normal' 'firm' nan 'other']
abdomo_appearance ['serosanguious' 'cloudy' 'clear' nan]
surgical_lesion ['yes' 'no']
cp_data ['no' 'yes']
outcome ['died' 'euthanized' 'lived']


In [9]:
#checking if certain entries are possibly erroneous
print(len(combined[combined['peristalsis'] == 'distend_small']))
print(len(combined[combined['nasogastric_reflux'] == 'slight']))
print(len(combined[combined['capillary_refill_time'] == '3']))

1
1
6


In [10]:
#label encoding for certain features
combined['temp_of_extremities'] = combined['temp_of_extremities'].map({'cold': 0, 'cool': 1, 'normal': 2, 'warm': 3})
combined['peripheral_pulse'] = combined['peripheral_pulse'].map({'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3})
combined['capillary_refill_time'] = combined['capillary_refill_time'].map({'less_3_sec': 0, '3': 1, 'more_3_sec': 2})
combined['pain'] = combined['pain'].map({'alert': 0, 'depressed': 1, 'slight': 2, 'mild_pain': 3, 'severe_pain': 4, 'extreme_pain': 5})
combined['peristalsis'] = combined['peristalsis'].map({'absent': 0, 'hypomotile': 1, 'normal': 2, 'hypermotile': 3})
combined['abdominal_distention'] = combined['abdominal_distention'].map({'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3})
combined['nasogastric_tube'] = combined['nasogastric_tube'].map({'none': 0, 'slight': 1, 'significant': 2})
combined['nasogastric_reflux'] = combined['nasogastric_reflux'].map({'none': 0, 'less_1_liter': 1, 'more_1_liter': 2})

In [11]:
combined.nunique()

surgery                    2
age                        2
hospital_number          271
rectal_temp               43
pulse                     51
respiratory_rate          42
temp_of_extremities        4
peripheral_pulse           4
mucous_membrane            6
capillary_refill_time      3
pain                       6
peristalsis                4
abdominal_distention       4
nasogastric_tube           3
nasogastric_reflux         3
nasogastric_reflux_ph     30
rectal_exam_feces          5
abdomen                    5
packed_cell_volume        51
total_protein             85
abdomo_appearance          3
abdomo_protein            60
surgical_lesion            2
lesion_1                  61
lesion_2                   5
lesion_3                   2
cp_data                    2
outcome                    3
dtype: int64

In [12]:
combined['outcome']

0            died
1      euthanized
2           lived
3           lived
4           lived
          ...    
819           NaN
820           NaN
821           NaN
822           NaN
823           NaN
Name: outcome, Length: 2059, dtype: object

In [13]:
outcomes = combined['outcome']
combined = combined.drop('outcome', axis=1)

In [14]:
#dealing with NaN values
#the entries deemed erroneous are now NaN after the label encoding above
for col in range(combined.shape[1]):
    if combined.iloc[:,col].dtypes != object:
        combined.iloc[:,col].fillna(combined.iloc[:,col].mean(), inplace=True)
    else:
        combined.iloc[:,col].fillna(combined.iloc[:,col].mode()[0], inplace=True)

In [15]:
combined.isna().any()

surgery                  False
age                      False
hospital_number          False
rectal_temp              False
pulse                    False
respiratory_rate         False
temp_of_extremities      False
peripheral_pulse         False
mucous_membrane          False
capillary_refill_time    False
pain                     False
peristalsis              False
abdominal_distention     False
nasogastric_tube         False
nasogastric_reflux       False
nasogastric_reflux_ph    False
rectal_exam_feces        False
abdomen                  False
packed_cell_volume       False
total_protein            False
abdomo_appearance        False
abdomo_protein           False
surgical_lesion          False
lesion_1                 False
lesion_2                 False
lesion_3                 False
cp_data                  False
dtype: bool

In [16]:
outcomes = outcomes.map({'died': 0, 'euthanized': 1, 'lived': 2})
outcomes

0      0.0
1      1.0
2      2.0
3      2.0
4      2.0
      ... 
819    NaN
820    NaN
821    NaN
822    NaN
823    NaN
Name: outcome, Length: 2059, dtype: float64

In [17]:
combined = pd.get_dummies(combined, drop_first=True)

In [18]:
combined['outcome'] = outcomes

In [19]:
final_train = combined[combined['outcome'].notna()]
final_test = combined[combined['outcome'].isna()]

# Models and Submissions

## Preparation: Splitting of data

## Model 1: lightgbm.LGBMClassifier
A **gradient-boosting** framework based on decision trees.

In [20]:
X_train = final_train.drop(columns=['outcome'], axis=1)
y_train = final_train['outcome']
X_test = final_test.drop(columns=['outcome'], axis=1)

In [21]:
model1 = LGBMClassifier()
model1.fit(X_train, y_train)
importances = model1.feature_importances_

In [22]:
np.where(importances < importances.mean())[0][0]

4

In [23]:
less_significant = np.where(importances < importances.mean())
final_train1 = final_train
final_test1 = final_test
for i in less_significant[0]:
    final_train1.drop(list(final_train1)[i], axis=1)
    final_test1.drop(list(final_test1)[i], axis=1)
X_train = final_train1.drop(columns=['outcome'], axis=1)
y_train = final_train1['outcome']
X_test = final_test1.drop(columns=['outcome'], axis=1)

In [24]:
model1 = LGBMClassifier()
model1.fit(X_train, y_train)
prediction1 = model1.predict(X_test)

In [25]:
submission1 = submission
submission1['outcome'] = prediction1
submission1['outcome'] = submission1['outcome'].map({0:'died',1:'euthanized',2:'lived'})
submission1.to_csv('submission_lgbm.csv',index=False)
submission1

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


## Model 2: sklearn.ensemble.RandomForestClassifier
scikit learn's random forest classifier.

In [26]:
X_train = final_train.drop(columns=['outcome'], axis=1)
y_train = final_train['outcome']
X_test = final_test.drop(columns=['outcome'], axis=1)

In [27]:
model2 = RandomForestClassifier(n_estimators=100)
model2.fit(X_train, y_train)
prediction2 = model2.predict(X_test)

In [28]:
submission2 = submission
submission2['outcome'] = prediction2
submission2['outcome'] = submission2['outcome'].map({0:'died',1:'euthanized',2:'lived'})
submission2.to_csv('submission_sklearn.csv',index=False)
submission2

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


## Model 3: tfdf.keras.RandomForestModel
Tensorflow's random forest model.

In [29]:
def bool_to_int(df, view=False):
    for col in range(df.shape[1]):
        if df.iloc[:,col].dtypes == bool:
            df.iloc[:,col] = df.iloc[:,col].map({False: 0, True: 1})
    if view:
        return df
bool_to_int(final_train)
bool_to_int(final_test)

In [30]:
X_train = final_train.drop(columns=['outcome'], axis=1)
y_train = final_train['outcome']
X_test = final_test.drop(columns=['outcome'], axis=1)

In [31]:
tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(final_train, label='outcome')
model3 = tfdf.keras.RandomForestModel()
model3.fit(tf_dataset)

Use /tmp/tmpmqpjcvpu as temporary training directory
Reading training dataset...
Training dataset read in 0:00:05.075987. Found 1235 examples.
Training model...


[INFO 23-10-01 23:06:37.5665 UTC kernel.cc:1243] Loading model from path /tmp/tmpmqpjcvpu/model/ with prefix dedbae8f386d4816


Model trained in 0:00:02.024928
Compiling model...


[INFO 23-10-01 23:06:37.9823 UTC decision_forest.cc:660] Model loaded with 300 root(s), 83872 node(s), and 37 input feature(s).
[INFO 23-10-01 23:06:37.9824 UTC abstract_model.cc:1311] Engine "RandomForestGeneric" built
[INFO 23-10-01 23:06:37.9824 UTC kernel.cc:1075] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.


<keras.callbacks.History at 0x7918e8a01780>

In [32]:
tf_test = tfdf.keras.pd_dataframe_to_tf_dataset(X_test)
prediction3 = model3.predict(tf_test)



In [33]:
temp = []
for i in range(len(prediction3)):
    temp.append([np.argmax(prediction3[i])])
tempdf = pd.DataFrame(temp, columns = ['outcome'])

In [34]:
submission3 = submission
submission3['outcome'] = tempdf
submission3['outcome'] = submission3['outcome'].map({0:'died',1:'euthanized',2:'lived'})
submission3.to_csv('submission_tf.csv',index=False)
submission3

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived
