In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
train_data = pd.read_csv('data/train_cleaned.csv', index_col=0)
test_data = pd.read_csv('data/test_cleaned.csv', index_col=0)

numerical_columns = ['rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1']
categorical_columns = ['surgery', 'age','surgical_lesion','cp_data', 'temp_of_extremities','peripheral_pulse','mucous_membrane','capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance']

In [3]:
X = train_data.drop(columns=['outcome'])
y = train_data['outcome']

In [4]:
scaler = StandardScaler().fit(X[numerical_columns])

X[numerical_columns] = scaler.transform(X[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

In [5]:
model = RandomForestClassifier(random_state=42, bootstrap=True, max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=6, n_estimators=300)

model.fit(X, y)

In [6]:
prediction = model.predict(test_data)

prediction

array(['lived', 'died', 'lived', 'euthanized', 'lived', 'died', 'died',
       'died', 'lived', 'lived', 'died', 'lived', 'lived', 'lived',
       'died', 'lived', 'lived', 'died', 'died', 'died', 'died', 'died',
       'died', 'lived', 'died', 'lived', 'died', 'euthanized', 'lived',
       'died', 'lived', 'died', 'died', 'lived', 'lived', 'died', 'lived',
       'lived', 'died', 'lived', 'died', 'euthanized', 'died', 'lived',
       'died', 'lived', 'died', 'lived', 'lived', 'died', 'died', 'lived',
       'died', 'died', 'lived', 'lived', 'died', 'died', 'died', 'died',
       'died', 'died', 'died', 'died', 'died', 'lived', 'euthanized',
       'died', 'lived', 'lived', 'died', 'died', 'lived', 'lived',
       'euthanized', 'died', 'euthanized', 'lived', 'died', 'died',
       'died', 'euthanized', 'lived', 'euthanized', 'lived', 'lived',
       'lived', 'euthanized', 'euthanized', 'lived', 'euthanized',
       'lived', 'died', 'euthanized', 'lived', 'euthanized', 'lived',
       '

In [7]:
submission = pd.read_csv('data/sample_submission.csv')

submission.info

<bound method DataFrame.info of        id outcome
0    1235   lived
1    1236   lived
2    1237   lived
3    1238   lived
4    1239   lived
..    ...     ...
819  2054   lived
820  2055   lived
821  2056   lived
822  2057   lived
823  2058   lived

[824 rows x 2 columns]>

In [9]:
submission['outcome'] = prediction
submission.to_csv('data/submission.csv', index=False)
submission = pd.read_csv('data/submission.csv')
submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived
