# Libraries and Data

In [9]:
# Data handling, plotting and statistics libraries used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Display every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns',None)

# statsmodels: general API plus the formula-based OLS interface.
import statsmodels.api as sm
from statsmodels.formula.api import ols

# NOTE(review): this silences *all* warnings for the rest of the session,
# including ones from scipy/pandas that signal invalid statistical input.
# Consider filtering specific warning categories instead.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the training set; the first CSV column is used as the row index.
train = pd.read_csv('train.csv', index_col=0)

# Per-hospital outcome counts, ordered by number of deaths (highest first).
ct = (
    pd.crosstab(train['hospital_number'], train.outcome)
    .sort_values('died', ascending=False)
)
ct['Total'] = ct[['died', 'euthanized', 'lived']].sum(axis=1)
ct['Survival_Rate'] = ct['lived'].div(ct['Total'])

# Attach each hospital's share of 'lived' outcomes to its rows. The lookup is
# done before the cast below, while the keys still match ct's index.
# NOTE(review): Survival_Rate is computed from `outcome` on these very rows,
# so it encodes the target (leakage). Any later test of Survival_Rate against
# `outcome` is circular - confirm this is intended before using it as a feature.
train['Survival_Rate'] = train['hospital_number'].map(ct['Survival_Rate'])

# From here on hospital_number is handled as a string (object) column.
train['hospital_number'] = train['hospital_number'].astype(str)
train.head(n=5)

Unnamed: 0_level_0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome,Survival_Rate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died,0.066667
1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized,0.666667
2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived,1.0
3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived,1.0
4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived,0.833333


In [3]:
# Limit displayed rows to 5 (this is a global option and stays in effect for
# later cells), then list the dtype of every column.
pd.options.display.max_rows = 5
train.dtypes


surgery           object
age               object
                  ...   
outcome           object
Survival_Rate    float64
Length: 29, dtype: object

# Univariate feature selection

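## Chi-Square Test

Test each categorical (object-typed) column for association with `outcome` at the 0.05 significance level.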

In [4]:
# Keep only the object-typed columns of `train` and preview the first rows.
str_data = train.select_dtypes(include=['object'])
str_data.head(n=5)

Unnamed: 0_level_0,surgery,age,hospital_number,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,abdomen,abdomo_appearance,surgical_lesion,cp_data,outcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,yes,adult,530001,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,decreased,distend_small,serosanguious,yes,no,died
1,yes,adult,533836,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,absent,distend_small,serosanguious,yes,no,euthanized
2,yes,adult,529812,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,,distend_large,serosanguious,yes,no,lived
3,yes,adult,5262541,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,decreased,distend_small,cloudy,yes,yes,lived
4,no,adult,5299629,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,normal,normal,cloudy,no,yes,lived


In [5]:
# Chi-square test of independence between every object-typed column and
# `outcome`. Columns with p < 0.05 are collected in `imp`, the rest in `nimp`.
#
# NOTE(review): hospital_number looks like an identifier with many distinct
# levels; with sparse cells the chi-square approximation may be unreliable -
# confirm before trusting its p-value.

imp = []
nimp = []

# Exclude the target by name rather than by position, so the loop stays correct
# if the column order of `str_data` ever changes.
for column in str_data.columns.drop('outcome'):
    # Create a contingency table
    contingency_table = pd.crosstab(train[column], train['outcome'])

    # Chi-square test. The unused results get real names instead of `_`,
    # because assigning to `_` stops IPython from tracking the last output.
    chi2, p, _dof, _expected = stats.chi2_contingency(contingency_table)

    # 0.05 is the significance level; a NaN p-value fails this comparison and
    # is therefore reported as not important.
    if p < 0.05:
        imp.append(column)
    else:
        nimp.append(column)

print(f"Imp Columns:-\nTotal: {len(imp)}\nColumns: {imp}")

# Two blank lines between the two reports.
print('\n')

print(f"Not Imp Columns:-\nTotal: {len(nimp)}\nColumns: {nimp}")


Imp Columns:-
Total: 17
Columns: ['surgery', 'age', 'hospital_number', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']


Not Imp Columns:-
Total: 0
Columns: []


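## ANOVA

Test each numeric column for a difference in means across the `outcome` classes using one-way ANOVA at the 0.05 significance level.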

In [18]:
# Everything that is not in `str_data` (i.e. the non-object columns), with the
# target appended again as the last column.
numeric_columns = (
    train
    .drop(columns=str_data.columns)
    .assign(outcome=train.outcome)
)
numeric_columns.head(n=5)

Unnamed: 0_level_0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3,Survival_Rate,outcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,38.1,132.0,24.0,6.5,57.0,8.5,3.4,2209,0,0,0.066667,died
1,37.5,88.0,12.0,2.0,33.0,64.0,2.0,2208,0,0,0.666667,euthanized
2,38.3,120.0,28.0,3.5,37.0,6.4,3.4,5124,0,0,1.0,lived
3,37.1,72.0,30.0,2.0,53.0,7.0,3.9,2208,0,0,1.0,lived
4,38.0,52.0,48.0,7.0,47.0,7.3,2.6,0,0,0,0.833333,lived


In [29]:
# One way Anova
#
# For every column of `numeric_columns` except the target, test whether its
# mean differs between the outcome classes. Columns with p <= 0.05 go to `imp`,
# all others (including tests whose p-value is undefined) go to `nimp`.
#
# NOTE(review): Survival_Rate was derived from `outcome` itself when the data
# was loaded, so its significance here is circular. lesion_1/2/3 look like
# coded identifiers rather than measurements - confirm that comparing their
# means is meaningful.
from scipy.stats import f_oneway
imp = []
nimp = []

# Split once by outcome instead of re-deriving the categories for each column.
by_outcome = numeric_columns.groupby('outcome')

# Exclude the target by name rather than relying on it being the last column.
for column in numeric_columns.columns.drop('outcome'):
    # f_oneway yields NaN as soon as one sample contains a missing value, so
    # drop missing values within each outcome group first.
    groups = [values.dropna() for _, values in by_outcome[column]]
    result = f_oneway(*groups)
    p = result[1]

    # Written as `p <= 0.05` on purpose: a NaN p-value (empty or constant
    # groups) fails the comparison and is reported as not important, instead
    # of slipping into the "important" branch as it did with `p > 0.05 ... else`.
    if p <= 0.05:
        # Reject the null hypothesis.
        imp.append(column)
    else:
        # Fail to reject the null hypothesis (or the test is undefined).
        nimp.append(column)


print(f"Imp Columns:-\nTotal: {len(imp)}\nColumns: {imp}")

# Two blank lines between the two reports.
print('\n')

print(f"Not Imp Columns:-\nTotal: {len(nimp)}\nColumns: {nimp}")


Imp Columns:-
Total: 7
Columns: ['pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein', 'abdomo_protein', 'Survival_Rate']


Not Imp Columns:-
Total: 4
Columns: ['rectal_temp', 'lesion_1', 'lesion_2', 'lesion_3']
