Name: Shobhit Verma\
Class: BE COMPS\
Batch: D\
Roll No: 33 \
UID: 2018130062
# <center> Experiment - 1B </center>

In [65]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.manifold import Isomap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# **LOADING THE DATAFRAME**

In [2]:
df = pd.read_csv ('../input/horse-colic/horse.csv')

In [3]:
dfTrain,dfTest = train_test_split(df, test_size=0.3, random_state=0)

In [4]:
dfTrain.describe()

# **FEATURE ENGINEERING**

Exploring redundant features and deciding which(if any) must be eliminated

In [5]:
dfTrain.drop(columns = ['cp_data', 'hospital_number'], axis = 1, inplace = True)
dfTest.drop(columns = ['cp_data', 'hospital_number'], axis = 1, inplace = True)

In [6]:
#The features lesion1, 2 and 3 will give us trouble because the way they are formated doesn't make any sense. 
#Therefore, we will create a new feature that counts the number of lesions the horse has to replace them.
dfTrain.loc[dfTrain['lesion_1'] > 0, 'lesion_1'] = 1
dfTrain.loc[dfTrain['lesion_2'] > 0, 'lesion_2'] = 1
dfTrain.loc[dfTrain['lesion_3'] > 0, 'lesion_3'] = 1

dfTrain['num_lesions'] = dfTrain['lesion_1'] + dfTrain['lesion_2'] + dfTrain['lesion_3']
dfTrain = dfTrain.drop(columns = ['lesion_1','lesion_2', 'lesion_3'], axis = 1)
dfTrain['num_lesions'].value_counts()

In [7]:
dfTest.loc[dfTest['lesion_1'] > 0, 'lesion_1'] = 1
dfTest.loc[dfTest['lesion_2'] > 0, 'lesion_2'] = 1
dfTest.loc[dfTest['lesion_3'] > 0, 'lesion_3'] = 1

dfTest['num_lesions'] = dfTest['lesion_1'] + dfTest['lesion_2'] + dfTest['lesion_3']
dfTest = dfTest.drop(columns = ['lesion_1','lesion_2', 'lesion_3'], axis = 1)

In [8]:
def new_punctuation(df): #This manual ordinal encoder was done looking at the documentation to ensure they follow the correct scale (for ex: more pain -> bigger number)
    df['surgery'] = df['surgery'].map({'yes':1,'no':2}).astype('float64')
    df['age'] = df['age'].map({'adult':1,'young':2}).astype('float64')
    df['temp_of_extremities'] = df['temp_of_extremities'].map({'normal':1,'warm':2,'cool':3,'cold':4}).astype('float64')
    df['peripheral_pulse'] = df['peripheral_pulse'].map({'increased':2,'normal':1,'reduced':3,'absent':4}).astype('float64')
    df['mucous_membrane'] = df['mucous_membrane'].map({'dark_cyanotic':6,'bright_red':5,'pale_cyanotic':4,'pale_pink':3,'bright_pink':2,'normal_pink':1}).astype('float64')
    df['capillary_refill_time'] = df['capillary_refill_time'].map({'more_3_sec':3, '3':2, 'less_3_sec':1}).astype('float64')
    df['pain'] = df['pain'].map({'extreme_pain':5, 'severe_pain':4, 'mild_pain':3, 'depressed':2, 'alert':1}).astype('float64')
    df['peristalsis'] = df['peristalsis'].map({'absent':4, 'hypomotile':3, 'normal':2, 'hypermotile':1}).astype('float64')
    df['abdominal_distention'] = df['abdominal_distention'].map({'severe':4,'moderate':3,'slight':2,'none':1}).astype('float64')
    df['nasogastric_tube'] = df['nasogastric_tube'].map({'significant':3, 'slight':2, 'none':1}).astype('float64')
    df['nasogastric_reflux'] = df['nasogastric_reflux'].map({'more_1_liter':3, 'less_1_liter':2, 'none':0}).astype('float64')
    df['rectal_exam_feces'] = df['rectal_exam_feces'].map({'absent':4, 'decreased':3, 'increased':2, 'normal':1}).astype('float64')
    df['abdomen'] = df['abdomen'].map({'distend_large':5, 'distend_small':4, 'firm':3, 'other':2, 'normal':1}).astype('float64')
    df['abdomo_appearance'] = df['abdomo_appearance'].map({'serosanguious':3, 'cloudy':2, 'clear':1}).astype('float64')
    df['outcome'] = df['outcome'].map({'euthanized':3, 'died':2, 'lived':1}).astype('float64')
    df['surgical_lesion'] = df['surgical_lesion'].map({'yes':1,'no':0}).astype('float64')
    return df

In [9]:
dfTrain = new_punctuation(dfTrain)
dfTest = new_punctuation(dfTest)
dfTrain.dtypes

In [10]:
y_test = dfTest['outcome']
dfTest.drop(columns = ['outcome'], axis = 1, inplace = True)

# **DATA VISUALIZATION**

Now lets take a glance at the correlation between features to decide how we will fill the missing values or nulls.

In [11]:
plt.figure(figsize=(24,12))
sns.heatmap(dfTrain.corr(),cmap='magma_r',annot=True)

In [12]:
fig,ax = plt.subplots(3,1,figsize=(15,15))
sns.lineplot(x=dfTrain['rectal_temp'],y=dfTrain.outcome,ax=ax[0],color='r') #We visualize 3 features with different levels of correlation with outcome
sns.lineplot(x=dfTrain['rectal_exam_feces'],y=dfTrain.outcome,ax=ax[1],color='b')
sns.lineplot(x=dfTrain['peripheral_pulse'],y=dfTrain.outcome,ax=ax[2],color='g')

In [13]:
chosen_cols = ['rectal_temp', 'pulse', 'respiratory_rate', 'packed_cell_volume', 'outcome']
sns.pairplot(dfTrain[chosen_cols], hue='outcome', palette = 'viridis'); #We can see a clear correlation for example with high packed cell volume meaning no survival (the same with pulse not with resprate)

In [14]:
sns.pairplot(dfTrain[chosen_cols], kind="kde"); #This graph helps us show where most values are concentrated in some numerical features

In [15]:
sns.countplot(data=dfTrain, x='pain', hue = 'outcome') #We can see how the bigger the pain, the less the chance of survival

In [16]:
sns.countplot(data=dfTrain, x='mucous_membrane', hue = 'outcome') #We can see how the worse the circulation-> less the chance of survival

In [17]:
sns.countplot(data=dfTrain, x='capillary_refill_time', hue = 'outcome') #We can see how the worse the circulation-> less the chance of survival

In [18]:
sns.countplot(data=dfTrain, x='peristalsis', hue = 'outcome') #the lesser the activity on the horses gut the lesser the chance of survival

In [19]:
sns.countplot(data=dfTrain, x='abdominal_distention', hue = 'outcome') #the more distended the abdomen the lesser the chance of survival (it means more pain like the documentation says)

In [20]:
sns.countplot(data=dfTrain, x='age', hue = 'outcome') #surprisingly, the age of the horse doesn't have a big impact on the outcome of the surgery
#however younger horses tend to survive less

# **FILLING MISSING VALUES**

In [21]:
dfTrain.dtypes

In [22]:
dfTrain.isna().sum() #There are no columns that have null values in the test dataframe but not in the train dataframe

In [23]:
#We start finding which columns to eliminate
for col in dfTrain.columns:
  if dfTrain[col].isna().sum() > 120: #More than 50% missing values
    print('Column ' + col + ' --> NULL VALUES: ' + str(dfTrain[col].isna().sum()) + ' --> Correlation with target of ' + str(dfTrain.corr()['outcome'][col]))

In [24]:
#We drop columns nasogastric_reflux_ph and abdomo_protein as they don't hold a significant correlation with the target and they have > 50% of null values in the train dataset
#Filling their null values would cause more harm than good
dfTrain = dfTrain.drop(columns = ['abdomo_protein', 'nasogastric_reflux_ph'], axis = 1)
dfTest = dfTest.drop(columns = ['abdomo_protein', 'nasogastric_reflux_ph'], axis = 1)

In [25]:
dfTrain.shape

In [33]:
plt.figure(figsize=(26,20))
sns.heatmap(dfTrain.isnull(), cbar=False)

In [34]:
#We will use different approaches to fill the missing values on categorical and numerical variables

In [35]:
#We create vectors for the categorical and the numerical features that have missing values
cat_features = ['surgery' , 'age', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 
       'rectal_exam_feces', 'abdomen', 'abdomo_appearance']
       
num_features = ['rectal_temp', 'pulse', 'respiratory_rate', 'packed_cell_volume', 'total_protein']

LETS FIRST FILL THE MISSING VALUES OF THE CATEGORICAL FEATURES

In [36]:
dfTrain.dtypes

In [37]:
#We will use a KNNImputer to imput the categorical features
dfTestOrig = dfTest
KNNimpTR = KNNImputer(n_neighbors=1)
dfTrain = pd.DataFrame(KNNimpTR.fit_transform(dfTrain),columns = dfTrain.columns)
KNNimpTS = KNNImputer(n_neighbors=1)
dfTest = pd.DataFrame(KNNimpTS.fit_transform(dfTest),columns = dfTest.columns)

In [38]:
plt.figure(figsize=(16,8))
sns.heatmap(dfTrain.isnull(), cbar=False) #The last row has null on all the categorical variables so there's no way to fill it with the KNN imputer

In [39]:
dfTrain.isna().sum() #No missing values remain!

# OUTLIER DETECTION

In [44]:
dfTrain[num_features].boxplot(figsize = (20,10))

In [41]:
sns.boxplot(dfTrain['pulse'])

In [42]:
sns.boxplot(dfTrain['respiratory_rate'])

In [None]:
sns.boxplot(dfTrain['packed_cell_volume']) #We can ignore these small outliers

We will delete the outliers

In [45]:
Q1 = dfTrain['respiratory_rate'].quantile(0.15)
Q3 = dfTrain['respiratory_rate'].quantile(0.85)
IQR = Q3 - Q1
big_outliers = dfTrain['respiratory_rate'] > (Q3 + 1.5 * IQR)
dfTrain[big_outliers] 

In [46]:
dfTrain[big_outliers].index

In [47]:
dfTrain.drop(labels=dfTrain[big_outliers].index, axis=0, inplace = True)

In [48]:
Q1 = dfTrain['pulse'].quantile(0.25)
Q3 = dfTrain['pulse'].quantile(0.75)
IQR = Q3 - Q1
big_outliers_2 = dfTrain['pulse'] > (Q3 + 1.5 * IQR)
dfTrain[big_outliers_2] #We will drop these rows with outliers??????????????????????????????????????????????????????????? -> Not for now

In [49]:
dfTrain.drop(labels=dfTrain[big_outliers_2].index, axis=0, inplace = True)

# **NORMALIZATION OF THE VARIABLES**

In [50]:
#We also standardize the categorical variables
target = dfTrain['outcome']
dfTrain = dfTrain.drop(columns = ['outcome'], axis = 1) #We extract the target to avoid standardizing it
features = dfTrain.columns
scaler = StandardScaler()
scaledTrain = scaler.fit_transform(dfTrain)
scaledTest = scaler.transform(dfTest) #we transform the test set with the model trained on the train set
dfTrain = pd.DataFrame(data=scaledTrain, columns=features)
dfTest = pd.DataFrame(data=scaledTest, columns=features) #We rebuild to a dataframe format"""

In [52]:
dfTrain.hist(bins=22, figsize=(20, 20))

# **TRANSFORMATION OF VARIABLES**

In [53]:
print(dfTrain['pulse'].skew()) #We will fix variables with a skewness > 0.5
print(dfTrain['respiratory_rate'].skew())
print(dfTrain['total_protein'].skew())
print(dfTrain['packed_cell_volume'].skew())
print(dfTrain['rectal_temp'].skew())

In [54]:
sns.distplot(dfTrain["pulse"] , color = "b", hist_kws={"alpha": 0.4});

In [55]:
sns.distplot(dfTrain["respiratory_rate"] , color = "b", hist_kws={"alpha": 0.4});

In [56]:
sns.distplot(dfTrain["total_protein"] , color = "b", hist_kws={"alpha": 0.4});

In [57]:
sns.distplot(dfTrain["packed_cell_volume"] , color = "b", hist_kws={"alpha": 0.4}); #This distribution looks less skewed (more gaussian)

In [58]:
cols_not_normal = ['pulse', 'respiratory_rate', 'total_protein']

PT = PowerTransformer()

PTx_train = PT.fit_transform(dfTrain)
PTx_test = PT.transform(dfTest)

dfTrain2 = pd.DataFrame(data=PTx_train, columns=features)
dfTest2 = pd.DataFrame(data=PTx_test, columns=features)

dfTrain[cols_not_normal] = dfTrain2[cols_not_normal]
dfTest[cols_not_normal] = dfTest2[cols_not_normal]

dfTrain.insert(21,"outcome",target.values) #we add the target feature again

In [59]:
dfTrain.hist(bins=22, figsize=(20, 20))

In [60]:
print(dfTrain['pulse'].skew())
print(dfTrain['respiratory_rate'].skew())
print(dfTrain['total_protein'].skew()) #Skewness fixed!

In [61]:
dfTrain.corr()['outcome']['total_protein'] #No correlation!

Since total_protein doesn't improve its skewness (its distribution is totally abnormal) and its correlation with the target is inexistent, we decide to eliminate it.

In [62]:
dfTrain.drop(columns=['total_protein'], axis = 1, inplace = True)
dfTest.drop(columns=['total_protein'], axis = 1, inplace = True)

# **MODEL SELECTION**

In [63]:
y_train = dfTrain['outcome']
x_train = dfTrain.drop(columns = ['outcome'], axis = 1)
x_test = dfTest

**Logistic Regression**

In [66]:
lr = LogisticRegression(random_state=0)
param_grid={"C":np.logspace(-3,3,10)}
grid = GridSearchCV(lr, param_grid, cv=5, verbose=0)
grid_search=grid.fit(x_train, y_train)
print('The best value found for the hyperparameter C is ' + str(grid_search.best_params_['C']))
print('The best result on the training set using 5-Fold CV was ' + str(grid_search.best_score_))
y_pred = grid_search.predict(x_test)
print('The best result predicting the test set was ' + str(accuracy_score(y_test, y_pred)))
confusion_matrix(y_test, y_pred)

## Conclusion

1. Performed Exploratory Data Analysis on horse-colic dataset.
2. Applying logistic regression model on the dataset, I found that:

* The best value found for the hyperparameter C is 0.004641588833612777
* The best result on the training set using 5-Fold CV was 0.6767948717948717
* The best result predicting the test set was 0.5444444444444444

# **THE END!**