In [193]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler,MinMaxScaler

from sklearn import metrics
from sklearn.metrics import confusion_matrix

import redis
from kafka import KafkaConsumer
import json

import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

print("library completed")
print("library check.... pass")

library completed
library check.... pass


## Task 1) Preprocess the data

In [194]:
df = pd.read_csv("horse_2.csv")
df

Unnamed: 0,HN,age,rectal_temp,pulse,respiratory_rate,pain,abdominal_distention,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,outcome
0,HN001,adult,38.5,66.0,28.0,extreme_pain,severe,,45.0,8.4,,,no,0
1,HN002,adult,39.2,88.0,20.0,mild_pain,slight,,50.0,85.0,cloudy,2.0,no,0
2,HN003,adult,38.3,40.0,24.0,mild_pain,none,,33.0,6.7,,,no,1
3,HN004,young,39.1,164.0,84.0,depressed,severe,5.0,48.0,7.2,serosanguious,5.3,yes,0
4,HN005,adult,37.3,104.0,35.0,,,,74.0,7.4,,,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,HN295,adult,,120.0,70.0,depressed,,,55.0,65.0,,,no,0
295,HN296,adult,37.2,72.0,24.0,severe_pain,moderate,,44.0,,serosanguious,3.3,yes,0
296,HN297,adult,37.5,72.0,30.0,severe_pain,moderate,,60.0,6.8,,,yes,0
297,HN298,adult,36.5,100.0,24.0,mild_pain,moderate,,50.0,6.0,serosanguious,3.4,yes,1


In [195]:
df.isna().sum() > 149

HN                       False
age                      False
rectal_temp              False
pulse                    False
respiratory_rate         False
pain                     False
abdominal_distention     False
nasogastric_reflux_ph     True
packed_cell_volume       False
total_protein            False
abdomo_appearance         True
abdomo_protein            True
surgical_lesion          False
outcome                  False
dtype: bool

In [196]:
col_to_drop = df.columns[df.isna().sum() > 149]
print("Number of Column to drop:", len(col_to_drop))

Number of Column to drop: 3


In [197]:
df.drop(['nasogastric_reflux_ph','abdomo_appearance','abdomo_protein'],axis=1, inplace=True)
df.shape

(299, 11)

In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HN                    299 non-null    object 
 1   age                   299 non-null    object 
 2   rectal_temp           239 non-null    float64
 3   pulse                 275 non-null    float64
 4   respiratory_rate      241 non-null    float64
 5   pain                  244 non-null    object 
 6   abdominal_distention  243 non-null    object 
 7   packed_cell_volume    270 non-null    float64
 8   total_protein         266 non-null    float64
 9   surgical_lesion       299 non-null    object 
 10  outcome               299 non-null    int64  
dtypes: float64(5), int64(1), object(5)
memory usage: 25.8+ KB


## Task 2) 2.	Impute missing values

In [199]:
from sklearn.impute import SimpleImputer

numeric_columns = df.select_dtypes(include=['number']).columns
print("Numeric Columns:",numeric_columns)

num_imp=SimpleImputer(missing_values=np.NaN, strategy='mean')
num_imp

Numeric Columns: Index(['rectal_temp', 'pulse', 'respiratory_rate', 'packed_cell_volume',
       'total_protein', 'outcome'],
      dtype='object')


In [200]:
df[df["pulse"].isna()]

Unnamed: 0,HN,age,rectal_temp,pulse,respiratory_rate,pain,abdominal_distention,packed_cell_volume,total_protein,surgical_lesion,outcome
5,HN006,adult,,,,depressed,slight,,,no,1
28,HN029,adult,,,,,,,,no,0
52,HN053,adult,,,,alert,none,43.0,7.7,no,1
56,HN057,adult,,,,,,24.0,6.7,yes,1
58,HN059,adult,,,20.0,extreme_pain,moderate,53.0,5.9,yes,0
74,HN075,young,,,,,,37.0,4.9,yes,0
78,HN079,adult,,,,extreme_pain,moderate,46.0,5.9,yes,0
83,HN084,adult,38.0,,24.0,extreme_pain,severe,68.0,7.8,yes,0
93,HN094,adult,,,,extreme_pain,moderate,38.0,6.5,yes,0
115,HN116,adult,,,40.0,mild_pain,slight,45.0,70.0,no,1


In [201]:
filtered_loans_num = df[numeric_columns]

df[numeric_columns]=pd.DataFrame(num_imp.fit_transform(filtered_loans_num))
display(df.iloc[28].pulse)

72.0

## Task 3) Create Dummy

In [202]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore') # this feature will be all zeros
# passing bridge-types-cat column (label encoded values of bridge_types)
nominal_columns = ['pain', 'abdominal_distention', 'age', 'surgical_lesion']
print(enc.fit_transform(df[nominal_columns]).toarray().shape)
enc_df = pd.DataFrame(enc.fit_transform(df[nominal_columns]).toarray())
enc_df

(299, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
295,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
296,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
297,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


## Task 4) Train/Test split 

In [203]:
cleaned_df = pd.read_csv("cleaned_horse_2.csv")
cleaned_df

Unnamed: 0,HN,rectal_temp,pulse,respiratory_rate,packed_cell_volume,total_protein,pain_depressed,pain_extreme_pain,pain_mild_pain,pain_severe_pain,abdominal_distention_none,abdominal_distention_severe,abdominal_distention_slight,age_young,surgical_lesion_yes,outcome
0,HN001,38.5,66,28,45.0,8.4,0,1,0,0,0,1,0,0,0,0
1,HN002,39.2,88,20,50.0,85.0,0,0,1,0,0,0,1,0,0,0
2,HN003,38.3,40,24,33.0,6.7,0,0,1,0,1,0,0,0,0,1
3,HN004,39.1,164,84,48.0,7.2,1,0,0,0,0,1,0,1,1,0
4,HN005,37.3,104,35,74.0,7.4,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,HN295,38.0,120,70,55.0,65.0,1,0,0,0,1,0,0,0,0,0
295,HN296,37.2,72,24,44.0,24.0,0,0,0,1,0,0,0,0,1,0
296,HN297,37.5,72,30,60.0,6.8,0,0,0,1,0,0,0,0,1,0
297,HN298,36.5,100,24,50.0,6.0,0,0,1,0,0,0,0,0,1,1


In [204]:
cleaned_df.index = cleaned_df["HN"]
cleaned_df.drop("HN",inplace=True,axis=1)

cleaned_df

Unnamed: 0_level_0,rectal_temp,pulse,respiratory_rate,packed_cell_volume,total_protein,pain_depressed,pain_extreme_pain,pain_mild_pain,pain_severe_pain,abdominal_distention_none,abdominal_distention_severe,abdominal_distention_slight,age_young,surgical_lesion_yes,outcome
HN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
HN001,38.5,66,28,45.0,8.4,0,1,0,0,0,1,0,0,0,0
HN002,39.2,88,20,50.0,85.0,0,0,1,0,0,0,1,0,0,0
HN003,38.3,40,24,33.0,6.7,0,0,1,0,1,0,0,0,0,1
HN004,39.1,164,84,48.0,7.2,1,0,0,0,0,1,0,1,1,0
HN005,37.3,104,35,74.0,7.4,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HN295,38.0,120,70,55.0,65.0,1,0,0,0,1,0,0,0,0,0
HN296,37.2,72,24,44.0,24.0,0,0,0,1,0,0,0,0,1,0
HN297,37.5,72,30,60.0,6.8,0,0,0,1,0,0,0,0,1,0
HN298,36.5,100,24,50.0,6.0,0,0,1,0,0,0,0,0,1,1


In [207]:
from sklearn.model_selection import train_test_split

X = cleaned_df.drop('outcome', axis=1)
y = cleaned_df['outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2024, stratify=y)

print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 209
Testing set size: 90


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [226]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators = 100, min_samples_leaf=7, max_depth = 5, random_state = 2024)
rf_classifier.fit(X_train, y_train)

In [229]:
rf_classifier.feature_importances_

array([0.09719539, 0.25277713, 0.04690264, 0.19543382, 0.05681744,
       0.0095062 , 0.05301948, 0.04501381, 0.08815997, 0.01470642,
       0.01387495, 0.02056337, 0.0048064 , 0.10122298])