# Classifying Patients by Heart Attack Complication (Arrhythmia) Risk

> *Topic* : Random Forest Classifier for Feature Selection and KNN Algorithm

> *Student* : Sasha A.



## I. Model Creation

### Part 1 : Importing Libraries

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score


### Part 2 : Data Acquisition

In [None]:
# Location of the dataset CSV that the loading cell reads.
filename = "/content/Myocardial infarction complications Database.csv"

In [None]:
# Load the raw dataset into a DataFrame.
data = pd.read_csv(filename)

# Define features and target variable.
# The inputs are the file's 1-based columns 2-112. pandas positions are
# 0-based (ID is position 0, AGE is position 1), so that range is the slice
# [1:112]. The previous slice [2:113] dropped AGE and also took in 1-based
# column 113, which lies outside the stated input range.
features = data.columns[1:112]
target = 'FIBR_JELUD'

# Guard against label leakage: the target must never be among the inputs.
assert target not in features, f"target column {target!r} leaked into features"

### Part 3 : Data Pre-processing

In [None]:
# Display basic information about the data.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Columns: 124 entries, ID to LET_IS
dtypes: float64(110), int64(14)
memory usage: 1.6 MB
None
   ID   AGE  SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST  IBS_NASL   GB  \
0   1  77.0    1       2.0        1.0        1.0       2.0       NaN  3.0   
1   2  55.0    1       1.0        0.0        0.0       0.0       0.0  0.0   
2   3  52.0    1       0.0        0.0        0.0       2.0       NaN  2.0   
3   4  68.0    0       0.0        0.0        0.0       2.0       NaN  2.0   
4   5  60.0    1       0.0        0.0        0.0       2.0       NaN  3.0   

   SIM_GIPERT  ...  JELUD_TAH  FIBR_JELUD  A_V_BLOK  OTEK_LANC  RAZRIV  \
0         0.0  ...          0           0         0          0       0   
1         0.0  ...          0           0         0          0       0   
2         0.0  ...          0           0         0          0       0   
3         0.0  ...          0           0         0          0       0   


In [None]:
# Split the frame into the input matrix (X) and the label column (y).
X, y = data[features], data[target]

In [None]:
# Temporary gap-filling so the feature-selection model can be trained:
# every missing entry is replaced by the mean of its own column.
X = X.fillna(value=X.mean())

### Part 4 : Feature Selection/Extraction

In [None]:
# Train a random forest classifier to determine feature importance.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X, y)

# Select the top 10 features.
# SelectFromModel applies an importance threshold in addition to max_features,
# so with the default threshold fewer than 10 features could be returned.
# Setting the threshold to -inf disables that filter, which makes the
# selection depend on max_features alone (the 10 highest importances).
selector = SelectFromModel(
    forest, threshold=float("-inf"), max_features=10, prefit=True
)
X_reduced = selector.transform(X)
selected_features = X.columns[selector.get_support()]

# Sanity check: the selection really is "top 10" (or everything, if X is narrower).
assert len(selected_features) == min(10, X.shape[1])




In [None]:
# Show the names of the columns kept by the feature-selection step.
selected_features

Index(['S_AD_KBRIG', 'S_AD_ORIT', 'D_AD_ORIT', 'K_BLOOD', 'NA_BLOOD',
       'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE', 'LID_S_n'],
      dtype='object')

In [None]:
# Restrict the raw data to the columns chosen by feature selection.
X = data[selected_features]

# Replace every missing entry with the mean of its column.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the means - consider fitting on the training
# split only.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Confirm that no missing values remain, column by column.
remaining_nulls = pd.DataFrame(X_imputed, columns=selected_features).isnull().sum()
print(remaining_nulls)

S_AD_KBRIG    0
S_AD_ORIT     0
D_AD_ORIT     0
K_BLOOD       0
NA_BLOOD      0
ALT_BLOOD     0
AST_BLOOD     0
L_BLOOD       0
ROE           0
LID_S_n       0
dtype: int64


In [None]:
# Learn the label -> integer mapping from the target, then apply it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Show which original labels the encoder found.
print("Classes:", label_encoder.classes_)

Classes: [0 1]


### Part 5 : Training/Testing KNN Classifier Model

In [None]:
# Splitting the train/test data (70 % train / 30 % test, fixed seed).
# stratify=y_encoded makes both splits keep roughly the same class proportions
# as the full target, so neither split ends up short of one class by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded,
)

In [None]:
# Standardize the features: the scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a k-NN classifier on the standardized training data.
# n_neighbors is the hyperparameter to tune.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict labels for the held-out test split.
y_pred = knn.predict(X_test_scaled)


### Part 6 : Evaluation

In [None]:
# Overall accuracy on the held-out test split.
print('Accuracy:', accuracy_score(y_test, y_pred))

# Accuracy alone can hide poor performance on a rare class, so also report
# per-class precision, recall and F1. zero_division=0 reports 0 instead of
# emitting a warning for a class that is never predicted.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.9529411764705882


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes to compare: 3, 5, 7, 9, 11.
param_grid = {'n_neighbors': list(range(3, 12, 2))}

# 5-fold cross-validated search over the grid, scored by accuracy.
# NOTE(review): the inputs were standardized before the folds are formed, so
# each validation fold has already influenced the scaling statistics.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Take the best estimator found and score it on the held-out test split.
best_knn = grid_search.best_estimator_
y_pred_best = best_knn.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test, y_pred_best)
print('Best Model Accuracy:', best_accuracy)

Best Model Accuracy: 0.9529411764705882


## II. User Implementation

In [None]:
# Function to take user input for prediction
def get_user_input():
    """Prompt on stdin for one value per selected feature.

    Iterates over the notebook-level ``selected_features`` and asks for each
    value in turn. A blank answer is stored as NaN, so that the imputation
    step in ``predict_target`` can fill it. An answer that is not a number is
    rejected with a message and the same feature is asked again, instead of
    aborting the whole session with a ValueError.

    Returns:
        dict: feature name -> float, in ``selected_features`` order. Blank
        answers are represented by ``float("nan")``.
    """
    user_data = {}
    print("Please enter the following details:")
    for feature in selected_features:
        # Keep asking for this feature until the answer is blank or numeric.
        while True:
            raw_value = input(f"{feature}: ").strip()
            if not raw_value:
                # Unknown measurement: leave it for the imputer.
                user_data[feature] = float("nan")
                break
            try:
                user_data[feature] = float(raw_value)
                break
            except ValueError:
                print(
                    f"'{raw_value}' is not a number; enter a numeric value "
                    "or leave it blank if unknown."
                )
    return user_data

# Function to preprocess the user input and make a prediction
def predict_target(user_data):
    """Run one record through the fitted preprocessing steps and classifier.

    Uses the notebook-level ``selected_features``, ``imputer``, ``scaler``,
    ``knn`` and ``label_encoder`` objects.

    Args:
        user_data (dict): feature name -> numeric value. Keys may come in any
            order. Features that are absent (or NaN) are filled by the
            imputer, and keys that are not selected features are ignored.

    Returns:
        The predicted class for the record, mapped back to the original label
        values by ``label_encoder.inverse_transform``.
    """
    # Build a one-row frame whose columns follow the training feature order.
    # reindex adds a NaN column for any missing feature and drops unknown keys,
    # so the column layout no longer depends on the caller's dict.
    user_df = pd.DataFrame([user_data]).reindex(columns=selected_features)

    # Handle missing values
    user_df_imputed = imputer.transform(user_df)

    # Standardize the features
    user_df_scaled = scaler.transform(user_df_imputed)

    # Predict the target and translate it back to the original label
    prediction = knn.predict(user_df_scaled)
    predicted_class = label_encoder.inverse_transform(prediction)

    return predicted_class[0]


In [None]:
# Main function to run the prediction
if __name__ == "__main__":
    user_data = get_user_input()
    result = predict_target(user_data)
    # Name the column the classifier was actually trained on (`target`);
    # the message previously named LET_IS, which is not the modelled column.
    print(f"The predicted target feature ({target}) is: {result}")

Please enter the following details:
S_AD_KBRIG: 120
S_AD_ORIT: 110
D_AD_ORIT: 60
K_BLOOD: 3.9
NA_BLOOD: 136
ALT_BLOOD: 0.15
AST_BLOOD: 0.3
L_BLOOD: 10.7
ROE: 8
LID_S_n: 0
The predicted target feature (LET_IS) is: 0
