<a href="https://colab.research.google.com/github/sebastian-dv/CSE-151A-Project/blob/main/Model%203.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model 3 - Support Vector Machine

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [9]:
# Download the UCI dataset (id 880) and keep only the columns used in this notebook.
df = pd.read_csv(
    'https://archive.ics.uci.edu/static/public/880/data.csv'
).loc[:, [
    'age', 'sex', 'death', 'dzgroup', 'scoma', 'race', 'sps', 'aps',
    'diabetes', 'dementia', 'meanbp', 'wblc', 'hrt', 'resp', 'temp',
    'pafi', 'alb', 'bili', 'crea', 'sod', 'ph',
]]

In [10]:
# One-hot encode race and swap the raw column for its dummy columns.
# NOTE(review): the raw 'race' column is removed before dropna below, so a
# missing race by itself does not drop a row (get_dummies' default leaves such
# rows as all-zero dummies) — confirm this is intended.
ohe = pd.get_dummies(df[['race']])
df = df.drop(['race'], axis = 1)
df = pd.concat([df, ohe], axis = 1)

# Drop every row that still has a missing value in any remaining column
df = df.dropna(axis = 0, how = 'any')

# Encode sex as 0/1. The result is assigned back to the column: calling
# replace(..., inplace=True) on df['sex'] is a chained in-place update, which
# warns on recent pandas and has no effect once Copy-on-Write is enabled.
# Assumes the column only contains 'female'/'male' (any other label maps to NaN) — TODO confirm.
df['sex'] = df['sex'].map({'female': 0, 'male': 1})

# One-hot encode dzgroup; it is the prediction target, so it is removed from the features
ohe = pd.get_dummies(df[['dzgroup']])
df = df.drop(['dzgroup'], axis = 1)

# X: every remaining column; y: one-hot dzgroup labels; df: features and labels side by side
X = df.copy()
y = ohe.copy()
df = pd.concat([df, ohe], axis = 1)

### StandardScaler

In [11]:
# The one-hot race columns are passed through unscaled. They are selected by
# name prefix rather than by a hard-coded position so the split does not
# silently break if the column order of X changes.
race_cols = [col for col in X.columns if col.startswith('race_')]
non_num_X = X[race_cols]
num_X = X.drop(columns=race_cols)
# NOTE(review): 'sex' is removed from num_X and is not part of non_num_X, so it
# does not appear in X_scaled at all — confirm that excluding it is intended.
num_X = num_X.drop(columns=['sex'])

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — consider fitting on the
# training rows only.
scaler = StandardScaler()

# Standardize the remaining columns, then append the unscaled race dummies on the right
X_scaled = scaler.fit_transform(num_X)
X_scaled = np.concatenate([X_scaled, non_num_X.values], axis=1)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=21,
)

In [14]:
# C is the SVM regularization parameter and is passed to the estimator below.
# 1.0 is scikit-learn's default for SVC, i.e. the value the model uses when C
# is not given explicitly.
C = 1.0  # able to change
# Fit a degree-2 polynomial-kernel SVM on the training split of the
# standardized feature matrix.
svm = SVC(kernel ='poly', degree = 2, C = C)
# y is one-hot encoded, so idxmax(axis=1) recovers one class label per row
svm.fit(X_train, y_train.idxmax(axis=1).values)
y_true = y_test.idxmax(axis=1).values
y_pred = svm.predict(X_test)
# zero_division=0 reports 0 instead of warning for classes that are never predicted
print(classification_report(y_true, y_pred, zero_division=0))
# Evaluate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

                           precision    recall  f1-score   support

dzgroup_ARF/MOSF w/Sepsis       0.52      0.89      0.66       333
              dzgroup_CHF       0.45      0.29      0.35        96
             dzgroup_COPD       0.37      0.23      0.28        83
        dzgroup_Cirrhosis       0.67      0.08      0.15        49
     dzgroup_Colon Cancer       0.00      0.00      0.00        18
             dzgroup_Coma       0.70      0.40      0.51        77
      dzgroup_Lung Cancer       0.32      0.21      0.26        42
     dzgroup_MOSF w/Malig       0.64      0.10      0.17        70

                 accuracy                           0.51       768
                macro avg       0.46      0.28      0.30       768
             weighted avg       0.51      0.51      0.45       768

Accuracy: 0.5130208333333334


## Try to find optimal parameters


In [15]:
# Search for better SVM hyperparameters with a 3-fold cross-validated grid search.

# Define the hyperparameter grid. gamma only affects the 'rbf' and 'poly'
# kernels, so the linear kernel gets its own grid without gamma; a single
# combined grid would refit the same linear model once per gamma value.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]},
]
# Create an SVM classifier
svm_classifier = SVC()
# Create the GridSearchCV object
grid_search = GridSearchCV(svm_classifier, param_grid, cv=3, scoring='accuracy')
# Fit the model with every hyperparameter combination
# (y is one-hot encoded, so idxmax(axis=1) recovers one class label per row)
grid_search.fit(X_train, y_train.idxmax(axis=1).values)
# Get the best hyperparameters
best_params = grid_search.best_params_
# Get the best model
best_model = grid_search.best_estimator_
# Evaluate the best model on the test set
accuracy = best_model.score(X_test, y_test.idxmax(axis=1).values)
print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy on Test Set: {accuracy}")

Best Hyperparameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy on Test Set: 0.53515625


## Oversampling with scaled data

### SMOTE


In [16]:
# Oversample the training split with SMOTE. Only the training data is
# resampled; y is one-hot encoded, so it is first collapsed to one label per row.
smote = SMOTE(random_state=21)
train_labels = y_train.idxmax(axis=1).values
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, train_labels)

# Report how many training instances each class has after oversampling
unique_classes_resampled, class_counts_resampled = np.unique(
    y_train_resampled, return_counts=True
)
for position, class_label in enumerate(unique_classes_resampled):
    count = class_counts_resampled[position]
    print(f"Frequency of Class {class_label}: {count} instances")

Frequency of Class dzgroup_ARF/MOSF w/Sepsis: 1392 instances
Frequency of Class dzgroup_CHF: 1392 instances
Frequency of Class dzgroup_COPD: 1392 instances
Frequency of Class dzgroup_Cirrhosis: 1392 instances
Frequency of Class dzgroup_Colon Cancer: 1392 instances
Frequency of Class dzgroup_Coma: 1392 instances
Frequency of Class dzgroup_Lung Cancer: 1392 instances
Frequency of Class dzgroup_MOSF w/Malig: 1392 instances


### Evaluation

In [17]:
# Train an RBF-kernel SVM on the resampled training set and evaluate it on the
# original (not resampled) test split.
# NOTE(review): gamma=0.1 differs from the gamma=0.01 in the grid-search output
# recorded in this notebook — confirm which value is intended.
svm_classifier = SVC(kernel='rbf', C = 10, gamma = 0.1)
svm_classifier.fit(X_train_resampled, y_train_resampled)
# y_test is one-hot encoded, so idxmax(axis=1) recovers one class label per row
y_true = y_test.idxmax(axis=1).values
y_pred = svm_classifier.predict(X_test)
# zero_division=0 matches the other reports in this notebook: a class that is
# never predicted is reported as 0 instead of raising a warning
print(classification_report(y_true, y_pred, zero_division=0))
# Evaluate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

                           precision    recall  f1-score   support

dzgroup_ARF/MOSF w/Sepsis       0.60      0.71      0.65       333
              dzgroup_CHF       0.41      0.38      0.39        96
             dzgroup_COPD       0.31      0.29      0.30        83
        dzgroup_Cirrhosis       0.42      0.33      0.37        49
     dzgroup_Colon Cancer       0.06      0.06      0.06        18
             dzgroup_Coma       0.55      0.39      0.45        77
      dzgroup_Lung Cancer       0.28      0.26      0.27        42
     dzgroup_MOSF w/Malig       0.30      0.24      0.27        70

                 accuracy                           0.48       768
                macro avg       0.37      0.33      0.34       768
             weighted avg       0.47      0.48      0.47       768

Accuracy: 0.484375


### RandomOverSampler

In [19]:
# Oversample the training split with RandomOverSampler. Only the training data
# is resampled; y is one-hot encoded, so it is first collapsed to one label per row.
rs = RandomOverSampler(random_state=11)
train_labels = y_train.idxmax(axis=1).values
X_train_resampled, y_train_resampled = rs.fit_resample(X_train, train_labels)

# Report how many training instances each class has after oversampling
unique_classes_resampled, class_counts_resampled = np.unique(
    y_train_resampled, return_counts=True
)
for position, class_label in enumerate(unique_classes_resampled):
    count = class_counts_resampled[position]
    print(f"Frequency of Class {class_label}: {count} instances")


Frequency of Class dzgroup_ARF/MOSF w/Sepsis: 1392 instances
Frequency of Class dzgroup_CHF: 1392 instances
Frequency of Class dzgroup_COPD: 1392 instances
Frequency of Class dzgroup_Cirrhosis: 1392 instances
Frequency of Class dzgroup_Colon Cancer: 1392 instances
Frequency of Class dzgroup_Coma: 1392 instances
Frequency of Class dzgroup_Lung Cancer: 1392 instances
Frequency of Class dzgroup_MOSF w/Malig: 1392 instances


### Evaluation

In [20]:
# Train an RBF-kernel SVM on the resampled training set and evaluate it on the
# original (not resampled) test split.
# NOTE(review): gamma=0.1 differs from the gamma=0.01 in the grid-search output
# recorded in this notebook — confirm which value is intended.
svm_classifier = SVC(kernel='rbf', C = 10, gamma = 0.1)
svm_classifier.fit(X_train_resampled, y_train_resampled)
# y_test is one-hot encoded, so idxmax(axis=1) recovers one class label per row
y_true = y_test.idxmax(axis=1).values
y_pred = svm_classifier.predict(X_test)
# zero_division=0 matches the other reports in this notebook: a class that is
# never predicted is reported as 0 instead of raising a warning
print(classification_report(y_true, y_pred, zero_division=0))
# Evaluate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

                           precision    recall  f1-score   support

dzgroup_ARF/MOSF w/Sepsis       0.61      0.75      0.67       333
              dzgroup_CHF       0.48      0.43      0.45        96
             dzgroup_COPD       0.32      0.29      0.30        83
        dzgroup_Cirrhosis       0.50      0.37      0.42        49
     dzgroup_Colon Cancer       0.06      0.06      0.06        18
             dzgroup_Coma       0.56      0.40      0.47        77
      dzgroup_Lung Cancer       0.20      0.17      0.18        42
     dzgroup_MOSF w/Malig       0.36      0.27      0.31        70

                 accuracy                           0.51       768
                macro avg       0.39      0.34      0.36       768
             weighted avg       0.49      0.51      0.49       768

Accuracy: 0.5091145833333334


# K-Nearest Neighbor

In [22]:
# Reload the UCI dataset (id 880) so the KNN section starts from the raw data,
# keeping only the columns used in this notebook.
df = pd.read_csv(
    'https://archive.ics.uci.edu/static/public/880/data.csv'
).loc[:, [
    'age', 'sex', 'death', 'dzgroup', 'scoma', 'race', 'sps', 'aps',
    'diabetes', 'dementia', 'meanbp', 'wblc', 'hrt', 'resp', 'temp',
    'pafi', 'alb', 'bili', 'crea', 'sod', 'ph',
]]

In [23]:
# One-hot encode race and swap the raw column for its dummy columns
ohe = pd.get_dummies(df[['race']])
df = df.drop(['race'], axis = 1)
df = pd.concat([df, ohe], axis = 1)

# Drop every row that still has a missing value in any remaining column
df = df.dropna(axis = 0, how = 'any')

# Encode sex as 0/1. The result is assigned back to the column: calling
# replace(..., inplace=True) on df['sex'] is a chained in-place update, which
# warns on recent pandas and has no effect once Copy-on-Write is enabled.
# Assumes the column only contains 'female'/'male' (any other label maps to NaN) — TODO confirm.
df['sex'] = df['sex'].map({'female': 0, 'male': 1})

# dzgroup is the target and is kept as a single label column (not one-hot
# encoded here); every other column is a feature.
X = df.copy().drop(columns = 'dzgroup')
y = df['dzgroup']

In [26]:
# Column groups: these categorical columns and race dummies are left unscaled;
# everything else is treated as numerical and standardized.
categorical_cols = ['sex','death','diabetes','dementia']
race_cols = ['race_asian','race_black','race_hispanic','race_other','race_white']

X_cat = X[categorical_cols]
X_one_hot = X[race_cols]
X_numerical = X.drop(columns = categorical_cols + race_cols)
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — consider fitting on the
# training rows only.
X_numerical_scaled = pd.DataFrame(
    scaler.fit_transform(X_numerical),
    columns=X_numerical.columns,
    index=X_numerical.index,
)
# Reassemble with pandas so the column names and the row index (shared with y)
# are preserved; the column order is numerical, race dummies, categorical.
# astype(float) gives one numeric dtype across the float/bool/int parts.
X_scaled = pd.concat([X_numerical_scaled, X_one_hot, X_cat], axis=1).astype(float)

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=21,
)

In [24]:
# Oversampling (SMOTE) is left disabled for KNN because it decreased accuracy.

# smote = SMOTE()
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [33]:
# K-nearest-neighbours classifier; its metrics come out close to the SVM's
k = 10
knn_classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# Score the held-out split; zero_division=0 reports 0 for classes that are never predicted
y_true, y_pred = y_test, knn_classifier.predict(X_test)
print(classification_report(y_true, y_pred, zero_division = 0))

                   precision    recall  f1-score   support

ARF/MOSF w/Sepsis       0.53      0.83      0.65       333
              CHF       0.41      0.35      0.38        96
             COPD       0.30      0.31      0.31        83
        Cirrhosis       0.57      0.16      0.25        49
     Colon Cancer       0.00      0.00      0.00        18
             Coma       0.73      0.31      0.44        77
      Lung Cancer       0.31      0.10      0.15        42
     MOSF w/Malig       0.50      0.10      0.17        70

         accuracy                           0.50       768
        macro avg       0.42      0.27      0.29       768
     weighted avg       0.49      0.50      0.44       768

