# Final Project - Question 2

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

In [18]:
# Load the UCI HCV data set; the first CSV column holds the row id and becomes the index.
hcv = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv',
    index_col=0,
)
# Discard every row that contains at least one missing value.
hcv.dropna(inplace=True)
hcv.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [19]:
# Encode the diagnosis category as a string class code and the sex as a 0/1 flag,
# then remove the two raw text columns they were derived from.
map_response = {
    '0=Blood Donor': '0',
    '0s=suspect Blood Donor': '1',
    '1=Hepatitis': '2',
    '2=Fibrosis': '3',
    '3=Cirrhosis': '4',
}
hcv['response'] = hcv['Category'].map(map_response)
hcv['gender'] = hcv['Sex'].eq('m').astype('int64')  # 1 for 'm', 0 for anything else
hcv.drop(columns=['Category', 'Sex'], inplace=True)

In [20]:
# Sanity check: list the distinct values of the two encoded columns.
for column in ('response', 'gender'):
    print(hcv[column].unique())
# The response has five classes, so if logistic regression is used it must be
# multinomial logistic regression.

['0' '1' '2' '3' '4']
[1 0]


# Using K-Nearest Neighbors

1) Standardize the data set so that no variable affects the outcome more than the others simply because of its scale

In [21]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Every column except the target is a predictor; build the feature frame once.
features = hcv.drop(columns='response')

# Standardize each predictor to zero mean and unit variance.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Keep hcv's row labels. hcv uses the 1-based id column as its index and lost
# rows in dropna(), so a default RangeIndex here would leave scaled_data
# label-misaligned with hcv['response'].
scaled_data = pd.DataFrame(
    scaled_features,
    columns=features.columns,
    index=features.index,
)
print("Scaled Sex:", scaled_data['gender'].unique())

scaled_data.head()

Scaled Sex: [ 0.78904343 -1.26735736]


Unnamed: 0,Age,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,gender
0,-1.553745,-0.542701,-0.60323,-0.905494,-0.355457,-0.202289,-0.581777,-1.916091,0.480336,-0.481022,-0.540788,0.789043
1,-1.553745,-0.542701,0.084054,-0.41138,-0.276283,-0.409283,1.354993,-0.524241,-0.151402,-0.416513,0.862566,0.789043
2,-1.553745,0.916417,0.253944,0.461714,0.573318,-0.282787,0.290683,-0.169629,0.0855,-0.092127,1.386485,0.789043
3,-1.553745,0.27371,-0.622536,0.19307,-0.340231,0.453193,-0.399063,-0.577433,-0.032951,-0.081068,0.712875,0.789043
4,-1.553745,-0.421108,0.230777,0.289014,-0.273238,-0.081542,0.432286,-0.949775,-0.111918,-0.152949,-0.596922,0.789043


Split the data into training and test sets

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Target: the encoded class label. Features: the frame standardized earlier.
y = hcv["response"]
X = scaled_data
print(y)

# Shuffled 70/30 hold-out split with a fixed seed for reproducibility.
seed = 100
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=seed
)

# Standardize again using statistics learned from the training rows only,
# then apply that same transform to the test rows.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
print(X_train.shape, X_test.shape, sep="\n")

1      0
2      0
3      0
4      0
5      0
      ..
609    4
610    4
611    4
612    4
613    4
Name: response, Length: 589, dtype: object
(412, 12)
(177, 12)


Train a K-Nearest Neighbors Model

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier fitted on the standardized training split.
model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
model

Make Predictions with our K-Nearest Neighbors Algorithm

In [24]:
# Predict a class label for every row of the held-out test features.
predictions = model.predict(X_test)

Measuring the Accuracy of our model

In [None]:
from sklearn.metrics import classification_report

# Some classes are never predicted by the model, which makes their precision
# undefined. zero_division=0 reports those scores as 0 (the same value the
# default produces) without emitting an undefined-metric warning per metric.
print(classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       159
           1       0.00      0.00      0.00         1
           2       0.33      0.12      0.18         8
           3       0.00      0.00      0.00         4
           4       0.60      0.60      0.60         5

    accuracy                           0.92       177
   macro avg       0.38      0.34      0.35       177
weighted avg       0.89      0.92      0.90       177



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


This model performs very well on category 0 (blood donors) but poorly on every other category. This is because there is very little data outside category 0: only 18 of the 177 test samples belong to the other four categories.

We can try a logistic regression model with balanced class weights to see whether it handles the minority categories better.

In [None]:
# Class-weighted logistic regression: class_weight="balanced" re-weights samples
# inversely to class frequency so the rare categories are not swamped by class 0.
# The deprecated multi_class argument is omitted; with the lbfgs solver a
# multiclass target is already fitted as a multinomial model.
logisticModel = LogisticRegression(solver='lbfgs', class_weight="balanced")
logisticModel.fit(X_train, y_train)

logisticPred = logisticModel.predict(X_test)
print(classification_report(y_test, logisticPred))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, logisticPred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       159
           1       1.00      1.00      1.00         1
           2       0.40      0.25      0.31         8
           3       0.38      0.75      0.50         4
           4       0.67      0.80      0.73         5

    accuracy                           0.94       177
   macro avg       0.69      0.76      0.70       177
weighted avg       0.94      0.94      0.94       177

[[156   0   2   1   0]
 [  0   1   0   0   0]
 [  1   0   2   3   2]
 [  0   0   1   3   0]
 [  0   0   0   1   4]]


