In [27]:
import numpy as np
import pandas as pd
import requests
import io
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns   
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif


In [28]:

# Location of the dataset CSV (hosted on the jsDelivr CDN).
url = "https://cdn.jsdelivr.net/gh/ramenfeast/BV-ethnicity-report/BV%20Dataset%20copy.csv"

# A timeout keeps "Run All" from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting the error page be parsed as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Keep the raw bytes under the original name, then parse them as UTF-8 CSV text.
download = response.content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))

#%% Clean data
# Discard the rows whose index labels are 394, 395 and 396
# (labels, not positions).
rows_to_discard = [394, 395, 396]
df = df.drop(index=rows_to_discard)

#%% Separate the data and labels
# The final column is the label; every column before it is a feature.
n_feature_columns = df.shape[1] - 1
X = df.iloc[:, :n_feature_columns]
y = df.iloc[:, n_feature_columns]

#%% Train/test split
# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

#%% Extract ethnic group and community group data
# Column names are spelled exactly as the frame has them, including the
# trailing space in the community-group name.
ethnic_col = 'Ethnic Groupa'
community_col = 'Community groupc '
group_cols = [ethnic_col, community_col]

# Keep each grouping column as its own single-column frame for both splits...
es_xtrain = X_train[[ethnic_col]].copy()
cs_xtrain = X_train[[community_col]].copy()
es_xtest = X_test[[ethnic_col]].copy()
cs_xtest = X_test[[community_col]].copy()

# ...then remove those columns from the model inputs.
X_train = X_train.drop(columns=group_cols)
X_test = X_test.drop(columns=group_cols)

#%%Normalization

def normalize_features(features, ph_col='pH'):
    """Return a rescaled copy of ``features``; the input frame is not modified.

    The ``ph_col`` column is divided by 14 and every other column is divided
    by 100. Columns are selected by name, so the result does not depend on
    where the pH column sits in the frame.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature frame that contains ``ph_col``.
    ph_col : str, default 'pH'
        Name of the pH column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and column order as ``features``.
    """
    other_cols = [col for col in features.columns if col != ph_col]
    scaled = features.copy()
    # Normalize pH
    scaled[ph_col] = features[ph_col] / 14
    # Normalize 16s RNA data (every non-pH column).
    # NOTE(review): dividing by 100 presumably means these are percentages - confirm.
    scaled[other_cols] = features[other_cols] / 100
    return scaled


# Apply the same scaling to both splits. Rebinding the names (instead of
# writing into the frames through .iloc) avoids in-place dtype coercion.
X_train = normalize_features(X_train)
X_test = normalize_features(X_test)

#%%Binary y

def binarize_scores(scores, threshold=7):
    """Map scores to 0.0 / 1.0 around ``threshold`` without mutating the input.

    Values below ``threshold`` become 0.0 and values at or above it become
    1.0. Missing values stay missing. The index and name of the Series are
    preserved.

    Parameters
    ----------
    scores : pandas.Series
        Numeric scores to binarise.
    threshold : int or float, default 7
        Smallest score that is mapped to 1.0.

    Returns
    -------
    pandas.Series
        New float Series of 0.0 / 1.0 (NaN where ``scores`` was NaN).
    """
    is_high = scores >= threshold
    # NaN compares False above; mask it back to NaN so it is not read as 0.
    return is_high.astype(float).where(scores.notna())


# Rebind rather than assigning through a boolean mask: the split Series are
# derived from `y`, and masked in-place writes on derived objects can raise
# SettingWithCopyWarning.
y_train = binarize_scores(y_train)
y_test = binarize_scores(y_test)

print(y_test)

113    1.0
281    0.0
68     0.0
155    0.0
329    1.0
      ... 
90     0.0
124    0.0
64     0.0
55     0.0
300    1.0
Name: Nugent score, Length: 79, dtype: float64


In [29]:
# Score every feature against the binary label with f_classif and keep the
# 46 highest-scoring ones. The selector is fitted on the training split only.
# NOTE(review): the recorded output of this cell contains `f = msb / msw`,
# presumably a divide warning raised while scoring - confirm whether some
# columns are constant.
n_selected = 46
fvalue_Best = SelectKBest(score_func=f_classif, k=n_selected).fit(X_train, y_train)

# Positional indices of the retained columns, applied identically to both splits.
cols = fvalue_Best.get_support(indices=True)
features_df_newtrain, features_df_newtest = (
    split.iloc[:, cols] for split in (X_train, X_test)
)

# Summarise the reduced training frame.
features_df_newtrain.info()

def fit_and_score_rf(train_features, train_labels, test_features, test_labels, seed=0):
    """Fit a 100-tree random forest and score it on the held-out split.

    Parameters
    ----------
    train_features, train_labels
        Data the classifier is fitted on.
    test_features, test_labels
        Held-out data used for prediction and scoring.
    seed : int, default 0
        ``random_state`` for the forest, so repeated runs give the same score.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions, test_accuracy)``.
    """
    model = RandomForestClassifier(n_estimators=100, random_state=seed)
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    return model, predictions, accuracy_score(test_labels, predictions)


#original RF: trained on every feature
clfrf, y_pred, rf_accuracy = fit_and_score_rf(X_train, y_train, X_test, y_test)
print(f'Accuracy Score RF = {rf_accuracy}')

#Ftest RF: trained on the F-test-selected features only.
# Both forests share the same seed, so the two accuracies are reproducible and
# differ only because of the feature set.
clfrf_imp, y_pred_imp, rf_imp_accuracy = fit_and_score_rf(
    features_df_newtrain, y_train, features_df_newtest, y_test
)
print(f'Important Features Accuracy Score RF = {rf_imp_accuracy}')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 315 entries, 215 to 172
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   pH                              315 non-null    float64
 1   L. iners                        315 non-null    float64
 2   L. crispatus                    315 non-null    float64
 3   L. jensenii                     315 non-null    float64
 4   Prevotella                      315 non-null    float64
 5   Megasphaera                     315 non-null    float64
 6   Sneathia                        315 non-null    float64
 7   Atopobium                       315 non-null    float64
 8   Dialister                       315 non-null    float64
 9   Lachnospiraceae_8               315 non-null    float64
 10  Anaerococcus                    315 non-null    float64
 11  Peptoniphilus                   315 non-null    float64
 12  Eggerthella                     31

  f = msb / msw
