# **Welcome to this CSSA ML Workshop!**

In [None]:
%%capture
import os
import random
import re
import warnings

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import collections as matcoll
import seaborn as sns
import lightgbm
from IPython.display import Image

import sklearn
from sklearn import ensemble
from sklearn import model_selection
from sklearn import neighbors
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

from scipy import stats

In [None]:
# Notebook-wide settings: silence every warning category and enlarge the default figure size.
warnings.simplefilter('ignore')
matplotlib.rcParams.update({'figure.figsize': [15, 7.5]})

# Exploring The Data

In [None]:
# Load the full sample table; the path is relative to the notebook's working directory.
l6_csv_path = '../input/L6_100nt.csv'
L6_100nt = pd.read_csv(l6_csv_path)

In [None]:
print("Finding columns that contain data about the participant's microbiota")
# Column names that are taxonomy lineage strings ("k__...;p__...;...") are selected by how many
# ranks they spell out. `match` anchors at the start and `$` at the end, so each pattern accepts
# only names with exactly that many ranks.
# Raw strings keep `\w` a regex token instead of an (invalid) string escape sequence.
L6_pattern = re.compile(r"k__(\w*);p__(\w*);c__(\w*);o__(\w*);f__(\w*);g__(\w*)$")
L3_pattern = re.compile(r"k__(\w*);p__(\w*);c__(\w*);o__(\w*)$")
L2_pattern = re.compile(r"k__(\w*);p__(\w*)$")
L6_columns = [col for col in L6_100nt.columns if L6_pattern.match(col)]
L3_columns = [col for col in L6_100nt.columns if L3_pattern.match(col)]
L2_columns = [col for col in L6_100nt.columns if L2_pattern.match(col)]

In [None]:
def filter_data(study, frame=None):
    """Return the cleaned rows belonging to one study.

    Parameters
    ----------
    study
        Value compared against the ``STUDY`` column to select rows.
    frame : pandas.DataFrame, optional
        Table to filter. When omitted, the notebook-level ``L6_100nt`` frame is used.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is never modified) in which:

        * ``ANTIBIOTIC_HISTORY``: "no_data", "Unspecified" and "Unknown" become NaN.
        * ``AGE_CORRECTED``: "Unspecified" and "Unknown" become NaN, the column is cast to
          float, and rows with a known age below 18 are dropped (unknown ages are kept).
        * ``POULTRY_FREQUENCY``: "Daily", "Regularly (3-5 times/week)" and
          "Occasionally (1-2 times/week)" become 1; "Rarely (less than once/week)" and
          "Never" become 0; "Unspecified" and "Unknown" become NaN.
    """
    source = L6_100nt if frame is None else frame
    # Work on an explicit copy so the column assignments below never write into a slice
    # of the source frame.
    subset = source[source['STUDY'] == study].copy()

    subset["ANTIBIOTIC_HISTORY"] = subset["ANTIBIOTIC_HISTORY"].replace(
        ["no_data", "Unspecified", "Unknown"], np.nan
    )

    age = subset['AGE_CORRECTED'].replace(["Unspecified", "Unknown"], np.nan).astype(float)
    subset['AGE_CORRECTED'] = age
    # Keep adults and participants whose age is unknown.
    subset = subset[age.isnull() | (age >= 18)].copy()

    poultry_codes = {
        "Unspecified": np.nan,
        "Unknown": np.nan,
        "Never": 0,
        "Rarely (less than once/week)": 0,
        "Occasionally (1-2 times/week)": 1,
        "Regularly (3-5 times/week)": 1,
        "Daily": 1,
    }
    subset['POULTRY_FREQUENCY'] = subset['POULTRY_FREQUENCY'].replace(poultry_codes)

    return subset

In [None]:
print("Filtering based on the study, as many scientific studies were involved")
# Clean each study separately, stack the results, and keep the first row per sample ID.
per_study_frames = [filter_data(study_name) for study_name in L6_100nt['STUDY'].unique()]
meta_study = (
    pd.concat(per_study_frames)
    .drop_duplicates(subset='#SampleID')
    .fillna(0)
)
# NOTE(review): fillna(0) zero-fills every column, so the NaN that filter_data assigns to
# unknown POULTRY_FREQUENCY answers becomes 0, the same code used for "Never"/"Rarely".

# Restrict to a single body site so all samples come from the same source of microbiota.
data = meta_study[meta_study['BODY_SITE'] == 'UBERON:feces']

In [None]:
# Rank the L6 taxonomy columns by variance and keep the 600 most variable ones as features.
taxa_variance = data[L6_columns].var()
features = taxa_variance.sort_values(ascending=False).index[:600].tolist()

In [None]:
# How many samples fall under each antibiotic-history answer.
antibiotic_history_counts = data["ANTIBIOTIC_HISTORY"].value_counts()
print(antibiotic_history_counts)

In [None]:
# Show the names of the selected feature columns.
feature_names = data[features].columns.tolist()
print(feature_names)


# Cleaning The Data

Removing extraneous data and conforming class data to a standard pattern

In [None]:
# Bin the survey responses: True = took antibiotics within the last week or month,
# False = answered "I have not taken antibiotics in the past year."
history = data['ANTIBIOTIC_HISTORY']
week = data[history == 'Week']
month = data[history == 'Month']
years = data[history == 'I have not taken antibiotics in the past year.']

# Row order is week, then month, then years.
antibiotics = pd.concat([week, month, years])
antibiotics['ANTIBIOTIC_HISTORY'].value_counts()  # not rendered: only a cell's last expression is displayed

took_recently = antibiotics['ANTIBIOTIC_HISTORY'].isin(['Month', 'Week'])
antibiotics['antibiotic_target'] = took_recently
antibiotics['antibiotic_target'].value_counts()

Do you see any potential issues with the number of False instances vs True instances?

In [None]:
# Balance the two classes by down-sampling the "no antibiotics" group.
# https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html
took_antibiotics = antibiotics['antibiotic_target']
no_antibiotics = antibiotics[~took_antibiotics]
yes_antibiotics = antibiotics[took_antibiotics]

# Derive the sample size from the data instead of hard-coding a row count, and never ask
# for more rows than exist (sampling without replacement would raise).
n_balanced = min(len(no_antibiotics), len(yes_antibiotics))

# random_state makes the drawn subset, and everything trained on it, reproducible.
no_antibiotic_downsampled = resample(
    no_antibiotics, replace=False, n_samples=n_balanced, random_state=1
)
data_downsampled = pd.concat([no_antibiotic_downsampled, yes_antibiotics])

data_downsampled['antibiotic_target'].value_counts()

In [None]:
# Project the balanced samples onto their first two principal components and colour each
# point by the antibiotic label.

# Fresh 0..n-1 index so the labels line up row-by-row with the PCA output frame below.
target = data_downsampled['antibiotic_target'].reset_index(drop=True)

# random_state keeps the projection reproducible if PCA picks a randomized solver.
pca_antibiotics = PCA(n_components=2, random_state=1)
principalComponents_antibiotics = pca_antibiotics.fit_transform(data_downsampled[features])

pca_antibiotics_df = pd.DataFrame(
    data=principalComponents_antibiotics,
    columns=['principal component 1', 'principal component 2'],
)
pca_antibiotics_df = pd.concat([pca_antibiotics_df, target], axis=1)

pc1 = pca_antibiotics_df['principal component 1'].tolist()
pc2 = pca_antibiotics_df['principal component 2'].tolist()
target = pca_antibiotics_df["antibiotic_target"].tolist()

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(pc1, pc2, c=target, cmap="Accent")
# Label the figure so it can be read on its own.
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
ax.set_title('PCA of microbiota features, coloured by antibiotic_target');

# Training A Model

In [None]:
# Hold out 20% of the balanced data for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_downsampled[features],
    data_downsampled['antibiotic_target'],
    test_size=0.2,
    random_state=1,
)

In [None]:


# Train a decision tree model on the dataset:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# NOTE: from here on the name `tree` refers to this fitted classifier and no longer to the
# `sklearn.tree` module imported at the top; later cells rely on `tree` being the model.
# random_state fixes the tree's tie-breaking so repeated runs build the same tree.
tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
trained = tree.fit(x_train, y_train)  # fit returns the estimator itself
predictions = tree.predict(x_test)


# Model Evaluation

In [None]:
# Use a confusion matrix to evaluate the performance.
# Each row (true class) is divided by its total, so the entries are per-class rates.
cm = confusion_matrix(y_test, predictions)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# ROC AUC measures how well scores rank the samples, so give it the predicted probability
# of the positive class (second column of predict_proba) instead of thresholded labels.
positive_scores = tree.predict_proba(x_test)[:, 1]
print(roc_auc_score(y_test, positive_scores))
pd.DataFrame(data=cm)

In [None]:
import scikitplot as skplt

# Class-membership probabilities of the fitted decision tree on the held-out split.
probs = tree.predict_proba(x_test)

# plot_roc replaces the deprecated plot_roc_curve; with default arguments it draws the
# per-class, micro- and macro-averaged ROC curves.
skplt.metrics.plot_roc(y_test, probs)
plt.show()

# Identifying People Who Have Eaten Chicken from Their Microbiota

In [None]:
# Class balance of the poultry label before any further filtering.
poultry_counts = data['POULTRY_FREQUENCY'].value_counts()
poultry_counts

In [None]:
# Keep rows flagged by SUBSET_ANTIBIOTIC_HISTORY, plus those answering 'Year' or '6 months'.
# NOTE(review): this cell overwrites `data`; re-running it (or earlier cells) without
# restarting the kernel will not reproduce the same frame.
keep_rows = (
    data['SUBSET_ANTIBIOTIC_HISTORY']
    | (data['ANTIBIOTIC_HISTORY'] == 'Year')
    | (data['ANTIBIOTIC_HISTORY'] == '6 months')
)
data = data[keep_rows]

# One row per participant; HOST_SUBJECT_ID becomes the index.
data = data.groupby(["HOST_SUBJECT_ID"]).first()
data['POULTRY_FREQUENCY'].value_counts()

In [None]:
# Hold out 30% of the participants for evaluation. The fixed seed (same value as the
# antibiotic split) makes the split, and every score computed from it, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data[features],
    data["POULTRY_FREQUENCY"],
    test_size=0.3,
    random_state=1,
)
y_train.value_counts()

In [None]:
# Creating a model with K Nearest Neighbors: initializing it, training it, and predicting classes
model = sklearn.neighbors.KNeighborsClassifier()

trained_model = model.fit(x_train, y_train)  # fit returns the estimator itself
predictions = model.predict(x_test)

# Row-normalised confusion matrix: each row (true class) sums to 1.
cm = confusion_matrix(y_test, predictions)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# ROC AUC needs ranking scores, so use the predicted probability of the positive class
# (second column of predict_proba) rather than hard 0/1 predictions.
positive_scores = model.predict_proba(x_test)[:, 1]
print(roc_auc_score(y_test, positive_scores))
pd.DataFrame(data=cm)

In [None]:
# Evaluate using K Fold

seed = 7
# random_state only has an effect when shuffling is enabled; recent scikit-learn versions
# raise a ValueError if it is passed with shuffle left at False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# cross_val_score fits a fresh clone of the estimator on each fold.
results = model_selection.cross_val_score(trained_model, x_train, y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

In [None]:
# Creating a model with Random Forest: initializing it, training it, and predicting classes
# random_state makes the forest, and therefore the reported scores, reproducible.
model = ensemble.RandomForestClassifier(n_estimators=200, random_state=1)
model.fit(x_train, y_train)

predictions = model.predict(x_test)

# Row-normalised confusion matrix, stored back into `cm` so the table displayed below shows
# per-class rates like the other evaluation cells (previously the normalised matrix went
# to a stray variable and raw counts were displayed).
cm = confusion_matrix(y_test, predictions)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# ROC AUC needs ranking scores, so use the predicted probability of the positive class
# (second column of predict_proba) rather than hard 0/1 predictions.
positive_scores = model.predict_proba(x_test)[:, 1]
print(roc_auc_score(y_test, positive_scores))
pd.DataFrame(data=cm)

In [None]:
# 10-fold cross-validated accuracy of the random forest on the training split.
seed = 7
# random_state only has an effect when shuffling is enabled; recent scikit-learn versions
# raise a ValueError if it is passed with shuffle left at False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

In [None]:
# Bootstrap evaluation: repeatedly train on rows drawn with replacement, score on the rows
# that were not drawn (out-of-bag), and summarise the spread of the accuracy scores.

n_iterations = 10
number_rows = len(data.index)
list_rows = list(range(number_rows))

# Named `bootstrap_scores` so the `stats` module imported from scipy is not shadowed.
bootstrap_scores = []
for i in range(n_iterations):
    # Seeding with the iteration number gives a different, repeatable draw each round.
    train_set_indices = resample(list_rows, random_state=i)
    # Getting out of bag samples (set lookup instead of scanning the list for every row)
    in_bag = set(train_set_indices)
    test_set_indices = [row for row in list_rows if row not in in_bag]

    x_train = data[features].iloc[train_set_indices, :]
    y_train = data['POULTRY_FREQUENCY'].iloc[train_set_indices]
    x_test = data[features].iloc[test_set_indices, :]
    y_test = data['POULTRY_FREQUENCY'].iloc[test_set_indices]

    model = ensemble.RandomForestClassifier(n_estimators=200, random_state=i)
    model.fit(x_train, y_train)
    # evaluate model
    predictions = model.predict(x_test)
    bootstrap_scores.append(accuracy_score(y_test, predictions))

# plot scores
plt.hist(bootstrap_scores)
plt.show()

# confidence intervals: the central `alpha` share of the bootstrap scores, clipped to [0, 1]
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(bootstrap_scores, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(bootstrap_scores, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

# Curse of Dimensionality Example

In [None]:
# Synthetic data set: `n_points` samples of `dimensions` independent standard-normal
# features. The label depends on the first feature only (True when it is positive), so
# every other column is noise.
dimensions = 300
n_points = 1000

# A dedicated seeded generator makes the example reproducible without touching NumPy's
# global random state.
cursed_rng = np.random.RandomState(0)
cursed_data = cursed_rng.normal(0, 1, size=(n_points, dimensions))
cursed_label = cursed_data[:, 0] > 0

In [None]:
# One dimension: the label-defining feature along x, every point placed at y = 0.
first_feature = cursed_data[:, 0]
flat_baseline = [0] * n_points
plt.scatter(first_feature, y=flat_baseline, c=cursed_label, cmap="Accent")


In [None]:
# Two dimensions: the label-defining feature (x) against one noise feature (y).
plt.scatter(
    cursed_data[:, 0],
    cursed_data[:, 1],
    c=cursed_label,
    cmap="Accent",
)

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

# Three dimensions: feature 0 on x, feature 2 on y, feature 1 on z.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

xs, ys, zs = cursed_data[:, 0], cursed_data[:, 2], cursed_data[:, 1]
ax.scatter(xs, ys, zs, c=cursed_label, cmap="Accent")

In [None]:
# Use the first 100 columns as features: column 0 defines the label, the rest are noise.
# A concrete list is an unambiguous multi-column selector.
cursed_features = list(range(0, 100))
cursed_df = pd.DataFrame(cursed_data)
cursed_df["target"] = cursed_label

# Half of the points for training, half for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    cursed_df[cursed_features],
    cursed_df["target"],
    test_size=0.50,
    random_state=1,
)

In [None]:
# K nearest neighbours on the synthetic data: fit on the training half, report the
# score on the held-out half. fit returns the estimator itself.
model = sklearn.neighbors.KNeighborsClassifier().fit(x_train, y_train)
model.score(x_test, y_test)

In [None]:
# Decision tree on the same synthetic split, for comparison with the nearest-neighbour score.
# fit returns the estimator itself.
model = sklearn.tree.DecisionTreeClassifier().fit(x_train, y_train)
model.score(x_test, y_test)