In [None]:
# Greet the workshop participants as soon as the notebook starts running.
welcome_message = "Welcome to this Carleton AI Society workshop!"
print(welcome_message)

In [None]:
%%capture
import os
import random
import re
import warnings

import lightgbm
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.neighbors
from IPython.display import Image
from matplotlib import collections as matcoll
from scipy import stats
from sklearn import ensemble
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.metrics import *

from sklearn.metrics import roc_auc_score

In [None]:
# Notebook-wide display settings:
#  - install a catch-all "ignore" filter so no warnings are printed from here on
#  - make every figure 15 x 7.5 inches unless a cell overrides it
warnings.simplefilter('ignore')
matplotlib.rcParams.update({'figure.figsize': [15, 7.5]})

In [None]:
# Load the table that every later cell filters and models.
csv_path = '../input/L6_100nt.csv'
L6_100nt = pd.read_csv(csv_path)

In [None]:
# Pick out the columns of L6_100nt whose *names* are taxonomic lineage strings
# of the form "k__...;p__...;..." (rank prefixes separated by semicolons).
print("Finding columns that contain data about the participant's microbiota")

# Raw strings keep `\w` as a regex token instead of an (invalid) string escape.
# Each pattern is anchored with `$` and applied with `.match`, so a column name
# must consist of exactly the listed ranks and nothing more.
L6_pattern = re.compile(r"k__(\w*);p__(\w*);c__(\w*);o__(\w*);f__(\w*);g__(\w*)$")
# NOTE(review): this pattern lists four ranks (k, p, c, o) although it is named
# "L3" -- confirm whether the name or the pattern is the intended one.
L3_pattern = re.compile(r"k__(\w*);p__(\w*);c__(\w*);o__(\w*)$")
L2_pattern = re.compile(r"k__(\w*);p__(\w*)$")

L6_columns = [col for col in L6_100nt.columns if L6_pattern.match(col)]
L3_columns = [col for col in L6_100nt.columns if L3_pattern.match(col)]
L2_columns = [col for col in L6_100nt.columns if L2_pattern.match(col)]

In [None]:
def visualize_data(data, column, title, xAxis):
    """Plot side-by-side histograms of one column for male and female rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `column` and a 'SEX' column holding 'male' / 'female'.
        The frame is only read; it is not modified.
    column : str
        Column to plot. Values are converted with `pd.to_numeric`; entries
        that cannot be parsed become NaN.
    title : str
        Title placed above both panels.
    xAxis : str
        Label used for the x axis of both panels.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with the male histogram on the left and the female one (red)
        on the right, 40 bins each.
    """
    # Coerce into a local Series so the caller's DataFrame keeps its raw values.
    values = pd.to_numeric(data[column], errors='coerce')

    fig, axes = plt.subplots(1, 2)
    fig.suptitle(title, fontsize=16)

    # (value in the SEX column, bar colour); None keeps the default colour.
    panels = (('male', None), ('female', 'r'))
    for ax, (sex, color) in zip(axes, panels):
        sns.distplot(values[data['SEX'] == sex], bins=40, kde=False, ax=ax, color=color)
        ax.set_ylabel(f'Number of Individuals ({sex})', fontsize=14)
        ax.set_xlabel(xAxis, fontsize=14)
    return fig

def _bmi_class_masks(bmi):
    """Return (label column, boolean mask) pairs for the four BMI classes, lowest class first."""
    return [
        ('SUBSET_UNDERWEIGHT', bmi < 18.5),
        ('SUBSET_HEALTHYWEIGHT', (bmi >= 18.5) & (bmi < 25)),
        ('SUBSET_OVERWEIGHT', (bmi >= 25) & (bmi < 30)),
        ('SUBSET_OBESE', bmi >= 30),
    ]


def filter_data(study, frame=None):
    """Select one study's rows, drop minors and rows without BMI, and label BMI classes.

    Parameters
    ----------
    study
        Value compared against the 'STUDY' column to select rows.
    frame : pandas.DataFrame, optional
        Table to filter. Defaults to the notebook-level `L6_100nt`, which is
        what the original single-argument call used.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is never modified) in which
        * 'BMI_CORRECTED' and 'AGE_CORRECTED' are floats, with the placeholder
          strings handled below turned into NaN;
        * rows with a known age under 18 are removed (unknown age is kept);
        * rows without a numeric BMI are removed;
        * rows are ordered underweight, healthy weight, overweight, obese;
        * the four 'SUBSET_*' columns hold 0.8 for the row's class and 0.0
          otherwise (label smoothing).
    """
    source = L6_100nt if frame is None else frame
    # Copy the selection so the column assignments below never write into a
    # slice of the source frame.
    subset = source[source['STUDY'] == study].copy()

    subset['BMI_CORRECTED'] = (
        subset['BMI_CORRECTED']
        .replace(["no_data", "Unspecified", "Unknown"], np.nan)
        .astype(float)
    )
    subset['AGE_CORRECTED'] = (
        subset['AGE_CORRECTED']
        .replace(["Unspecified", "Unknown"], np.nan)
        .astype(float)
    )
    subset = subset[subset['AGE_CORRECTED'].isnull() | (subset['AGE_CORRECTED'] >= 18)]

    # Concatenating the per-class slices both drops NaN BMI rows (they match no
    # class) and groups the rows by class. The obese class is simply BMI >= 30,
    # the same rule used for its label column.
    subset = pd.concat([subset[mask] for _, mask in _bmi_class_masks(subset['BMI_CORRECTED'])])

    # Label smoothing: the positive class is encoded as 0.8 instead of 1.0.
    # Masks are recomputed on the reordered frame so they line up row for row.
    for label_column, mask in _bmi_class_masks(subset['BMI_CORRECTED']):
        subset[label_column] = mask.astype(float) * 0.8
    return subset

In [None]:
# The table mixes many scientific studies, so filter_data is applied once per
# distinct STUDY value and the per-study results are stacked back together.
per_study_frames = [filter_data(study_name) for study_name in L6_100nt['STUDY'].unique()]
meta_study = pd.concat(per_study_frames)
# Keep only the first row encountered for each '#SampleID'.
meta_study = meta_study.drop_duplicates(subset='#SampleID', keep='first')
# Every remaining missing value, in any column, becomes 0.
meta = meta_study.fillna(0)

In [None]:
# Rank the L6_columns by their variance across rows (largest first) and keep
# the names of the first 600 as the model's feature list.
column_variance = meta[L6_columns].var()
ranked_columns = column_variance.sort_values(ascending=False).index
features = ranked_columns[:600].tolist()

In [None]:
# Step 1: keep a single body site -- rows whose BODY_SITE is 'UBERON:feces'.
feces_rows = meta[meta['BODY_SITE'] == 'UBERON:feces']

# Step 2: filter on antibiotic history. A row is kept when its
# SUBSET_ANTIBIOTIC_HISTORY flag is set, or when ANTIBIOTIC_HISTORY is 'Year'
# or '6 months'.
# NOTE(review): the original comment describes this as ignoring recent
# antibiotic users -- confirm that these values match that intent.
antibiotic_history = feces_rows['ANTIBIOTIC_HISTORY']
keep_row = (
    feces_rows['SUBSET_ANTIBIOTIC_HISTORY']
    | (antibiotic_history == 'Year')
    | (antibiotic_history == '6 months')
)
data = feces_rows[keep_row]
data.shape

In [None]:
# Let's look at some of the data with Seaborn or Matplotlib!
# sns.set() is called with no arguments, so seaborn's own default settings are
# what get applied; it is run once here, before any plotting cell.
sns.set()

In [None]:
# Setting up a section of the data as an experiment:
# one row per participant, restricted to the two extreme BMI classes.
data = data.groupby(["HOST_SUBJECT_ID"]).first()

# filter_data encodes class membership as 0.8 (label smoothing), so a row
# belongs to a class when its SUBSET_* column equals that value.
positive_label = 0.8
under = data[data["SUBSET_UNDERWEIGHT"].eq(positive_label)]
over = data[data["SUBSET_OBESE"].eq(positive_label)]

obesity = pd.concat([under, over])

# Boolean target: True for the obese rows, False for the underweight rows.
obesity["obesity_target"] = obesity["SUBSET_OBESE"].eq(positive_label)

In [None]:
# Splitting the data: 70% of the rows for training, 30% held out for testing.
# random_state pins the shuffle so the split -- and every score computed from
# it -- is the same each time the notebook is re-run.
x_train, x_test, y_train, y_test = train_test_split(
    obesity[features],
    obesity["obesity_target"],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Creating a model: initializing it, training it, and predicting classes
model = sklearn.neighbors.KNeighborsClassifier()
trained_model = model.fit(x_train, y_train)
predictions = model.predict(x_test)

# Confusion matrix with each row divided by its own total, so every row shows
# the fraction of that true class assigned to each predicted class.
raw_counts = confusion_matrix(y_test, predictions)
row_totals = raw_counts.sum(axis=1, keepdims=True)
cm = raw_counts.astype('float') / row_totals

# NOTE(review): the score is computed from hard class predictions rather than
# probabilities -- confirm that is the intended metric.
auc = roc_auc_score(y_test, predictions)
print(auc)
pd.DataFrame(cm)

In [None]:
# Hm, how do we solve what we're seeing in that confusion matrix?

In [None]:
# Let's explore the problem further with synthetic data:
# n_points samples of `dimensions` independent standard-normal features.
dimensions = 300
n_points = 1000

# A dedicated, seeded generator makes the synthetic data identical on every
# run without touching NumPy's global random state.
cursed_rng = np.random.RandomState(0)
cursed_data = cursed_rng.normal(0, 1, size=(n_points, dimensions))
# The label depends only on the sign of the first feature; every other
# column is unrelated to it.
cursed_label = cursed_data[:, 0] > 0

In [None]:
# One dimension: the first feature laid out along a flat line (y fixed at 0),
# coloured by label.
first_feature = cursed_data[:, 0]
flat_line = [0 for _ in range(n_points)]
plt.scatter(first_feature, y=flat_line, c=cursed_label, cmap="Accent")

In [None]:
# Two dimensions: first feature against second feature, coloured by label.
plt.scatter(x=cursed_data[:, 0], y=cursed_data[:, 1], c=cursed_label, cmap="Accent")

In [None]:
# Three dimensions: features 0, 2 and 1 on the x, y and z axes, coloured by label.
# The import is not referenced by name; presumably it is kept for its side
# effect of making the '3d' projection available -- confirm for the installed
# Matplotlib version.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

xs, ys, zs = cursed_data[:, 0], cursed_data[:, 2], cursed_data[:, 1]
ax.scatter(xs, ys, zs, c=cursed_label, cmap="Accent")

In [None]:
# Column labels of cursed_df used as model inputs. Only column 0 (the one the
# label is derived from) is selected here; widen the range to include more of
# the unrelated columns.
# Stored as a list so the DataFrame indexing below receives an ordinary
# list of column labels rather than a range object.
cursed_features = list(range(0, 1))
cursed_df = pd.DataFrame(cursed_data)
cursed_df["target"] = cursed_label
# 70/30 split; random_state pins the shuffle so the score is the same on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    cursed_df[cursed_features],
    cursed_df["target"],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fit a default KNeighborsClassifier on the synthetic training split and
# display its score on the held-out split.
model = sklearn.neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)
held_out_score = model.score(x_test, y_test)
held_out_score

In [None]:
# How can we solve that issue?

In [None]:
# Back to the real data. The same random_state as the first obesity split gives
# the same train/test rows, so scores from different models are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    obesity[features],
    obesity["obesity_target"],
    test_size=0.30,
    random_state=0,
)
# What model should we use here?
# Baseline: a fresh default KNN, created in this cell so the result does not
# depend on whichever `model` object an earlier cell left behind. Replace this
# line with another estimator to experiment.
model = sklearn.neighbors.KNeighborsClassifier()
trained_model = model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Row-normalised confusion matrix: each row sums to 1 over the predicted classes.
cm = confusion_matrix(y_test, predictions)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

print(roc_auc_score(y_test, predictions))
pd.DataFrame(data=cm)

In [None]:
# What's even better than one model?

In [None]:
# What's even better than one pair of training and testing sets? 

In [None]:
# Overlay one semi-transparent histogram per model, with each model's mean
# shown in the legend.
# NOTE(review): trials_1, trials_2 and trials_3 are not defined in any visible
# cell; presumably they are produced by the exercise cells above -- confirm
# before running the notebook top to bottom.
trial_sets = (
    ("Model 1", trials_1),
    ("Model 2", trials_2),
    ("Model 3", trials_3),
)
for model_name, trial_scores in trial_sets:
    h = plt.hist(trial_scores, bins=10, alpha=0.5, label=f"{model_name}: {np.mean(trial_scores)}")
plt.legend(loc='upper left')

In [None]:
# Horizontal bar chart of row counts per TYPES_OF_PLANTS value, split by the
# obesity target.
sns.countplot(data=obesity, y="TYPES_OF_PLANTS", hue="obesity_target")

In [None]:
# Print the first 250 column names of the experiment frame.
first_columns = obesity.columns[:250].tolist()
print(first_columns)

In [None]:
#https://msystems.asm.org/content/3/3/e00031-18