In [5]:
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d, UnivariateSpline

In [2]:
file = "../data/SSCURVES.xlsx"
xls = pd.ExcelFile(file)

In [3]:
# Find curve with smallest stress range
df_list = []
sexes = []
ages = []
person_ids = []

smallest_stress_range = [np.inf, None, None] # [range, idx, number of points]
for sheet in xls.sheet_names:
    df = pd.read_excel(file, sheet_name=sheet)
    try:
        sex = str(df['sex'].iloc[0])
        person_id = str(int(df['id'].iloc[0]))
        try:
            age = int(df['age'].iloc[0])
        except ValueError:
            print(f"Age of {sheet} unknown.")
            age = None
    except KeyError:
        print(f"No age/sex information found for {sheet}")
        continue

    df = df.groupby(["Strain"], as_index=False).agg({'Stress': 'mean'})
    df = df.sort_values("Strain")
   
    # Some data MUST be excluded, right? There are dataseries with less than 10 points.
    # I don't believe those could be good...
    if len(df) < 15:
        continue
    df_list.append(df)
    sexes.append(sex)
    ages.append(age)
    person_ids.append(person_id)


    low = df['Stress'].min()
    high = df['Stress'].max()
    stress_range = high - low
    if stress_range < smallest_stress_range[0]:
        smallest_stress_range = [stress_range, len(df_list) - 1, len(df)]
    

print(f"Case {smallest_stress_range[1]} has the shortest stress range of {smallest_stress_range[0]} with {smallest_stress_range[2]} points.")

Age of 1 unknown.
Age of 2 unknown.
Age of 3 unknown.
Age of 4 unknown.
Age of 5 unknown.
Age of 6 unknown.
Age of 14 unknown.
Age of 15 unknown.
No age/sex information found for 37
No age/sex information found for 48
No age/sex information found for 61
No age/sex information found for 62
No age/sex information found for 555
Case 31 has the shortest stress range of 0.39937500000000004 with 43 points.


In [4]:
# Use spline interpolation.

y_dflist = []
smallest_stress_range_df_idx = int(smallest_stress_range[1])

for i in range(len(df_list)):

    spl = UnivariateSpline(df_list[i]['Strain'],df_list[i]['Stress'])
    spl.set_smoothing_factor(0)
    interpolated = spl(df_list[smallest_stress_range_df_idx]['Strain'])
    y_dflist.append(interpolated)

X = np.array(y_dflist)

In [48]:
t_sne = TSNE(
    learning_rate='auto',
    method='exact',
    perplexity=1,
)
X_embedded = t_sne.fit_transform(X)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



In [49]:
fig = px.scatter(*X_embedded.T, color=person_ids)
fig.update_layout(legend_title="person")
fig.show()