In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the experiment table, then trim surrounding whitespace from the header
# names so later column lookups by exact label succeed.
df = pd.read_excel("/content/data.xlsx")
df = df.set_axis(df.columns.str.strip(), axis=1)

In [None]:
# Number of distinct (DLS, UV VIS) value pairs present in the table.
df.groupby(['DLS (nm)', 'UV VIS']).ngroups

132

In [None]:
# Materialise the distinct (DLS, UV VIS) pairs as a list.
# Note: this must be a call, list(...); list[...] only subscripts the type
# and produces a generic alias instead of a list of keys.
list(df.groupby(['DLS (nm)', 'UV VIS']).groups.keys())

list[dict_keys([(52.7, 1.613), (53.0, 2.058), (78.1, 0.863), (78.2, 1.695), (79.1, 0.307), (82.5, 2.076), (83.1, 0.768), (84.7, 1.864), (85.4, 0.789), (87.54269166292646, 0.7222914695739746), (87.54269166292646, 1.891807675361633), (91.5, 0.987), (91.9, 0.321), (92.4, 2.116), (94.13255359982115, 0.7027695178985596), (94.13255359982115, 0.7222914695739746), (94.13255359982115, 1.891807675361633), (95.05355310960438, 0.7027695178985596), (95.05355310960438, 0.7222914695739746), (95.05355310960438, 1.676485538482666), (95.2, 2.246), (95.3, 2.004), (96.8, 1.826), (99.3, 1.965), (103.5, 2.04), (104.2, 1.889), (105.1, 0.688), (105.9, 1.533), (106.2, 2.132), (107.7, 1.911), (107.9, 0.656), (108.0, 0.397), (108.0, 0.924), (108.0, 1.76), (108.6, 0.416), (110.6, 0.636), (111.2567586697162, 0.7222914695739746), (111.2567586697162, 1.676485538482666), (111.9, 0.272), (112.5153031161555, 0.7222914695739746), (112.5153031161555, 1.676485538482666), (114.4408904368613, 0.7027695178985596), (114.44089

In [None]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Time (min),Scanspeed (mm/s),Fluence (J/cm2),DLS (nm),UV VIS
0,7,3240,1.87504,151.960143,0.722291
1,25,3152,1.882903,114.44089,1.891808
2,8,3400,1.894592,142.52502,0.70277
3,5,3371,1.907519,170.006049,0.70277
4,7,3288,1.856212,151.960143,0.70277


In [None]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1323 entries, 0 to 1322
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Time (min)        1323 non-null   int64  
 1   Scanspeed (mm/s)  1323 non-null   int64  
 2   Fluence (J/cm2)   1323 non-null   float64
 3   DLS (nm)          1323 non-null   float64
 4   UV VIS            1323 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 51.8 KB


In [None]:
# Summary statistics for every numeric column.
df.describe()

Unnamed: 0,Time (min),Scanspeed (mm/s),Fluence (J/cm2),DLS (nm),UV VIS
count,1323.0,1323.0,1323.0,1323.0,1323.0
mean,13.424036,3257.126228,1.868981,133.241366,0.74032
std,7.211161,145.243543,0.023965,25.929576,0.285328
min,2.0,3000.0,1.83,52.7,0.219
25%,7.0,3131.0,1.848221,114.44089,0.70277
50%,14.0,3254.0,1.868963,136.331734,0.70277
75%,20.0,3378.5,1.889642,151.960143,0.722291
max,25.0,3500.0,1.91,239.8,2.541


In [None]:
# Two independent scalers: one for the process settings (inputs) and one for
# the measured outputs, so each space can be transformed on its own.
input_scaler, output_scaler = MinMaxScaler(), MinMaxScaler()

In [None]:
# Normalize OUTPUT space for similarity (DLS, UV VIS)
df[['DLS_norm', 'UVVIS_norm']] = output_scaler.fit_transform(df[['DLS (nm)', 'UV VIS']])

# Normalize INPUT space (used later for ranking and cost)
df[['Time_norm', 'ScanSpeed_norm', 'Fluence_norm']] = input_scaler.fit_transform(
    df[['Time (min)', 'Scanspeed (mm/s)', 'Fluence (J/cm2)']]
)

In [None]:
# Preview again to check the newly added *_norm columns.
df.head()

Unnamed: 0,Time (min),Scanspeed (mm/s),Fluence (J/cm2),DLS (nm),UV VIS,DLS_norm,UVVIS_norm,Time_norm,ScanSpeed_norm,Fluence_norm
0,7,3240,1.87504,151.960143,0.722291,0.530519,0.216749,0.217391,0.48,0.562994
1,25,3152,1.882903,114.44089,1.891808,0.329989,0.720417,1.0,0.304,0.661285
2,8,3400,1.894592,142.52502,0.70277,0.480091,0.208342,0.26087,0.8,0.807404
3,5,3371,1.907519,170.006049,0.70277,0.62697,0.208342,0.130435,0.742,0.968985
4,7,3288,1.856212,151.960143,0.70277,0.530519,0.208342,0.217391,0.576,0.327651


## Cosine Similarity

In [None]:
def recommend_by_similarity(dls_target, uvvis_target, top_n=5):
    """
    Recommend synthesis settings based on desired DLS and UV VIS output using cosine similarity.

    Parameters
    ----------
    dls_target : float
        Desired value for the 'DLS (nm)' column.
    uvvis_target : float
        Desired value for the 'UV VIS' column.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The `top_n` rows of `df` with the highest cosine similarity to the
        target, restricted to the setting columns, the two output columns and
        the computed 'similarity'.

    Notes
    -----
    Reads the module-level `df` and `output_scaler`. The scores are attached
    to a copy, so `df` itself is not modified by calling this function.
    """
    # Normalize the target with output_scaler (the scaler fitted on the outputs),
    # so it lives in the same space as DLS_norm / UVVIS_norm.
    target_df = pd.DataFrame([[dls_target, uvvis_target]], columns=['DLS (nm)', 'UV VIS'])
    target_vector = output_scaler.transform(target_df).reshape(1, -1)

    # One similarity value per row of df (first row of the 1 x n result).
    sim_scores = cosine_similarity(target_vector, df[['DLS_norm', 'UVVIS_norm']].values)[0]

    # Rank on a copy carrying the scores instead of writing into the shared frame.
    ranked = df.assign(similarity=sim_scores).sort_values(by='similarity', ascending=False)

    return ranked.head(top_n)[[
        'Time (min)', 'Scanspeed (mm/s)', 'Fluence (J/cm2)',
        'DLS (nm)', 'UV VIS', 'similarity'
    ]]


In [None]:
# Five most similar rows for a target of DLS = 150 and UV VIS = 0.7.
recommendations = recommend_by_similarity(150, 0.7, top_n=5)
# Leave the frame as the cell's last expression so the notebook renders it as
# a table; print() falls back to a wrapped plain-text dump.
recommendations


      Time (min)  Scanspeed (mm/s)  Fluence (J/cm2)    DLS (nm)    UV VIS  \
1316          10              3072         1.904964  155.440949  0.722291   
403            8              3055         1.895954  155.440949  0.722291   
386            8              3099         1.909864  155.440949  0.722291   
490            7              3025         1.906465  155.440949  0.722291   
75             8              3055         1.895954  155.440949  0.722291   

      similarity  
1316    0.999995  
403     0.999995  
386     0.999995  
490     0.999995  
75      0.999995  


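Cosine similarity compares only the direction of two vectors, so it is appropriate when magnitude does not matter. In our case we need to know how close a candidate's outputs actually are to the target values, because DLS and UV-Vis are physical measurements, so a distance-based measure is a better fit.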

## Euclidean Distance

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

def recommend_by_distance(dls_target, uvvis_target, top_n=5):
    """
    Recommend synthesis settings whose outputs are closest to the desired DLS and UV VIS.

    Closeness is the Euclidean distance in the normalized output space
    (DLS_norm, UVVIS_norm).

    Parameters
    ----------
    dls_target : float
        Desired value for the 'DLS (nm)' column.
    uvvis_target : float
        Desired value for the 'UV VIS' column.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The `top_n` rows of `df` with the smallest distance to the target,
        restricted to the setting columns, the two output columns and the
        computed 'distance'.

    Notes
    -----
    Reads the module-level `df` and `output_scaler`. The distances are
    attached to a copy, so `df` itself is not modified by calling this function.
    """
    # Bring the target into the same normalized space as the stored outputs.
    target_df = pd.DataFrame([[dls_target, uvvis_target]], columns=['DLS (nm)', 'UV VIS'])
    target_vector = output_scaler.transform(target_df).reshape(1, -1)

    # One distance per row of df (first row of the 1 x n result).
    dist_scores = euclidean_distances(target_vector, df[['DLS_norm', 'UVVIS_norm']].values)[0]

    # Rank on a copy carrying the distances instead of writing into the shared frame.
    ranked = df.assign(distance=dist_scores).sort_values(by='distance', ascending=True)

    return ranked.head(top_n)[[
        'Time (min)', 'Scanspeed (mm/s)', 'Fluence (J/cm2)',
        'DLS (nm)', 'UV VIS', 'distance'
    ]]


In [None]:
# Closest rows (default top_n) for a target of DLS = 150 and UV VIS = 0.72.
recommend_by_distance(150, 0.72)

Unnamed: 0,Time (min),Scanspeed (mm/s),Fluence (J/cm2),DLS (nm),UV VIS,distance
0,7,3240,1.87504,151.960143,0.722291,0.010523
786,5,3163,1.855161,151.960143,0.722291,0.010523
785,13,3192,1.861058,151.960143,0.722291,0.010523
47,8,3179,1.861224,151.960143,0.722291,0.010523
712,5,3239,1.862658,151.960143,0.722291,0.010523


## Combine Cosine + Distance as Weighted Score

In [None]:
def _rescale_to_unit_interval(values):
    """Min-max rescale `values` to the 0-1 range; a constant array maps to all zeros."""
    values = np.asarray(values, dtype=float)
    low = np.min(values)
    span = np.max(values) - low
    if span == 0:
        # Every entry is identical: there is nothing to discriminate on, and
        # dividing by the zero span would fill the result with NaN.
        return np.zeros_like(values)
    return (values - low) / span


def recommend_by_combined_score(dls_target, uvvis_target, top_n=5, alpha=0.5):
    """
    Recommend synthesis settings using a weighted blend of cosine similarity and Euclidean distance.

    Both measures are computed in the normalized output space
    (DLS_norm, UVVIS_norm), rescaled to the 0-1 range across all rows, and
    combined as ``alpha * sim + (1 - alpha) * (1 - dist)``; higher is better.

    Parameters
    ----------
    dls_target : float
        Desired value for the 'DLS (nm)' column.
    uvvis_target : float
        Desired value for the 'UV VIS' column.
    top_n : int, default 5
        Number of rows to return.
    alpha : float, default 0.5
        Weight in [0, 1]: 1 ranks by cosine similarity (angle) only,
        0 ranks by distance (magnitude) only.

    Returns
    -------
    pandas.DataFrame
        The `top_n` highest-scoring rows of `df`, restricted to the setting
        columns, the two output columns and the computed 'score'.

    Notes
    -----
    Reads the module-level `df` and `output_scaler`. The scores are attached
    to a copy, so `df` itself is not modified by calling this function.
    """
    target_df = pd.DataFrame([[dls_target, uvvis_target]], columns=['DLS (nm)', 'UV VIS'])
    target_vector = output_scaler.transform(target_df).reshape(1, -1)

    outputs_norm = df[['DLS_norm', 'UVVIS_norm']].values
    sim = cosine_similarity(target_vector, outputs_norm)[0]
    dist = euclidean_distances(target_vector, outputs_norm)[0]

    # Normalize both to 0–1 range. The helper guards the zero-span case, which
    # would otherwise turn every score into NaN (even for alpha = 0, since
    # 0 * NaN is NaN).
    sim_norm = _rescale_to_unit_interval(sim)
    dist_norm = _rescale_to_unit_interval(dist)

    # Combined score: higher is better
    score = alpha * sim_norm + (1 - alpha) * (1 - dist_norm)

    # Rank on a copy carrying the scores instead of writing into the shared frame.
    ranked = df.assign(score=score).sort_values(by='score', ascending=False)

    return ranked.head(top_n)[[
        'Time (min)', 'Scanspeed (mm/s)', 'Fluence (J/cm2)',
        'DLS (nm)', 'UV VIS', 'score'
    ]]


In [None]:
# alpha=0. puts all the weight on the distance term of the score; the top rows
# are then re-ordered by time and fluence (ascending) for display.
recommend_by_combined_score(120, 0.7, alpha=0.).sort_values(by=['Time (min)', 'Fluence (J/cm2)'], ascending=True)

Unnamed: 0,Time (min),Scanspeed (mm/s),Fluence (J/cm2),DLS (nm),UV VIS,score
1222,18,3494,1.90283,121.210922,0.70277,1.0
481,21,3411,1.898108,121.210922,0.70277,1.0
564,23,3465,1.895562,121.210922,0.70277,1.0
741,23,3401,1.899205,121.210922,0.70277,1.0
863,24,3424,1.907763,121.210922,0.70277,1.0


## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

def inverse_knn_predictor(dls_target, uvvis_target, k=9):
    """
    Predicts [Time, Scanspeed, Fluence] using KNN given desired [DLS, UV VIS].

    Neighbours are searched in the normalized output space
    (DLS_norm, UVVIS_norm). On the raw columns the much larger numeric range
    of 'DLS (nm)' would dominate the distance and 'UV VIS' would have almost
    no influence on which neighbours are selected.

    Parameters
    ----------
    dls_target : float
        Desired value for the 'DLS (nm)' column.
    uvvis_target : float
        Desired value for the 'UV VIS' column.
    k : int, default 9
        Number of neighbours used by the regressor.

    Returns
    -------
    pandas.DataFrame
        A single row with the predicted 'Time (min)', 'Scanspeed (mm/s)' and
        'Fluence (J/cm2)'.

    Notes
    -----
    Reads the module-level `df` (including its *_norm columns) and
    `output_scaler`; the model is refitted on every call.
    """
    # Train inverse model: normalized outputs -> process settings
    X = df[['DLS_norm', 'UVVIS_norm']].values
    y = df[['Time (min)', 'Scanspeed (mm/s)', 'Fluence (J/cm2)']]

    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X, y)

    # Map the requested outputs into the same normalized space before querying.
    target_df = pd.DataFrame([[dls_target, uvvis_target]], columns=['DLS (nm)', 'UV VIS'])
    target = output_scaler.transform(target_df)
    predicted = model.predict(target)

    return pd.DataFrame(predicted, columns=['Time (min)', 'Scanspeed (mm/s)', 'Fluence (J/cm2)'])


In [None]:
# KNN estimate of the settings for a target of DLS = 120 and UV VIS = 0.7, using k = 5 neighbours.
inverse_knn_predictor(120, 0.7, k=5)

Unnamed: 0,Time (min),Scanspeed (mm/s),Fluence (J/cm2)
0,14.6,3291.8,1.902541
