In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from botorch.acquisition.active_learning import qNegIntegratedPosteriorVariance
from botorch.models.gp_regression import SingleTaskGP
from tqdm import tqdm
from torch import Tensor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer,StandardScaler
import os

from botorch.exceptions.warnings import BotorchTensorDimensionWarning, InputDataWarning 

warnings.filterwarnings(
            "ignore",
            message="Input data is not standardized.",
            category=InputDataWarning,
        )

In [3]:
os.getcwd()

'/Users/ramseyissa/Documents/GitHub/qNIPV/notebooks'

In [5]:
df = pd.read_csv('../datasets/citrine_thermal_conductivity.csv')
df

Unnamed: 0,formula,k_expt,k-units,k_condition,k_condition_units
0,BeS,157.0,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
1,CdS,19.9,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
2,GaN,181.0,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
3,ZnO,64.5,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
4,ZnSe,15.6,W/m.K,room temperature,"[{'name': 'Temperature', 'scalars': [{'value':..."
...,...,...,...,...,...
867,SiC,40.0,Wm$^{-1}$K$^{-1}$,1773,K
868,Al2O3,6.0,Wm$^{-1}$K$^{-1}$,1773,K
869,ZrO2,2.4,Wm$^{-1}$K$^{-1}$,1773,K
870,ThO2,2.0,Wm$^{-1}$K$^{-1}$,1773,K


In [7]:
df.head()
df.value_counts()

formula            k_expt    k-units            k_condition       k_condition_units                                                    
Sr0.61Ba0.39Nb2O6  1.6670    W/m$\cdot$K        300               K                                                                        3
TiO2               0.3800    W\m K              Room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'Room temperature'}]}]    3
CuBr               2.7500    W/m.K              room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'room temperature'}]}]    2
GaN                181.0000  W/m.K              room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'room temperature'}]}]    2
BeO                447.0000  W/m.K              room temperature  [{'name': 'Temperature', 'scalars': [{'value': 'room temperature'}]}]    2
                                                                                                                                          ..
CuCr0.97Mg0.03O2  

In [8]:
df['k_condition'].value_counts()

k_condition
300                 204
400                 187
700                 183
1000                129
room temperature     42
773                  25
373                  24
Room temperature     22
298                  19
1273                 19
1773                 10
Standard              8
Name: count, dtype: int64

In [10]:
for val in df['k_condition'].values:
    if val == 'room temperature':
        df['k_condition'] = df['k_condition'].replace(val, 300)
    elif val == 'Standard':
        df['k_condition'] = df['k_condition'].replace(val, 300)
    elif val == 'Room temperature':
        df['k_condition'] = df['k_condition'].replace(val, 300)
    else:
        pass

    

In [11]:
df['k_condition'].value_counts()

k_condition
300     204
400     187
700     183
1000    129
300      72
773      25
373      24
298      19
1273     19
1773     10
Name: count, dtype: int64

In [15]:
df['k_condition'].value_counts()

k_condition
300     204
400     187
700     183
1000    129
300      72
773      25
373      24
298      19
1273     19
1773     10
Name: count, dtype: int64

In [17]:
df.columns

Index(['formula', 'k_expt', 'k-units', 'k_condition', 'k_condition_units'], dtype='object')

In [18]:
df.head()
df.drop(columns=['k-units','k_condition_units'],inplace=True)
df.head()

Unnamed: 0,formula,k_expt,k_condition
0,BeS,157.0,300
1,CdS,19.9,300
2,GaN,181.0,300
3,ZnO,64.5,300
4,ZnSe,15.6,300


In [24]:
x = 0
for indx,row in df.iterrows():
    if row['k_condition'] == 300 or row['k_condition'] == 298:
        x += 1
print(x)
        
# convert the k_condition column to float
df['k_condition'] = df['k_condition'].astype(float)



72


In [29]:
df['k_condition'].value_counts()

#create mask for certain values in the k_condition column
mask = (df['k_condition'] == 300.0) | (df['k_condition'] == 298.0)
df_mask = df[mask]
df_mask


Unnamed: 0,formula,k_expt,k_condition
0,BeS,157.0,300.0
1,CdS,19.9,300.0
2,GaN,181.0,300.0
3,ZnO,64.5,300.0
4,ZnSe,15.6,300.0
...,...,...,...
796,SiO2,11.0,298.0
797,Al2O3,38.0,298.0
798,ZrO2,1.8,298.0
799,ThO2,14.0,298.0


In [30]:
df_mask.reset_index(drop=True,inplace=True)
df_mask

Unnamed: 0,formula,k_expt,k_condition
0,BeS,157.0,300.0
1,CdS,19.9,300.0
2,GaN,181.0,300.0
3,ZnO,64.5,300.0
4,ZnSe,15.6,300.0
...,...,...,...
290,SiO2,11.0,298.0
291,Al2O3,38.0,298.0
292,ZrO2,1.8,298.0
293,ThO2,14.0,298.0


In [31]:
df_mask.drop(columns=['k_condition'],inplace=True)
df_mask

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mask.drop(columns=['k_condition'],inplace=True)


Unnamed: 0,formula,k_expt
0,BeS,157.0
1,CdS,19.9
2,GaN,181.0
3,ZnO,64.5
4,ZnSe,15.6
...,...,...
290,SiO2,11.0
291,Al2O3,38.0
292,ZrO2,1.8
293,ThO2,14.0


In [41]:
type(df_mask['formula'].value_counts())

pandas.core.series.Series

In [44]:
df_mask['formula'].value_counts()

formula
TiO2                          22
Ba8Ga16Ge30                    4
Zn4Sb3                         4
SiC                            3
SiO2                           3
                              ..
Mo6Te6S2                       1
Ca0.7Y0.3MnO3                  1
Ca0.98Bi0.02Mn0.98Nb0.02O3     1
Ti0.98Nb0.02NiSn               1
MgO                            1
Name: count, Length: 233, dtype: int64

In [39]:
df_mask['formula'].nunique()

233

In [40]:
len(df_mask['formula'])

295

In [46]:
df_mask[df_mask['formula'] == 'TiO2']

Unnamed: 0,formula,k_expt
254,TiO2,0.44
255,TiO2,0.38
256,TiO2,0.53
257,TiO2,0.34
258,TiO2,1.11
259,TiO2,0.38
260,TiO2,0.59
261,TiO2,1.29
262,TiO2,1.05
263,TiO2,1.26


In [47]:
df_mask[df_mask['formula'] == 'TiO2']['k_expt'].values.mean()


0.7622727272727272

In [43]:
for vals in df_mask['formula'].value_counts():
    if vals > 1:
        print(vals)



22
4
4
3
3
3
3
3
3
3
3
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
