In [65]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from sklearn.feature_selection import f_regression, f_classif, mutual_info_classif, SelectFromModel, SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

In [66]:
# Read the prepared dataset from the working directory, then preview
# its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='final_data.csv')
data.head(n=5)

Unnamed: 0,No,a (Å),delta,Tm (K),D_Tm (K),Hmix (kJ/mol),σHmix (kJ/mol),Sid (kJ/mol),Elec_nega,D_elec_nega,...,AMp,SSsp,FCCp,B2p,BCCp,HCPp,2BCCp,SSp,L12p,2FCCp
0,2,1.544,0.051293,1134.443333,142.109607,-3.555556,0.209513,0.636514,1.823333,0.150849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1.580308,0.009851,840.646923,311.708141,-1.893491,0.051817,0.666278,1.782308,0.116761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1.4752,0.069075,675.718,456.594686,-4.8,0.09798,0.673012,1.858,0.058788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,1.548,0.058183,1001.925,403.576498,-3.75,0.541266,0.562335,1.9,0.051962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,1.60075,0.000271,1000.9825,135.069652,-7.5,1.082532,0.562335,1.465,0.268468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Feature matrix: every column of `data` except the two target columns.
X = data.drop(['BulkModulus (GPa)', 'D_Bulk (GPa)'], axis='columns')

# Targets as plain arrays: y1 <- 'BulkModulus (GPa)', y2 <- 'D_Bulk (GPa)'.
y1, y2 = data['BulkModulus (GPa)'].values, data['D_Bulk (GPa)'].values

In [68]:
# Univariate F-test ranking of every feature in X against y1,
# i.e. the 'BulkModulus (GPa)' column.
# NOTE: the heading/label in this cell used to read "Friction", which did not
# match the target used here; re-run the cell to refresh any saved output.

# Fit the selector once. With k='all' it keeps every column, and it exposes the
# f_regression results as `scores_` (F-statistics) and `pvalues_`, so a separate
# f_regression(X, y1) call would only repeat the same computation.
sel = SelectKBest(f_regression, k='all').fit(X, y1)
f_values, p_values = sel.scores_, sel.pvalues_

# Ranking table, largest F-statistic first (p-value used as secondary sort key).
anova_test1 = (
    pd.DataFrame({"features": X.columns, "f_values": f_values, "p_values": p_values})
    .sort_values(by=["f_values", "p_values"], ascending=False)
    .reset_index(drop=True)
)

# k='all' drops nothing, so the support mask selects every column of X.
selected_feat = X.columns[sel.get_support()]
features_list = selected_feat.tolist()

print("-------------------------------------")
print("Feature List for BulkModulus (GPa):\n")
print(features_list)
print("-------------------------------------")
print("No of features:", len(features_list))

-------------------------------------
Feature List for Friction:

['No', 'a (Å)', 'delta', 'Tm (K)', 'D_Tm (K)', 'Hmix (kJ/mol)', 'σHmix (kJ/mol)', 'Sid (kJ/mol)', 'Elec_nega', 'D_elec_nega', 'VEC', 'd_VEC', 'Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'H', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir', 'K', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'Ns', 'O', 'Os', 'P', 'Pb', 'Pd', 'Pr', 'Pt', 'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Te', 'Ti', 'Tl', 'Tm', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr', 'IMsp', 'IMp', 'AMsp', 'AMp', 'SSsp', 'FCCp', 'B2p', 'BCCp', 'HCPp', '2BCCp', 'SSp', 'L12p', '2FCCp']
-------------------------------------
No of features: 103


In [69]:
# Preview the first five rows of the F-test table built for the y1 target.
anova_test1.head(n=5)

Unnamed: 0,features,f_values,p_values
0,a (Å),1885.752281,7.714943e-241
1,Elec_nega,1235.266947,4.480027e-182
2,Tm (K),651.885366,2.5863380000000003e-113
3,VEC,305.140318,1.680523e-60
4,SSsp,243.77862,8.544189999999999e-50


In [70]:
# Univariate F-test ranking of every feature in X against y2,
# i.e. the 'D_Bulk (GPa)' column.
# NOTE: the heading/label in this cell used to read "Friction", which did not
# match the target used here; re-run the cell to refresh any saved output.

# Fit the selector once. With k='all' it keeps every column, and it exposes the
# f_regression results as `scores_` (F-statistics) and `pvalues_`, so a separate
# f_regression(X, y2) call would only repeat the same computation.
sel = SelectKBest(f_regression, k='all').fit(X, y2)
f_values, p_values = sel.scores_, sel.pvalues_

# Ranking table, largest F-statistic first (p-value used as secondary sort key).
anova_test = (
    pd.DataFrame({"features": X.columns, "f_values": f_values, "p_values": p_values})
    .sort_values(by=["f_values", "p_values"], ascending=False)
    .reset_index(drop=True)
)

# k='all' drops nothing, so the support mask selects every column of X.
selected_feat = X.columns[sel.get_support()]
features_list = selected_feat.tolist()

print("-------------------------------------")
print("Feature List for D_Bulk (GPa):\n")
print(features_list)
print("-------------------------------------")
print("No of features:", len(features_list))

-------------------------------------
Feature List for Friction:

['No', 'a (Å)', 'delta', 'Tm (K)', 'D_Tm (K)', 'Hmix (kJ/mol)', 'σHmix (kJ/mol)', 'Sid (kJ/mol)', 'Elec_nega', 'D_elec_nega', 'VEC', 'd_VEC', 'Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'H', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir', 'K', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ne', 'Ni', 'Ns', 'O', 'Os', 'P', 'Pb', 'Pd', 'Pr', 'Pt', 'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Te', 'Ti', 'Tl', 'Tm', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr', 'IMsp', 'IMp', 'AMsp', 'AMp', 'SSsp', 'FCCp', 'B2p', 'BCCp', 'HCPp', '2BCCp', 'SSp', 'L12p', '2FCCp']
-------------------------------------
No of features: 103


In [71]:
# Preview the first five rows of the F-test table built for the y2 target.
anova_test.head(n=5)

Unnamed: 0,features,f_values,p_values
0,D_elec_nega,320.565804,4.061945e-63
1,D_Tm (K),216.025241,8.687131e-45
2,Zr,199.002622,1.158671e-41
3,FCCp,187.965045,1.296955e-39
4,delta,133.417913,3.272282e-29
