In [1]:
# Converts raw pattern data and labels into training datasets
import glob
import numpy as np
import os
from IPython.display import clear_output

# input: directory holding the individual ICSD calculated XRD .txt files
xrdDir = "database_xrds/ExampleXRDs/"

# output folder, named after the 2-theta configuration
config2Theta = "HighRes2Theta_5to90"
outDir = f"database_datasets/{config2Theta}/ExampleSet/"
# exist_ok=True replaces the separate os.path.exists() check, which could
# race with another process creating the folder in between.
os.makedirs(outDir, exist_ok=True)

# output: one .csv file for 7-way labels
labels7Dir = outDir + "labels7.csv"
# output: one .csv file for 230-way labels
labels230Dir = outDir + "labels230.csv"
# output: one .csv file for features
featuresDir = outDir + "features.csv"
# output: one file for ICSD ids
idsDir = outDir + "ids.csv"

# Create the output files empty (truncating leftovers from a previous run);
# the main loop later opens them in append mode.
for outputPath in (labels7Dir, labels230Dir, featuresDir, idsDir):
    with open(outputPath, "w"):
        pass

# Create the list of CIF entries; every entry that has a calculated XRD
# pattern contributes one row to each of the four output files.
ListCIFs = os.listdir('CIFs')

# Layout of a pattern file as indexed below (0-based line numbers):
#   line 1      -> 7-way label in the second whitespace-separated field
#   line 2      -> 230-way label in the second field
#   lines 503.. -> 8500 consecutive rows whose second field is a feature value
numFeatures = 8500
firstFeatureLine = 3 + 500


def _oneHotRow(labelField, numClasses, labelName, fileName):
    """Return a (1, numClasses) integer one-hot row for a 1-based label.

    Raises ValueError when the label is outside 1..numClasses. Without the
    check, a label of 0 or below would wrap around through negative indexing
    and silently mark the wrong class.
    """
    label = int(labelField)
    if not 1 <= label <= numClasses:
        raise ValueError(
            f"{fileName}: {labelName} label {label} is outside 1..{numClasses}"
        )
    row = np.zeros((1, numClasses), dtype=int)
    row[0, label - 1] = 1
    return row


count = 0
failCount = 0
# main loop; the output files are opened once instead of once per entry
with open(labels7Dir, "a") as labels7, \
        open(labels230Dir, "a") as labels230, \
        open(featuresDir, "a") as features, \
        open(idsDir, "a") as ids:
    for entry in ListCIFs:
        # convert cleaned index to pattern index
        fileName = entry.strip("[\n']").replace(".cif", ".txt")
        # locate the pattern (os.path.join avoids a doubled "/" separator)
        fileDir = os.path.join(xrdDir, fileName)
        count += 1
        print(f"{fileName} count:{count} failCount:{failCount}")
        clear_output(wait=True)
        # cleaned index might have cifs that failed to calculate an XRD;
        # skip entries without a corresponding pattern file
        if not os.path.isfile(fileDir):
            failCount += 1
            continue

        # read the pattern file
        with open(fileDir) as xrdFile:
            xrdLines = xrdFile.readlines()

        # Parse everything BEFORE writing anything: if the pattern is short or
        # malformed the error is raised here, and no output file receives a
        # partial row that would misalign labels, features and ids.
        if len(xrdLines) < firstFeatureLine + numFeatures:
            raise ValueError(
                f"{fileName}: expected at least "
                f"{firstFeatureLine + numFeatures} lines, got {len(xrdLines)}"
            )
        labels7Array = _oneHotRow(xrdLines[1].split()[1], 7, "7-way", fileName)
        labels230Array = _oneHotRow(
            xrdLines[2].split()[1], 230, "230-way", fileName
        )
        featureLines = xrdLines[firstFeatureLine:firstFeatureLine + numFeatures]
        featuresVector = np.array(
            [float(line.split()[1]) for line in featureLines], dtype=float
        ).reshape(1, numFeatures)
        # scale by 1000 and store as integers (astype truncates toward zero)
        featuresVector = (featuresVector * 1000).astype(int)

        # append one row per output file
        np.savetxt(labels7, labels7Array, fmt="%d", delimiter=",")
        np.savetxt(labels230, labels230Array, fmt="%d", delimiter=",")
        np.savetxt(features, featuresVector, fmt="%d", delimiter=",")
        ids.write(fileName.replace(".txt", "").strip())
        ids.write("\n")
print(failCount)

0
