In [5]:
import pandas as pd

# Load the workbook, ignoring its first three rows.
sheet = pd.read_excel("Doc_mnemonic.xlsx", skiprows=3)

# The first remaining data row carries the real column captions: use it as
# the header and keep only the rows below it, renumbered from 0.
captions = sheet.iloc[0]
df = sheet.iloc[1:].reset_index(drop=True)
df.columns = captions

# Discard rows in which every cell is NaN.
df = df.dropna(how='all')

# Preview the first 3 rows.
df.head(3)

Unnamed: 0,"Регистрируемые (расчетные) параметры, единицы измерения",Name method,русское,латиницей,Baker Hughes,Halliburton,Schlumberger,Pathfinder,Sperry-Sun,Weatherford (Reeves),Синонимы
0,"Разность потенциалов электрического поля, мВ","Spontaneous Potential Logging, mV",ПС,PS,SP,SP,SP,-,-,"SP, SPR, SPCG, SPDL, SPLL, SPRL, CGSP","СП, PS, RSP"
1,"Разность потенциалов электрического поля, мВ","Spontaneous Gradient Potential Logging, mV",ПСГ,SPGL,-,-,-,-,-,-,-
2,"Разность потенциалов электрического поля, мВ","Electrode Potential Logging, mV",ЭП,EPL,-,-,-,-,-,-,-


In [6]:
# Use '-' as the placeholder for every missing cell.
df = df.fillna('-')


def _join_tail_cells(row):
    """Join the row's cells from position 4 onward into one comma-separated string."""
    return ','.join(row.iloc[4:].astype(str))


# Store the joined text as a new column.
df['Merged Text'] = pd.Series(df.apply(_join_tail_cells, axis=1)).fillna('')

# Remove the columns at positions 4..10 now that their text lives in 'Merged Text'.
df = df.drop(columns=df.columns[4:11])



In [7]:
# Split a merged cell string into its individual entries
def process_merged_text(text):
    """Split a comma-separated string into a list of meaningful entries.

    Parameters
    ----------
    text : str or missing value
        Comma-separated entries in which '-' marks an absent entry.

    Returns
    -------
    list of str
        The entries in their original order with surrounding whitespace
        removed. Blank entries and the '-' placeholder are left out.
        An empty list is returned when `text` is missing (None / NaN).
    """
    if pd.isna(text):
        return []
    # Trim before comparing so that padded placeholders such as ' -' are
    # recognised, and so that empty fragments (from ',,') are not kept.
    entries = (part.strip() for part in text.split(','))
    return [entry for entry in entries if entry and entry != '-']

# Replace every merged string in 'Merged Text' with its list of entries,
# then show the resulting frame.
df['Merged Text'] = df['Merged Text'].map(process_merged_text)
df

Unnamed: 0,"Регистрируемые (расчетные) параметры, единицы измерения",Name method,русское,латиницей,Merged Text
0,"Разность потенциалов электрического поля, мВ","Spontaneous Potential Logging, mV",ПС,PS,"[SP, SP, SP, SP, SPR, SPCG, SPDL, SPLL, S..."
1,"Разность потенциалов электрического поля, мВ","Spontaneous Gradient Potential Logging, mV",ПСГ,SPGL,[]
2,"Разность потенциалов электрического поля, мВ","Electrode Potential Logging, mV",ЭП,EPL,[]
3,"Разность потенциалов электрического поля, мВ","Two-electrode Potential Logging, mV",КПГП,TEPL,[]
4,"Разность потенциалов вызванной поляризации, мВ","Induced Potential Logging, mV",ВП,IPL,[]
...,...,...,...,...,...
706,Кривая восстановления давления,-,КВД,KVD,[]
707,Кривая стабилизации давления,-,КСД,KSD,[]
708,Кривая восстановления температуры,-,КВТ,KVT,[]
709,Кривая стабилизации температуры,-,КСТ,KST,[]


In [8]:
import re 

# Matches "<head> (<inside>)" anchored at the start of an entry.
_PAREN_RE = re.compile(r'(.*?)\s*\((.*?)\)')


def split_and_clean(arr):
    """Trim entries and split "head (inside)" entries into two items.

    Parameters
    ----------
    arr : list of str
        Entries to clean. Any non-list argument yields an empty list.

    Returns
    -------
    list of str
        For an entry that starts with text followed by a parenthesised part,
        the text before the parenthesis and the text inside it (in that
        order); otherwise the entry itself. Every item is stripped of
        surrounding whitespace, and items that end up empty are omitted.
    """
    if not isinstance(arr, list):
        return []

    cleaned_parts = []
    for part in arr:
        match = _PAREN_RE.match(part)
        # NOTE(review): only the first parenthesised group is used and any
        # text after its closing ')' is discarded - confirm this is intended.
        fragments = match.groups() if match else (part,)
        for fragment in fragments:
            fragment = fragment.strip()
            # Entries like "(X)", "GR ()" or "" would otherwise contribute
            # empty strings to the result.
            if fragment:
                cleaned_parts.append(fragment)
    return cleaned_parts
# Run split_and_clean over the list stored in each row of 'Merged Text'.
df['Merged Text'] = df['Merged Text'].map(split_and_clean)


In [9]:
# Short names for the five columns, assigned in positional order.
short_names = ["RU_COM", "ENG_COM", "RU", "ENG", "Merged"]
df.columns = short_names

In [10]:
# Show the frame so the current column names and contents can be checked.
df

Unnamed: 0,RU_COM,ENG_COM,RU,ENG,Merged
0,"Разность потенциалов электрического поля, мВ","Spontaneous Potential Logging, mV",ПС,PS,"[SP, SP, SP, SP, SPR, SPCG, SPDL, SPLL, SPRL, ..."
1,"Разность потенциалов электрического поля, мВ","Spontaneous Gradient Potential Logging, mV",ПСГ,SPGL,[]
2,"Разность потенциалов электрического поля, мВ","Electrode Potential Logging, mV",ЭП,EPL,[]
3,"Разность потенциалов электрического поля, мВ","Two-electrode Potential Logging, mV",КПГП,TEPL,[]
4,"Разность потенциалов вызванной поляризации, мВ","Induced Potential Logging, mV",ВП,IPL,[]
...,...,...,...,...,...
706,Кривая восстановления давления,-,КВД,KVD,[]
707,Кривая стабилизации давления,-,КСД,KSD,[]
708,Кривая восстановления температуры,-,КВТ,KVT,[]
709,Кривая стабилизации температуры,-,КСТ,KST,[]


In [11]:
# Lookup tables from a short name to its descriptive text:
#   rus_com: "RU"  -> "RU_COM"
#   eng_com: "ENG" -> "ENG_COM"
# When a short name occurs in several rows, the last row's text is kept.
rus_com = dict(zip(df["RU"], df["RU_COM"]))
eng_com = dict(zip(df["ENG"], df["ENG_COM"]))

In [12]:
# Lookup tables from any known spelling of a name to its canonical form:
#   rus: spelling -> the row's "RU" value
#   eng: spelling -> the row's "ENG" value
rus = {}
eng = {}

# Spellings that carry no information: blank text and the '-' value that
# was substituted for missing cells.
_PLACEHOLDER_KEYS = ('', '-')

for _, row in df.iterrows():
    # Every spelling recorded for this row: the "ENG" value, each item of
    # the "Merged" list, and the "RU" value.
    spellings = [row['ENG']] + row['Merged'] + [row['RU']]

    for spelling in spellings:
        # Remove empty strings and placeholders so they never become keys.
        if isinstance(spelling, str) and spelling.strip() in _PLACEHOLDER_KEYS:
            continue
        # A spelling seen in several rows keeps the mapping of the last row.
        rus[spelling] = row['RU']
        eng[spelling] = row['ENG']
        

In [13]:
import json



from pathlib import Path

# Write each lookup table to Json/<name>.json.
output_dir = Path("Json")
# Create the target directory when it is missing so the export also works
# on a fresh checkout instead of raising FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)

tables = {
    "eng_com": eng_com,
    "rus_com": rus_com,
    "eng": eng,
    "rus": rus,
}

for stem, table in tables.items():
    # json.dump escapes non-ASCII characters by default, so the file content
    # is plain ASCII; the explicit encoding only removes the dependency on
    # the platform's default.
    with open(output_dir / f"{stem}.json", 'w', encoding="utf-8") as json_file:
        json.dump(table, json_file)
