# Needed modules

In [112]:
import os
import pickle

import numpy as np
import pandas as pd

In [113]:
# https://kioku-space.com/en/jupyter-skip-execution/
from IPython.core.magic import register_cell_magic # type: ignore

@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: discard the cell body so none of it is executed."""
    return None

# Pickle save

In [114]:
%%skip
# =============================================================================
# Save the variables
# =============================================================================
# Mapping of variable name -> object to persist; empty until entries are added
variables_dict = dict()

# =============================================================================
# main function
# =============================================================================
def data_save_load(option, dict_variables=None):
    """
    Save or load a dictionary of notebook variables with pickle.

    The data lives in ``ipynb_db/variables.pkl`` relative to the current
    working directory; the folder is created if it does not exist yet.

    Parameters
    ----------
    option : str
        ``"save"`` writes ``dict_variables`` to disk, ``"load"`` reads the
        previously saved object back.
    dict_variables : dict, optional
        Mapping of variable name -> value to persist. Only used when
        ``option == "save"``.

    Returns
    -------
    dict or None
        The object read from disk when ``option == "load"``; ``None`` after
        a save.

    Raises
    ------
    ValueError
        If ``option`` is neither ``"save"`` nor ``"load"``.
    """
    # Fail loudly on typos such as "Save" instead of silently doing nothing
    if option not in ("save", "load"):
        raise ValueError(f"option must be 'save' or 'load', got {option!r}")

    path_folder = "ipynb_db"  # Folder to save variables
    os.makedirs(path_folder, exist_ok=True)  # Create folder if not exist
    path_file = os.path.join(path_folder, "variables.pkl")  # Path to save the variables

    if option == "save":
        with open(path_file, "wb") as f:
            pickle.dump(dict_variables, f)
        return None

    # option == "load": hand the stored object back to the caller so it can
    # actually be used (e.g. ``restored = data_save_load("load")``).
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(path_file, "rb") as f:
        return pickle.load(f)

# =============================================================================
# Call the function
# =============================================================================
# Persist the collected variables to disk
data_save_load(option="save", dict_variables=variables_dict)


# 1. Preparing data

Bear in mind that the original wikidata table is an Excel file with 36 sheets that are of interest to us. Apart from those 36 sheets, there are some that we don't want, such as "VARIOS" or "C-Sastre".

## 1.1 Load data

In [115]:
# Location of the Excel workbook that holds the wikidata table
# NOTE(review): absolute, machine-specific path - adjust it when running elsewhere
path_wiki_xlsx = (
    "/home/viskuit/Downloads/tmp/"
    "20240723142835_LinJPC-Wikipedia-2024.xlsx"
)

In [116]:
# Names of the workbook sheets to read: "Crom1" ... "Crom36"
needed_sheets = ["Crom" + str(number) for number in range(1, 37)]
print(needed_sheets)

['Crom1', 'Crom2', 'Crom3', 'Crom4', 'Crom5', 'Crom6', 'Crom7', 'Crom8', 'Crom9', 'Crom10', 'Crom11', 'Crom12', 'Crom13', 'Crom14', 'Crom15', 'Crom16', 'Crom17', 'Crom18', 'Crom19', 'Crom20', 'Crom21', 'Crom22', 'Crom23', 'Crom24', 'Crom25', 'Crom26', 'Crom27', 'Crom28', 'Crom29', 'Crom30', 'Crom31', 'Crom32', 'Crom33', 'Crom34', 'Crom35', 'Crom36']


In [117]:
# Load every requested sheet in one call; with a list of sheet names the
# result is a dict keyed by sheet name
pre_wiki_df = pd.read_excel(
    path_wiki_xlsx,
    sheet_name=needed_sheets,
    engine="openpyxl",
)

In [118]:
# Stack the per-sheet frames into a single table (fresh 0..n-1 index, original
# column order kept), then make sure missing entries are represented as NaN
wiki_df = (
    pd.concat(pre_wiki_df.values(), ignore_index=True, sort=False)
    .fillna(value=np.nan)
)

In [119]:
# Quick look at the merged frame: dimensions, column dtypes and the first rows
print(wiki_df.shape, wiki_df.dtypes, sep="\n")
wiki_df.head()

(8881, 30)
Etiqueta                  object
Wikidata                  object
Mendeley dataset          object
Description               object
Descripción               object
Otros nombres             object
LMJFC ortólogo            object
LmjF ortólogo             object
Función molecular         object
Referencia1               object
Referencia2               object
Otras Referencias         object
Gen                       object
CDS                       object
Proteina                  object
Notas                     object
Proteoma Experimental     object
LBRM2904 ortólogo         object
Wikidata-LbrM2904         object
LdHU3 ortólogo            object
Otras referencias         object
Proteína                  object
Gene                      object
OtrasReferencias          object
LbrM2904 ortólogo         object
LbrM ortólogo             object
Unnamed: 16              float64
Aminoácidos               object
Imagen                    object
Imagen Wiki               object

Unnamed: 0,Etiqueta,Wikidata,Mendeley dataset,Description,Descripción,Otros nombres,LMJFC ortólogo,LmjF ortólogo,Función molecular,Referencia1,...,Otras referencias,Proteína,Gene,OtrasReferencias,LbrM2904 ortólogo,LbrM ortólogo,Unnamed: 16,Aminoácidos,Imagen,Imagen Wiki
0,LINF_010005000,Q62005644,http://dx.doi.org/10.17632/m9mggnd5w4.1,Protein of unknown function (DUF2946),Proteína de función desconocida con dominio DU...,LinJ.01.0010,LMJFC_010005100,LmjF.01.0010,,,...,,,,,,,,,,
1,LINF_010005100,Q65246947,http://dx.doi.org/10.17632/d99ycn2bhb.1,Endonuclease/Exonuclease/phosphatase family,proteína de la familia de endonucleasa/exonucl...,LinJ.01.0020,LMJFC_010005200,LmjF.01.0020,,,...,,,,,,,,,,
2,LINF_010005200,Q65459375,http://dx.doi.org/10.17632/vsmrprnpb5.2,Kinesin-13,Kinesina-13,LinJ.01.0030,LMJFC_010005300,LmjF.01.0030,,,...,,,,,,,,,,
3,LINF_010005300,Q65461401,http://dx.doi.org/10.17632/78btgwdkx9.1,hypothetical protein - conserved,Proteína hipotética - conservada,LinJ.01.0040,LMJFC_010005400,LmjF.01.0040,,,...,,,,,,,,,,
4,LINF_010005400,Q65463295,http://dx.doi.org/10.17632/b79hrctkww.1,Propionyl-CoA carboxylase - α-subunit,Propionil-CoA carboxilasa - subunidad α,LinJ.01.0050,LMJFC_010005500,LmjF.01.0050,,"Lee, J.K.J., Liu, Y.T., Hu, J.J., Aphasizheva,...",...,,,,,,,,,,


## 1.2 Modify data

In [120]:
# Show the column names and their dtypes again before choosing which columns to keep
wiki_df.dtypes

Etiqueta                  object
Wikidata                  object
Mendeley dataset          object
Description               object
Descripción               object
Otros nombres             object
LMJFC ortólogo            object
LmjF ortólogo             object
Función molecular         object
Referencia1               object
Referencia2               object
Otras Referencias         object
Gen                       object
CDS                       object
Proteina                  object
Notas                     object
Proteoma Experimental     object
LBRM2904 ortólogo         object
Wikidata-LbrM2904         object
LdHU3 ortólogo            object
Otras referencias         object
Proteína                  object
Gene                      object
OtrasReferencias          object
LbrM2904 ortólogo         object
LbrM ortólogo             object
Unnamed: 16              float64
Aminoácidos               object
Imagen                    object
Imagen Wiki               object
dtype: obj

The only columns needed are the next ones:
* **Etiqueta**: should be a unique identifier for the SQL database, so I <mark>shall check if there are duplicates</mark>.
* **Description**: names to use for the element. If there are more than one, it's separated by "|".
* **Otros nombres**: old names when they were discovered first.
* **Orthologs**: different orthologs in *L. major*, *L. donovani*, *L. braziliensis*.
  * **LMJFC ortólogo**: old *L. major* strain ortholog.
  * **LmjF ortólogo**: current *L. major* strain ortholog.
  * **LBRM2904 ortólogo**: *L. braziliensis* ortholog? <mark>CHECK IT</mark>
  * **LdHU3 ortólogo**: *L. donovani* ortholog.
  * **LbrM2904 ortólogo**: *L. braziliensis* ortholog.
  * **LbrM ortólogo**: *L. braziliensis* ortholog.
* **Wikidata**: ID to search in wikidata.
* **Mendeley dataset**: ID for Mendeley data.  

Apart from that, we need another column for **Uniprot** searching.

We can see that we need a lot of columns, but we should be careful with:
* **Etiqueta**: in case there are duplicates.
* **Orthologs**: there are a lot of them and I don't know if all of them are useful.
* **Mendeley data**: two versions or what?

### 1.2.1 Take needed columns

In [121]:
# Columns kept for the database: identifier, names, external IDs and orthologs
needed_cols = [
    "Etiqueta",
    "Description",
    "Otros nombres",
    "Wikidata",
    "Mendeley dataset",
    "LmjF ortólogo",
    "LMJFC ortólogo",
    "LdHU3 ortólogo",
    "LBRM2904 ortólogo",
    "LbrM2904 ortólogo",
    "LbrM ortólogo",
]

# Take an explicit copy: df_main is modified later with `.loc[...] = ...`, and
# assigning into a plain selection of wiki_df raises SettingWithCopyWarning.
df_main = wiki_df[needed_cols].copy()
print(df_main.shape)
df_main.head()

(8881, 11)


Unnamed: 0,Etiqueta,Description,Otros nombres,Wikidata,Mendeley dataset,LmjF ortólogo,LMJFC ortólogo,LdHU3 ortólogo,LBRM2904 ortólogo,LbrM2904 ortólogo,LbrM ortólogo
0,LINF_010005000,Protein of unknown function (DUF2946),LinJ.01.0010,Q62005644,http://dx.doi.org/10.17632/m9mggnd5w4.1,LmjF.01.0010,LMJFC_010005100,LDHU3_01.0030,,,
1,LINF_010005100,Endonuclease/Exonuclease/phosphatase family,LinJ.01.0020,Q65246947,http://dx.doi.org/10.17632/d99ycn2bhb.1,LmjF.01.0020,LMJFC_010005200,LDHU3_01.0040,LBRM2904_01.0010,,
2,LINF_010005200,Kinesin-13,LinJ.01.0030,Q65459375,http://dx.doi.org/10.17632/vsmrprnpb5.2,LmjF.01.0030,LMJFC_010005300,LDHU3_01.0050,LBRM2904_01.0020,,
3,LINF_010005300,hypothetical protein - conserved,LinJ.01.0040,Q65461401,http://dx.doi.org/10.17632/78btgwdkx9.1,LmjF.01.0040,LMJFC_010005400,LDHU3_01.0060,LBRM2904_01.0030,,
4,LINF_010005400,Propionyl-CoA carboxylase - α-subunit,LinJ.01.0050,Q65463295,http://dx.doi.org/10.17632/b79hrctkww.1,LmjF.01.0050,LMJFC_010005500,LDHU3_01.0070,LBRM2904_01.0040,,


### 1.2.2 Check the *L. braziliensis* orthologs

Let's check all the conflicted columns:

In [122]:
# Candidate columns to compare for the L. braziliensis ortholog
braz_ortho_cols = ["LBRM2904 ortólogo", "LbrM2904 ortólogo", "LbrM ortólogo"]

In [123]:
# For each candidate column, report how many entries are missing and how often
# each value occurs; a separator line is printed between consecutive columns
for position, column_name in enumerate(braz_ortho_cols):
    if position > 0:
        print("\n", "="*50, "\n", sep="")
    print(f"Numpy NaN values: {df_main[column_name].isna().sum()}")
    print(df_main[column_name].value_counts())

Numpy NaN values: 8613
LBRM2904 ortólogo
-                    3
LBRM2904_06.0090     2
LBRM2904_05.0520     2
LBRM2904_05.0200     1
LBRM2904_05.0160     1
                    ..
LBRM2904_02.0140     1
LBRM2904_02.0070     1
LBRM2904_02.0030     1
LBRM2904_02.0010     1
LBRM2904_02.0390     1
Name: count, Length: 264, dtype: int64


Numpy NaN values: 8879
LbrM2904 ortólogo
LBRM2904_13.1180|LBRM2904_13.1190    1
LBRM2904_28.2560                     1
Name: count, dtype: int64


Numpy NaN values: 8875
LbrM ortólogo
LBRM2904_17.1610    1
LBRM2904_25.0530    1
LBRM2904_30.2040    1
LBRM2904_30.3180    1
LBRM2904_31.2420    1
LBRM2904_20.0020    1
Name: count, dtype: int64


We can observe the following:
* **LBRM2904 ortólogo**: has 8613 NaN values (the fewest of the three), so this is the main one to use.
* **LbrM2904 ortólogo**: except for 2 values, everything is NaN.
* **LbrM ortólogo**: except for 6 values, everything is NaN.

In [124]:
# Rows where "LbrM2904 ortólogo" has a value, restricted to the three candidate columns
df_main.loc[~df_main[braz_ortho_cols[1]].isna(), braz_ortho_cols]

Unnamed: 0,LBRM2904 ortólogo,LbrM2904 ortólogo,LbrM ortólogo
1696,,LBRM2904_13.1180|LBRM2904_13.1190,
5069,,LBRM2904_28.2560,


We can see that **LBRM2904 ortólogo** is NaN here. I think the data of **LbrM2904 ortólogo** should be in **LBRM2904 ortólogo**.

In [125]:
# Rows where "LbrM ortólogo" has a value, restricted to the three candidate columns
df_main.loc[~df_main[braz_ortho_cols[2]].isna(), braz_ortho_cols]

Unnamed: 0,LBRM2904 ortólogo,LbrM2904 ortólogo,LbrM ortólogo
2480,,,LBRM2904_17.1610
4016,,,LBRM2904_25.0530
5681,,,LBRM2904_30.2040
5794,,,LBRM2904_30.3180
6156,,,LBRM2904_31.2420
7063,,,LBRM2904_20.0020


My guess is the same as before: all this data should be in **LBRM2904 ortólogo**.

In [126]:
# Fill the gaps of "LBRM2904 ortólogo" with the values of the two other columns:
# "LbrM2904 ortólogo" takes precedence, then "LbrM ortólogo"
df_main.loc[:, "LBRM2904 ortólogo"] = (
    df_main["LBRM2904 ortólogo"]
    .combine_first(df_main["LbrM2904 ortólogo"])
    .combine_first(df_main["LbrM ortólogo"])
)

In [127]:
# Verify the merge: rows that had a "LbrM2904 ortólogo" value should now show it
# in "LBRM2904 ortólogo" as well
df_main.dropna(subset=[braz_ortho_cols[1]]).loc[:, braz_ortho_cols]

Unnamed: 0,LBRM2904 ortólogo,LbrM2904 ortólogo,LbrM ortólogo
1696,LBRM2904_13.1180|LBRM2904_13.1190,LBRM2904_13.1180|LBRM2904_13.1190,
5069,LBRM2904_28.2560,LBRM2904_28.2560,


In [128]:
# Same verification for the rows that had a "LbrM ortólogo" value
df_main.dropna(subset=[braz_ortho_cols[2]]).loc[:, braz_ortho_cols]

Unnamed: 0,LBRM2904 ortólogo,LbrM2904 ortólogo,LbrM ortólogo
2480,LBRM2904_17.1610,,LBRM2904_17.1610
4016,LBRM2904_25.0530,,LBRM2904_25.0530
5681,LBRM2904_30.2040,,LBRM2904_30.2040
5794,LBRM2904_30.3180,,LBRM2904_30.3180
6156,LBRM2904_31.2420,,LBRM2904_31.2420
7063,LBRM2904_20.0020,,LBRM2904_20.0020


Since it worked, let's remove the columns **LbrM2904 ortólogo** and **LbrM ortólogo**.

In [129]:
# Keep only the consolidated "LBRM2904 ortólogo" column; the other two are now redundant
df_main = df_main.drop(labels=braz_ortho_cols[1:], axis="columns")
df_main.head()

Unnamed: 0,Etiqueta,Description,Otros nombres,Wikidata,Mendeley dataset,LmjF ortólogo,LMJFC ortólogo,LdHU3 ortólogo,LBRM2904 ortólogo
0,LINF_010005000,Protein of unknown function (DUF2946),LinJ.01.0010,Q62005644,http://dx.doi.org/10.17632/m9mggnd5w4.1,LmjF.01.0010,LMJFC_010005100,LDHU3_01.0030,
1,LINF_010005100,Endonuclease/Exonuclease/phosphatase family,LinJ.01.0020,Q65246947,http://dx.doi.org/10.17632/d99ycn2bhb.1,LmjF.01.0020,LMJFC_010005200,LDHU3_01.0040,LBRM2904_01.0010
2,LINF_010005200,Kinesin-13,LinJ.01.0030,Q65459375,http://dx.doi.org/10.17632/vsmrprnpb5.2,LmjF.01.0030,LMJFC_010005300,LDHU3_01.0050,LBRM2904_01.0020
3,LINF_010005300,hypothetical protein - conserved,LinJ.01.0040,Q65461401,http://dx.doi.org/10.17632/78btgwdkx9.1,LmjF.01.0040,LMJFC_010005400,LDHU3_01.0060,LBRM2904_01.0030
4,LINF_010005400,Propionyl-CoA carboxylase - α-subunit,LinJ.01.0050,Q65463295,http://dx.doi.org/10.17632/b79hrctkww.1,LmjF.01.0050,LMJFC_010005500,LDHU3_01.0070,LBRM2904_01.0040
