In [1]:
# Importa las bibliotecas necesarias
from pyspark.sql import SparkSession

# Crea una instancia de SparkSession
spark = SparkSession.builder.appName("Ejemplo").getOrCreate()

# Reemplaza 'ruta/al/archivo.txt' con la ruta completa de tu archivo
path_1 = 'generifs_basic\generifs_basic.txt'
#https://www.ncbi.nlm.nih.gov/gene/about-generif
path_2 = 'hiv_interactions\hiv_interactions.txt'
path_3 = 'hiv_siRNA_interactions\hiv_siRNA_interactions.txt'
path_4 = 'interactions\interactions.txt'


# Lee el archivo de texto en un DataFrame
df_gen_ba = spark.read.csv(path_1, header=True, inferSchema=True, sep='\t')
df_hiv_int = spark.read.csv(path_2, header=True, inferSchema=True, sep='\t')
df_hiv_sirna = spark.read.csv(path_3, header=True, inferSchema=True, sep='\t')
df_int = spark.read.csv(path_4, header=True, inferSchema=True, sep='\t')

# Muestra los primeros registros del DataFrame
#df_gen_ba.show(vertical=True, truncate=False)
#df_hiv_int.show(vertical=True, truncate=False)
#df_hiv_sirna.show(vertical=True, truncate=False)
#df_int.show(vertical=True, truncate=False)

# Mostrar el campo de los comentarios
#df_hiv_int.select("GeneRIF text").show(truncate=False)
#df_hiv_sirna.select("GeneRIF text").show(truncate=False)


In [4]:
from pyspark.sql.functions import col, desc, count, countDistinct, explode, split, size

In [5]:
path = 'biogrid\BIOGRID-ALL-4.4.229.tab3.txt.txt'
# Lee el archivo de texto en un DataFrame
df_all = spark.read.csv(path, header=True, inferSchema=True, sep='\t')

In [6]:
df_homo_sapiens = df_all.where( (col('Organism Name Interactor A') == 'Homo sapiens') & (col('Organism Name Interactor B') == 'Homo sapiens') )

In [7]:
df_homo_sapiens.count()

1119345

In [8]:
df_int.count()

4140525

In [9]:
df_homo_sapiens.show(2, vertical = True)

-RECORD 0--------------------------------------------------
 #BioGRID Interaction ID            | 103                  
 Entrez Gene Interactor A           | 6416                 
 Entrez Gene Interactor B           | 2318                 
 BioGRID ID Interactor A            | 112315               
 BioGRID ID Interactor B            | 108607               
 Systematic Name Interactor A       | -                    
 Systematic Name Interactor B       | -                    
 Official Symbol Interactor A       | MAP2K4               
 Official Symbol Interactor B       | FLNC                 
 Synonyms Interactor A              | JNKK|JNKK1|MAPKK4... 
 Synonyms Interactor B              | ABP-280|ABP280A|A... 
 Experimental System                | Two-hybrid           
 Experimental System Type           | physical             
 Author                             | Marti A (1997)       
 Publication Source                 | PUBMED:9006895       
 Organism ID Interactor A           | 96

In [10]:
df_int.show(2, vertical = True)

-RECORD 0-----------------------------------
 #tax_id             | 358                  
 gene_id             | 1224321              
 accn.vers2          | NP_059802.1          
 name3               | hypothetical prot... 
 keyphrase           | -                    
 tax_id              | -                    
 interactant_id      | -                    
 interactant_id_type | -                    
 accn.vers8          | AE008690.1           
 name9               | Agrobacterium tum... 
 complex_id          | -                    
 complex_id_type     | -                    
 complex_name        | -                    
 pubmed_id_list      | 15155952             
 last_mod            | 2005-01-18 14:45:00  
 generif_text        | VirB4 interacts w... 
 interaction_id      | 134324               
 interaction_id_type | BIND                 
-RECORD 1-----------------------------------
 #tax_id             | 358                  
 gene_id             | 1224322              
 accn.vers

In [11]:
resultado = df_int.join(df_homo_sapiens, (df_int["gene_id"] == df_homo_sapiens["Entrez Gene Interactor A"])\
                        & (df_int["interactant_id"] == df_homo_sapiens["Entrez Gene Interactor B"]), "inner")

In [12]:
resultado.count()

1308840

In [13]:
resultado.groupBy('interaction_id_type').agg(count('*').alias('conteo')).sort(col('conteo')).show()

+-------------------+-------+
|interaction_id_type| conteo|
+-------------------+-------+
|               BIND|  24423|
|               HPRD|  93400|
|            BioGRID|1191017|
+-------------------+-------+



In [14]:
resultado.groupBy('Experimental System Type').agg(count('*').alias('conteo')).sort(col('conteo')).show()

+------------------------+-------+
|Experimental System Type| conteo|
+------------------------+-------+
|                 genetic|  25292|
|                physical|1283548|
+------------------------+-------+



In [16]:
resultado.where(col('Experimental System Type') == 'genetic').show(2, vertical = True)

-RECORD 0--------------------------------------------------
 #tax_id                            | 9606                 
 gene_id                            | 15                   
 accn.vers2                         | -                    
 name3                              | -                    
 keyphrase                          | -                    
 tax_id                             | 9606                 
 interactant_id                     | 6755                 
 interactant_id_type                | GeneID               
 accn.vers8                         | -                    
 name9                              | -                    
 complex_id                         | -                    
 complex_id_type                    | -                    
 complex_name                       | -                    
 pubmed_id_list                     | 28319085             
 last_mod                           | 2019-04-07 11:20:00  
 generif_text                       | Ne

In [15]:
df_con_conteo = resultado.withColumn("conteo_de_publicaciones", size(split(col("pubmed_id_list"), ",")))
conteo_valores = df_con_conteo.groupBy("conteo_de_publicaciones").agg(count("*").alias("conteo_de_valores"))
conteo_valores.sort(col('conteo_de_valores').desc()).show()

+-----------------------+-----------------+
|conteo_de_publicaciones|conteo_de_valores|
+-----------------------+-----------------+
|                      1|           885579|
|                      2|           191501|
|                      3|            90871|
|                      4|            36523|
|                      5|            22110|
|                      6|            15352|
|                      7|            11231|
|                      8|             9529|
|                      9|             6513|
|                     10|             5348|
|                     11|             3903|
|                     12|             3635|
|                     13|             2760|
|                     14|             2105|
|                     15|             1822|
|                     16|             1483|
|                     17|             1309|
|                     18|             1131|
|                     19|              981|
|                     23|       

In [17]:
df_2 = df_con_conteo.where(~(col('conteo_de_publicaciones') == 1))

In [18]:
df_2.count()

423261

In [24]:
df_filtrado = df_2.drop_duplicates(['gene_id','interactant_id','Entrez Gene Interactor A','Entrez Gene Interactor B'])

In [25]:
df_filtrado.count()

83592

In [22]:
df_filtrado.show(3, vertical = True)

-RECORD 0--------------------------------------------------
 #tax_id                            | 9606                 
 gene_id                            | 2                    
 accn.vers2                         | -                    
 name3                              | -                    
 keyphrase                          | -                    
 tax_id                             | 9606                 
 interactant_id                     | 56983                
 interactant_id_type                | GeneID               
 accn.vers8                         | -                    
 name9                              | -                    
 complex_id                         | -                    
 complex_id_type                    | -                    
 complex_name                       | -                    
 pubmed_id_list                     | 28514442,33961781    
 last_mod                           | 2021-11-07 10:23:00  
 generif_text                       | Af

In [27]:
df_homo_sapiens.where( (col('Entrez Gene Interactor A') == '2' ) & (col('Entrez Gene Interactor B') == '56983' ) ).show(vertical = True)

-RECORD 0--------------------------------------------------
 #BioGRID Interaction ID            | 2260538              
 Entrez Gene Interactor A           | 2                    
 Entrez Gene Interactor B           | 56983                
 BioGRID ID Interactor A            | 106524               
 BioGRID ID Interactor B            | 121300               
 Systematic Name Interactor A       | -                    
 Systematic Name Interactor B       | MDS010               
 Official Symbol Interactor A       | A2M                  
 Official Symbol Interactor B       | POGLUT1              
 Synonyms Interactor A              | A2MD|CPAMD5|FWP00... 
 Synonyms Interactor B              | C3orf9|CLP46|KDEL... 
 Experimental System                | Affinity Capture-MS  
 Experimental System Type           | physical             
 Author                             | Huttlin EL (2017)    
 Publication Source                 | PUBMED:28514442      
 Organism ID Interactor A           | 96

In [28]:
# En la de Biogrid hay un registro para cada publicación, mientras que en la de interactions las publicaciones están en una lista.

In [29]:
from Bio import Entrez
from Bio import SeqIO

def obtener_secuencia_nucleotidos(identificador_gene):
    Entrez.email = "gcanomartin@alumni.unav.es"  # Es importante proporcionar tu correo electrónico a NCBI
    handle = Entrez.efetch(db="gene", id=identificador_gene, rettype="gb", retmode="text")
    gene_record = SeqIO.read(handle, "genbank")
    handle.close()
    
    # Extraer la secuencia de nucleótidos
    secuencia_nucleotidos = gene_record.seq
    
    return secuencia_nucleotidos

# Identificador Entrez Gene del gen de interés
identificador_gene = "2"

# Obtener y mostrar la secuencia de nucleótidos
secuencia = obtener_secuencia_nucleotidos(identificador_gene)
print("Secuencia de nucleótidos:")
print(secuencia)


ValueError: No records found in handle