In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# open the training text and split it by || delimiter
with open("training_text", 'r') as f:
    data = np.array([line.split('||') for line in f])

In [3]:
# open variants and split by ,
with open("training_variants", 'r') as f:
    variants = np.array([line.split(",") for line in f])

In [4]:
#size
data.shape, variants.shape

((3322,), (3322, 4))

In [5]:

r, c = np.unique([len(d) for d in data], return_counts=True)
[x for x in zip(r, c)]

[(1, 1), (2, 3321)]

In [6]:
# add it all to a dataframe, use headers in variants as names
df = pd.DataFrame()
df["text"] = [d[1] for d in data[1:]]
for i, h in enumerate(variants[0]):
    df[h] = [v[i] for v in variants[1:]]

In [7]:
df

Unnamed: 0,text,ID,Gene,Variation,Class
0,Cyclin-dependent kinases (CDKs) regulate a var...,0,FAM58A,Truncating Mutations,1\n
1,Abstract Background Non-small cell lung canc...,1,CBL,W802*,2\n
2,Abstract Background Non-small cell lung canc...,2,CBL,Q249E,2\n
3,Recent evidence has demonstrated that acquired...,3,CBL,N454D,3\n
4,Oncogenic mutations in the monomeric Casitas B...,4,CBL,L399V,4\n
5,Oncogenic mutations in the monomeric Casitas B...,5,CBL,V391I,4\n
6,Oncogenic mutations in the monomeric Casitas B...,6,CBL,V430M,5\n
7,CBL is a negative regulator of activated recep...,7,CBL,Deletion,1\n
8,Abstract Juvenile myelomonocytic leukemia (JM...,8,CBL,Y371H,4\n
9,Abstract Juvenile myelomonocytic leukemia (JM...,9,CBL,C384R,4\n


In [8]:
# get the counts of words in the first document
r, c = np.unique(re.split(' ',df["text"].iloc[0]), return_counts = True)
sample_word_freq = [x for x in zip(r, c)]
sample_word_freq.sort(key=lambda x: x[1], reverse=True)
# look at the top 50
sample_word_freq[:50]

[('the', 224),
 ('of', 209),
 ('and', 208),
 ('in', 160),
 ('a', 128),
 ('M', 84),
 ('cyclin', 82),
 ('to', 79),
 ('with', 76),
 ('CDK10', 64),
 ('We', 54),
 ('that', 52),
 ('or', 46),
 ('ETS2', 44),
 ('by', 43),
 ('we', 41),
 ('(Fig.', 39),
 ('expression', 39),
 ('is', 39),
 ('cells', 35),
 ('STAR', 33),
 ('as', 33),
 ('protein', 32),
 ('FAM58A', 30),
 ('The', 29),
 ('levels', 28),
 ('on', 28),
 ('from', 27),
 ('mM', 26),
 ('1', 25),
 ('Fig.', 24),
 ('an', 24),
 ('are', 23),
 ('which', 23),
 ('cell', 21),
 ('CDK10/cyclin', 19),
 ('for', 19),
 ('was', 18),
 ('Western', 17),
 ('at', 17),
 ('proteins', 17),
 ('', 16),
 ('MCF7', 16),
 ('between', 16),
 ('interaction', 16),
 ('kinase', 16),
 ('not', 16),
 ('silencing', 16),
 ('analysis', 15),
 ('expressed', 15)]

In [9]:
# remove line breaks
df["class"] = [re.findall(r'\d+', v)[0] for v in df["Class\n"]]

In [10]:
# remove extra columns
df = df.drop("Class\n",1).drop("ID", 1)

In [11]:
df

Unnamed: 0,text,Gene,Variation,class
0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1
1,Abstract Background Non-small cell lung canc...,CBL,W802*,2
2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2
3,Recent evidence has demonstrated that acquired...,CBL,N454D,3
4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4
5,Oncogenic mutations in the monomeric Casitas B...,CBL,V391I,4
6,Oncogenic mutations in the monomeric Casitas B...,CBL,V430M,5
7,CBL is a negative regulator of activated recep...,CBL,Deletion,1
8,Abstract Juvenile myelomonocytic leukemia (JM...,CBL,Y371H,4
9,Abstract Juvenile myelomonocytic leukemia (JM...,CBL,C384R,4


In [12]:
# count the lasses
classes, c = np.unique(df["class"], return_counts=True)
[x for x in zip(classes, c)]

[('1', 568),
 ('2', 452),
 ('3', 89),
 ('4', 686),
 ('5', 242),
 ('6', 275),
 ('7', 953),
 ('8', 19),
 ('9', 37)]

In [13]:
classes

array(['1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype=object)

In [14]:
#how many genes
genes,c = np.unique(df["Gene"], return_counts=True)
genes_count = [x for x in zip(genes, c)]

len(genes_count)

264

In [15]:
# how many variants
var,c = np.unique(df["Variation"], return_counts=True)
var_counts = [x for x in zip(var, c)]

len(var_counts)

2996

In [16]:
len(genes)

264

In [17]:
genes[:30]

array(['ABL1', 'ACVR1', 'AGO2', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'APC', 'AR',
       'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2',
       'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXL', 'B2M',
       'BAP1', 'BARD1', 'BCL10', 'BCL2', 'BCL2L11', 'BCOR'], dtype=object)

In [18]:
var[:20]

array(['1_2009trunc', '2010_2471trunc', '256_286trunc', "3' Deletion",
       '385_418del', '422_605trunc', '533_534del', '534_536del',
       '550_592del', '560_561insER', '596_619splice', '963_D1010splice',
       '981_1028splice', 'A1020V', 'A1022E', 'A1065T', 'A1066V', 'A1099T',
       'A111P', 'A1131T'], dtype=object)

In [19]:
# how many of the top 50 papers mention the gene
len([1 for i in range(50) if df["Gene"].iloc[i] in re.split(' ',df["text"].iloc[i])])

47

In [20]:
# how many times is the gene mentioned in each paper
genes_per_paper = [len(re.findall(df["Gene"].iloc[i], df["text"].iloc[i])) for i in range(100)]

sorted(genes_per_paper)[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [21]:
# max number of times a gene is mentioned per paper
sorted(genes_per_paper, reverse=True)[:10]

[476, 454, 443, 411, 386, 386, 360, 336, 315, 309]

In [168]:
# same with variants
vars_per_paper = [len(re.findall(df["Variation"].iloc[i], df["text"].iloc[i])) for i in range(100)]

In [170]:
sorted(vars_per_paper, reverse=True)[:10]

[143, 112, 107, 93, 49, 49, 45, 34, 33, 32]

In [171]:
sorted(vars_per_paper)[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [23]:
# numbers and text need more processing - probably a library somewhere to save time
def simplify_text(text):   
         return [("".join([c.lower() for c in t if c not in ",.\"()[]{}"])) for t in re.split(' ', text)]
    

In [24]:
test = simplify_text(df["text"].iloc[0])

In [25]:
# have another look at the word counts
words_0, c = np.unique(test, return_counts= True)
counts = [x for x in zip(words_0,c)]

In [30]:
sorted(counts, reverse=True, key=lambda x: x[1])[:50]

[('the', 253),
 ('and', 212),
 ('of', 210),
 ('in', 171),
 ('a', 147),
 ('we', 95),
 ('m', 92),
 ('cyclin', 89),
 ('to', 89),
 ('cdk10', 76),
 ('with', 76),
 ('fig', 63),
 ('ets2', 53),
 ('that', 53),
 ('cells', 52),
 ('or', 49),
 ('by', 45),
 ('expression', 43),
 ('is', 39),
 ('as', 38),
 ('protein', 36),
 ('levels', 35),
 ('fam58a', 33),
 ('star', 33),
 ('1', 31),
 ('on', 30),
 ('from', 27),
 ('mm', 26),
 ('an', 25),
 ('proteins', 24),
 ('are', 23),
 ('which', 23),
 ('cell', 22),
 ('silencing', 22),
 ('these', 22),
 ('for', 21),
 ('syndrome', 21),
 ('kinase', 20),
 ('cdk10/cyclin', 19),
 ('at', 18),
 ('supplementary', 18),
 ('was', 18),
 ('2', 17),
 ('5', 17),
 ('analysis', 17),
 ('interaction', 17),
 ('western', 17),
 ('', 16),
 ('4', 16),
 ('between', 16)]

In [27]:
# look at the next paper..
text_1 = simplify_text(df["text"].iloc[1])
words_1, c = np.unique(text_1, return_counts=True)
counts = [x for x in zip(words_1,c)]

In [28]:
sorted(counts, reverse=True, key = lambda x: x[1])

[('the', 281),
 ('and', 204),
 ('in', 203),
 ('of', 183),
 ('c-cbl', 153),
 ('were', 98),
 ('to', 80),
 ('with', 68),
 ('mutations', 66),
 ('that', 65),
 ('for', 62),
 ('', 61),
 ('a', 60),
 ('lung', 56),
 ('was', 54),
 ('cell', 45),
 ('egfr', 43),
 ('mutation', 43),
 ('is', 41),
 ('cells', 38),
 ('at', 37),
 ('as', 35),
 ('met', 33),
 ('samples', 33),
 ('we', 33),
 ('cancer', 32),
 ('or', 31),
 ('loh', 29),
 ('are', 27),
 ('figure', 26),
 ('patients', 26),
 ('by', 24),
 ('from', 24),
 ('using', 24),
 ('1', 23),
 ('domain', 23),
 ('on', 23),
 ('also', 21),
 ('an', 21),
 ('mutants', 19),
 ('not', 19),
 ('tumor', 18),
 ('taiwanese', 17),
 ('2', 16),
 ('different', 16),
 ('have', 16),
 ('it', 16),
 ('this', 16),
 ('three', 16),
 ('region', 15),
 ('3', 14),
 ('5', 14),
 ('activity', 14),
 ('be', 14),
 ('had', 14),
 ('shown', 14),
 ('study', 14),
 ('these', 14),
 ('analysis', 13),
 ('cancers', 13),
 ('dna', 13),
 ('finger', 13),
 ('q249e', 13),
 ('ring', 13),
 ('transfected', 13),
 ('w802*'

In [29]:
[x for x in words_0 if x in words_1][:20] # which words do the papers share

['',
 '\n',
 '01%',
 '02',
 '05%',
 '1',
 '10',
 '10%',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '15%',
 '16',
 '17',
 '18',
 '19',
 '1a']

In [40]:
gene_1 = df["Gene"].iloc[1]

In [52]:
# which genes are mentioned .... i think this code is wrong.
gene_counts = []
for i in range(100):
    
    text = simplify_text(df["text"].iloc[i])
    gene = simplify_text(df["Gene"].iloc[i])

    freq = [x for x in text if x == gene]
    if len(freq) > 0:
        gene_counts.append((gene, len(shape)))

In [53]:
gene_counts

[]