In [2]:
from __future__ import division
import pandas as pd
import sys
from sys import argv
import re
import numpy as np
from nltk.tokenize import sent_tokenize
import glob
import os
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [3]:
def open_entities(ents_name):

	ents = pd.read_csv(ents_name, sep = '\t', names = ['Given_ID', 'UNUSED', 'Real_ID'])

	return ents

In [4]:
def open_nec_pairs(pairs_name):
    
    nec_pairs = pd.read_csv(pairs_name, sep = '\t', names = ["Entity_name_x","SerialNo_x", "Entity_name_y", "SerialNo_y", "Text"])
    
    return nec_pairs

In [5]:
entities = open_entities("./yeast/yeast_entities.tsv")
pairs_nec = open_nec_pairs("./Results/Pairs_With_Sentences_Only_nec.tsv")

In [6]:
pairs_nec[["SerialNo_x", "SerialNo_y"]] = pairs_nec[["SerialNo_x", "SerialNo_y"]].astype(int)

In [11]:
pairs_copy = pairs_nec.copy()
for i,r in pairs_copy.iterrows():
    
    #Add Real_ID's
    if r["SerialNo_x"] in set(entities["Given_ID"].tolist()) and r["SerialNo_y"] in set(entities["Given_ID"].tolist()):
        x = entities.loc[entities["Given_ID"] == r["SerialNo_x"], ("Real_ID")]
        y = entities.loc[entities["Given_ID"] == r["SerialNo_y"], ("Real_ID")]
        pairs_copy.set_value(i, "Real_ID_x", x.values[0])
        pairs_copy.set_value(i, "Real_ID_y", y.values[0])
        
            
    #Mask entity names in text
    try:
        sub1 = re.sub(r["Entity_name_x"], '', r["Text"])
        sub2 = re.sub(r["Entity_name_y"], '', sub1)
    except re.error:
        print "Couldn't mask ", r["Entity_name_y"], "\n", "Trying a different way"
        sub2 = sub1.replace(r["Entity_name_y"], "")
        print sub2
    
#     pairs_copy.set_value(i, "Text", sub2)

Couldn't mask  SOS1 
Trying a different way
Knockdown of PIP4K2A, CCT5, CMBL, , KMO and OPN3, genes within 200 kb up-/downstream of the 3 SNPs that were associated with SCLC overall survival (rs1778335, rs2662411 and rs7519667), significantly desensitized H196 to paclitaxel.
Couldn't mask  inorganic pyrophosphatase 
Trying a different way
BP variability also significantly decreased: of diurnal SBP/.8/4.1 mm Hg (21.7/24.7%), of nocturnal SBP/DBP - 2.8/4.1 mm Hg (17.9/19.6%).
Couldn't mask  superoxide dismutase [Cu-Zn 
Trying a different way
Among them, 9 proteins (translation elongation factor eEF-1 alpha chain,  III, annexin II, calcyclin, fab fragment anti-VEGF antibody, peroxiredoxin-2, ], stefin A3, and calgranulin-B) were common and showed similar expression pattern in glyphosate and TPA-treated mouse skin.
Couldn't mask  ORM1 
Trying a different way
Embryos cultured in 1.69mM arginine had lower SLC7A1 levels and a higher abundance of messages involved with glycolysis (hexokinase 1

KeyboardInterrupt: 

'teing'

In [22]:
bow_df = pairs_copy[["Real_ID_x", "Real_ID_y", "Text"]]

In [23]:
grouped = bow_df.groupby(["Real_ID_x", "Real_ID_y"])

In [24]:
grouped_text = grouped['Text'].apply(lambda x: ','.join(x.astype(str))).reset_index()

In [25]:
col_names = ["Real_ID_x", "Real_ID_y", "Text"]
temp_df = pd.DataFrame(columns = col_names)
exclude = []
for i,r in grouped_text.iterrows():
    pair = [r["Real_ID_x"], r["Real_ID_y"]]
    #Check if there is a reverse version of the pair
    if grouped_text.query('@pair[0] == Real_ID_y and @pair[1] == Real_ID_x').empty: 
        pass
    elif r.name not in exclude: #necessary to not catch the original
        temp = grouped_text.query('@pair[0] == Real_ID_y and @pair[1] == Real_ID_x')
        exclude.append(temp.index[0])
        temp_df = temp_df.append(temp, ignore_index = True)

#Switch column names
col_list = list(temp_df)
col_list[0], col_list[1] = col_list[1], col_list[0]
temp_df.columns = col_list
temp_df = temp_df[["Real_ID_x", "Real_ID_y", "Text"]]

KeyboardInterrupt: 

In [119]:
new_grouped_text = grouped_text.append(temp_df, ignore_index = True).groupby(["Real_ID_x", "Real_ID_y"])

In [120]:
grouped_final = new_grouped_text['Text'].apply(lambda x: ','.join(x.astype(str))).reset_index()

In [126]:
def open_interactions(interactions_name):
	"""Opens a TSV File Containing the Interactions from STRING DB

	INPUT: Name of the tsv file

	OUTPUT: Pandas DataFrame of the .tsv file"""
	
	interactions_file = pd.read_csv(interactions_name, sep = '\t')

	return interactions_file

In [148]:
def check_interaction(interactions, tries = 0):
    desired_interaction = raw_input("What interaction will be positive?: ")
    if desired_interaction in interactions["mode"].tolist():
        return desired_interaction
    elif tries < 10:
        print "There isn't such interaction. Try again"
        tries+=1
        check_interaction(interactions, tries)
    elif tries == 10:
        print "You are hopeless. Please check interaction again! Defaulting to physical binding!"
        desired_interaction = "binding"
        return desired_interaction

In [178]:
interactions = open_interactions("./4932.protein.actions.v10.txt/4932.protein.actions.v10.txt")


interactions["item_id_a"] = interactions["item_id_a"].str[5:]
interactions["item_id_b"] = interactions["item_id_b"].str[5:]
interactions = interactions[["item_id_a", "item_id_b", "mode"]]
desired_interaction = check_interaction(interactions, 0)

for i,r in grouped_final.iterrows():
    pre_mode = interactions.loc[interactions["item_id_a"] == r["Real_ID_x"]]
    mode = pre_mode.loc[pre_mode["item_id_b"] == r["Real_ID_y"], ("mode")]
    if mode.empty or desired_interaction not in mode.values:
        grouped_final.set_value(i, "Mode", 0)
    elif desired_interaction in mode.values:
        grouped_final.set_value(i, "Mode", 1)
        
grouped_final["Mode"] = grouped_final["Mode"].astype(int)
grouped_final = grouped_final[["Real_ID_x", "Real_ID_y", "Mode", "Text"]]

What interaction will be positive?: binding


In [30]:
def split_train_test(bow_df):

	data = bow_df[["Real_ID_x", "Real_ID_y", "Text"]]
	labels = bow_df["Mode"]

	test_s = input("What is the size of test? (0-1, float): ")
	ran_state = input("Set random state (int): ")

	data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=test_s, random_state=ran_state)

	return data_train.reset_index(drop=True), data_test.reset_index(drop=True), labels_train.reset_index(drop=True), labels_test.reset_index(drop=True)

In [243]:
data_train, data_test, labels_train, labels_test = split_train_test(grouped_final)

What is the size of test? (0-1, float): 0.3
Set random state (int): 1993


In [31]:
def texts_to_words( raw_text ):
    # Function to convert a raw text to a string of words
    # The input is a single string (a raw text), and 
    # the output is a single string (a preprocessed text)
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z0-9]", " ", raw_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [32]:
data_train, data_test, labels_train, labels_test = split_train_test(result_bow_df)

# Get the number of reviews based on the dataframe column size
num_texts = data_train["Text"].size
# Initialize an empty list to hold the clean reviews
print "Cleaning and parsing the training set article sentences...\n"
clean_train_texts = []
for i in xrange( 0, num_texts ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print "Texts %d of %d\n" % ( i+1, num_texts )                                                                    
    clean_train_texts.append( texts_to_words( data_train["Text"][i] ))

print "Creating the bag of words...\n"
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_texts)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

print "Training the random forest..."

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, labels_train)

num_texts = len(data_test["Text"])
clean_test_texts = [] 

print "Cleaning and parsing the test set movie reviews...\n"
for i in xrange(0,num_texts):
    if( (i+1) % 1000 == 0 ):
        print "Review %d of %d\n" % (i+1, num_texts)
    clean_texts = texts_to_words( data_test["Text"][i] )
    clean_test_texts.append( clean_texts )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_texts)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
print "Predicting based on model..."
result = forest.predict(test_data_features)
print "Done!"

What is the size of test? (0-1, float): 0.3
Set random state (int): 1993
Cleaning and parsing the training set article sentences...

Texts 1000 of 22558

Texts 2000 of 22558

Texts 3000 of 22558

Texts 4000 of 22558

Texts 5000 of 22558

Texts 6000 of 22558

Texts 7000 of 22558

Texts 8000 of 22558

Texts 9000 of 22558

Texts 10000 of 22558

Texts 11000 of 22558

Texts 12000 of 22558

Texts 13000 of 22558

Texts 14000 of 22558

Texts 15000 of 22558

Texts 16000 of 22558

Texts 17000 of 22558

Texts 18000 of 22558

Texts 19000 of 22558

Texts 20000 of 22558

Texts 21000 of 22558

Texts 22000 of 22558

Creating the bag of words...

Training the random forest...
Cleaning and parsing the test set movie reviews...

Review 1000 of 9669

Review 2000 of 9669

Review 3000 of 9669

Review 4000 of 9669

Review 5000 of 9669

Review 6000 of 9669

Review 7000 of 9669

Review 8000 of 9669

Review 9000 of 9669

Predicting based on model...
Done!


In [317]:
def get_accuracy(l_new, l_te):
    """Calculates the accuracy of predicted labels, based on the given labels

    INPUT: New(Predicted) Labels, Test Labels

    OUTPUT: Accuracy in percent """

    acc = 0
    for i in range(len(l_te)):
        if l_new[i] == l_te[i]:
            acc += 1
    acc = acc/len(l_te)

    return 1-acc

In [None]:
error = get_accuracy(pred_labels, labels_test)

In [282]:
error

0.15875478332816217

In [26]:
def open_bow_df(bow_name):
    
    bow_df_file = pd.read_csv(bow_name, sep = "\t")
    
    return bow_df_file

In [27]:
result_bow_df = open_bow_df("./Results/Bag_of_Words_df.tsv")

In [28]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='adam', alpha=1e-5, random_state=1)

model = clf.fit(train_data_features, labels_train)

NameError: name 'train_data_features' is not defined

In [324]:
pred_labels = clf.predict(test_data_features)