In [1]:
import os
import csv
import pickle
import pprint
import sqlite3
import pandas as pd

# Set every pandas display limit to None so that printed DataFrames are not
# truncated (rows, columns and cell width); a width of None lets pandas pick
# the output width itself.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# Directory holding the raw interaction files and the SQLite database.
# NOTE(review): hard-coded, machine-specific absolute path.
DATA = "/Users/andy/data/proteins"
# Row cap applied by load_raw_data() to each input file.
LIMIT = 5000
# IntAct export and BioGRID tab3 export read by load_raw_data().
intact_f = f"{DATA}/intact.txt"
biogrid_f = f"{DATA}/BIOGRID-ALL-4.4.222.tab3.txt"
# Opened at import time; nothing in the visible code reads from or writes to
# this connection yet, and it is never closed.
db = sqlite3.connect(f"{DATA}/proteins.db")


# Database prefixes removed from IntAct interactor identifiers. The same list
# is applied to both interactors so the sorted (a, b) key is symmetric.
_INTACT_ID_PREFIXES = (
    "uniprotkb:",
    "chebi:CHEBI:",
    "intact:EBI-",
    "ensembl:ENSG00000",
)


def _strip_id_prefixes(identifier):
    """Return *identifier* without double quotes and known database prefixes."""
    identifier = identifier.replace('"', "")
    for prefix in _INTACT_ID_PREFIXES:
        identifier = identifier.replace(prefix, "")
    return identifier


def _clean_accession(value):
    """Return a BioGRID accession field, mapping the "-" placeholder to ""."""
    value = (value or "").strip()
    # Only a lone "-" means "no accession"; hyphens inside a real accession
    # must be kept.
    return "" if value == "-" else value


def _load_intact(path, limit):
    """Read up to *limit* data rows of the tab-separated IntAct file at *path*."""
    seen = set()
    records = []
    with open(path, "r", newline="") as handle:
        # The file is tab-separated and fields may contain commas and quotes,
        # so parse on tabs and treat quote characters as ordinary text.
        reader = csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)
        headings = next(reader, None)
        if headings is None:
            return records
        for row_id, row in enumerate(reader, start=1):
            if row_id > limit:
                break
            if not row:
                continue
            record = dict(zip(headings, row))
            a = _strip_id_prefixes(record["#ID(s) interactor A"])
            b = _strip_id_prefixes(record["ID(s) interactor B"])
            # Sort so that A-B and B-A yield the same key.
            key = tuple(sorted((a, b)))
            record["akey"] = key[0]
            record["bkey"] = key[1]
            record["dup"] = key in seen
            seen.add(key)
            record["id"] = row_id
            records.append(record)
    return records


def _load_biogrid(path, limit):
    """Read up to *limit* data rows of the BioGRID tab3 file at *path*.

    Each input row yields two records: one keyed by the SWISS-PROT accessions
    and one keyed by the TREMBL accessions.
    """
    seen = set()
    records = []
    with open(path, "r", newline="") as handle:
        reader = csv.DictReader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row_id, row in enumerate(reader, start=1):
            if row_id > limit:
                break
            for source in ("SWISS-PROT", "TREMBL"):
                akey = _clean_accession(row[f"{source} Accessions Interactor A"])
                bkey = _clean_accession(row[f"{source} Accessions Interactor B"])
                # NOTE(review): unlike the IntAct key this pair is not sorted,
                # so A-B and B-A are treated as different keys -- confirm
                # whether that is intended before matching the two sources.
                key = (akey, bkey)
                # Copy the row: appending the same dict twice would make the
                # second assignment overwrite the first record's keys.
                record = dict(row)
                record["akey"] = akey
                record["bkey"] = bkey
                record["dup"] = key in seen
                seen.add(key)
                record["id"] = row_id
                records.append(record)
    return records


def load_raw_data():
    """Load and normalise the IntAct and BioGRID interaction files.

    Reads at most ``LIMIT`` data rows from ``intact_f`` and ``biogrid_f``.
    Every returned record is the source row as a dict with four extra keys:

    * ``akey`` / ``bkey`` -- normalised interactor identifiers,
    * ``dup`` -- True when the same (akey, bkey) pair was already seen in
      that file,
    * ``id`` -- 1-based position of the source row in its file.

    The first rows of both data sets are printed as DataFrames.

    Returns:
        tuple: ``(intact_processed, biogrid_processed)``, two lists of dicts.
    """
    intact_processed = _load_intact(intact_f, LIMIT)
    biogrid_processed = _load_biogrid(biogrid_f, LIMIT)
    print(pd.DataFrame(intact_processed).head())
    print(pd.DataFrame(biogrid_processed).head())
    return intact_processed, biogrid_processed


def match():
    """Unfinished matching stub ("not needed like this??").

    NOTE(review): this function is not called anywhere in the visible file
    and cannot run as written. It reads ``a_i``, ``b_i``, ``line``,
    ``intact_processed`` and ``biogrid_matches`` and increments
    ``match_number``, none of which are parameters or module-level names in
    the visible source, so a call is expected to fail with ``NameError``.
    It also calls ``intact_processed.keys()``, whereas ``load_raw_data``
    builds ``intact_processed`` as a list. It looks like a fragment lifted
    out of a loop body -- confirm, then give it parameters or delete it.
    """
    # Tag the current BioGRID line with a running match number when its
    # (a_i, b_i) pair is a key of intact_processed.
    if (a_i, b_i) in intact_processed.keys():
        print(f"MATCH {match_number}")
        match_number += 1
        line["match"] = match_number
        # intact_processed["key"]["match"] = match_number
        # print(f"{intact_processed['key']['match']}")
        # print(f"{intact_processed['key']['match']}")
        # continue
    # The same line is stored twice in biogrid_matches: under its "key" value
    # and under its "id" value.
    biogrid_matches[line["key"]] = line
    # pprint.pprint(biogrid_matches[line["key"]])
    biogrid_matches[line["id"]] = line


def _module_value(name, value):
    """Return *value*, or the module-level global called *name* if it is None.

    Raises:
        NameError: if *value* is None and no such global exists.
    """
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        raise NameError(f"name '{name}' is not defined") from None


def _entries(data):
    """Return the (key, value) pairs of a dict, or the items of any other iterable."""
    return list(data.items()) if isinstance(data, dict) else list(data)


def write_pickles(intact_processed=None, biogrid_matches=None):
    """Pickle both data sets to the working directory and print their contents.

    Writes ``intact_matches.pkl`` and ``biogrid_matches.pkl``, then
    pretty-prints every entry of each data set followed by a count.

    Args:
        intact_processed: IntAct data, as a dict or a list of records. When
            omitted, the module-level ``intact_processed`` is used, which
            keeps the original no-argument call working in a notebook.
        biogrid_matches: BioGRID data, handled the same way.

    Raises:
        NameError: if an argument is omitted and no module-level value of
            that name exists.
    """
    # Resolve the data before opening any file so a missing data set does
    # not leave an empty .pkl file behind.
    intact_processed = _module_value("intact_processed", intact_processed)
    biogrid_matches = _module_value("biogrid_matches", biogrid_matches)

    with open("intact_matches.pkl", "wb") as im:
        pickle.dump(intact_processed, im)
    with open("biogrid_matches.pkl", "wb") as bm:
        pickle.dump(biogrid_matches, bm)

    print("=" * 80)
    for label, data in (("Intact", intact_processed), ("Biogrid", biogrid_matches)):
        count = 0
        for item in _entries(data):
            count += 1
            pprint.pprint(item)
            print("-" * 80)
        print(f"{label} matches {count}")


"""add control total for all catgories"""


def read_files():
    """Load the two pickled data sets from the working directory.

    ``biogrid_matches.pkl`` is read first, then ``intact_matches.pkl``.
    Unpickling can execute arbitrary code, so only load files this program
    wrote itself.

    Returns:
        tuple: ``(intact_matches, biogrid_matches)``.
    """
    loaded = {}
    sources = (
        ("biogrid", "biogrid_matches.pkl"),
        ("intact", "intact_matches.pkl"),
    )
    for label, path in sources:
        with open(path, "rb") as handle:
            loaded[label] = pickle.load(handle)
    return loaded["intact"], loaded["biogrid"]


import pandas as pd


def display_menu():
    """Print the main menu and return the user's choice.

    Returns:
        str: the line read from standard input, with surrounding whitespace
        removed so that " 1" and "1" select the same option.
    """
    # The menu text previously contained a stray "option = input()" line that
    # was printed to the user; it has been removed from the string.
    print(
        """
          1. Load raw files
          2. Match protein data
          3. List match data
          4. Save match data
          5. Load match data
          9. Quit
          """
    )
    option = input().strip()
    return option


def main():
    """Show the menu once, run the selected action, then terminate.

    Only option "1" (load the raw files) is implemented. "9" quits silently;
    any other choice is reported as not implemented instead of being ignored.

    Raises:
        SystemExit: always, once the selected option has been handled.
    """
    option = display_menu()
    if option == "1":
        load_raw_data()
    elif option != "9":
        print(f"Option {option!r} is not implemented yet.")
    # Raise SystemExit directly: the bare exit() helper is installed by the
    # site module for interactive use and is not guaranteed to exist.
    raise SystemExit


# Start the interactive menu only when this file is run as a script, not
# when it is imported.
if __name__ == "__main__":
    main()


          1. Load raw files
          2. Match protein data
          3. List match data
          4. Save match data
          5. Load match data
          9. Quit
          option = input()
          
  #ID(s) interactor A  ID(s) interactor B  \
0    uniprotkb:P49418    uniprotkb:O43426   
1  intact:EBI-7121639    uniprotkb:P49418   
2  intact:EBI-7121654    uniprotkb:P49418   
3  intact:EBI-7121715    uniprotkb:P49418   
4    uniprotkb:P49418  intact:EBI-7121765   

                                                                                                                                                                  Alt. ID(s) interactor A  \
0  intact:EBI-7121510|uniprotkb:Q75MK5|uniprotkb:Q75MM3|uniprotkb:A4D1X9|intact:MINT-109264|uniprotkb:O43538|uniprotkb:A4D1X8|uniprotkb:Q75MJ8|uniprotkb:Q8N4G0|ensembl:ENSP00000348602.2   
1                                                                                                                                                  

: 