# View Structure Subsequences 

#### Set file name of the dataset, and run this cell once to initialize the program.
- Dataset columns should have "ProteinID" and "PeptideSequence"

In [None]:
from scripts.sequence_viewer import *

# Name of the dataset file; it is looked up inside the "data" folder.
filename = 'labled proteins data set.csv'

# Build the path first, then load the table used by the viewer cell.
dataset_path = os.path.join("data", filename)
data = pd.read_csv(dataset_path)

#### Choose any ProteinID to view selected sequences
- default color order is: red | yellow | blue | green | (repeated if there are more than 4 columns of intensities)
- color is determined by the maximum value among those columns
- if no numbers exist for the subsequence (only overnight digestion), it is colored cyan   

#### Set custom colors by modifying the colors list
- the last color is used when no value is present for any time
- use hexadecimal color codes: https://www.color-hex.com/

In [None]:
# Highlight palette, in column order; the last entry is the color used when
# no value is present for any time.
colors = ['red', 'yellow', 'blue', 'green', 'cyan']

# ProteinID whose selected sequences should be displayed.
proteinID = 'P07339'

# Pass table=False to hide the table preview.
getPepView(proteinID, data, table=True, colors=colors)

# Get GRAVY Differences

- By default, the program will assume that Table1, Table2, and the FASTA files are in the "data" folder
    - these locations can be customized to specify other paths
- Output will be saved in the same folder location as Table 1
- Alternatively, the same results can be obtained by running gravy_diff.py in the scripts folder
- Table1 and Table2 should have columns named "ProteinID" and "PeptideSequence"

In [None]:
from scripts.gravy_diff import *

# Set file names.
table1 = "Table 1_Peptides sequence_human urine solution digestion .csv"
table2 = "Table 2_20211204_HUVariousDigestionTime_pr_matrix.csv"
fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path if the files are located in another folder (in quotations).
table1Path = os.path.join("data", table1)
table2Path = os.path.join("data", table2)
fastaPath = os.path.join("data", fasta)

table1diff = getGRAVYdiffs(fastaPath, table1Path, table2Path)

# Append results to table2.
data2 = pd.read_csv(table2Path)
if "ProteinID" not in data2.columns:
    # Table 2 may use these alternative headers; map them to the expected names.
    data2 = data2.rename(columns={"PG.UniProtIds": "ProteinID", "Stripped.Sequence": "PeptideSequence"})

# A ProteinID entry may hold several IDs separated by ';' -- match on the first one only.
# The vectorized .str accessor keeps a missing ProteinID as NaN instead of raising
# AttributeError the way calling .split on a float NaN would.
table1diff["UPID"] = table1diff["ProteinID"].str.split(";").str[0]
data2["UPID"] = data2["ProteinID"].str.split(";").str[0]

# One row of protein-level values per UPID. Rows without an ID are dropped from the
# lookup table because pandas merge would otherwise match NaN keys to each other.
gravy_columns = ["UPID", "ProteinSequence", "SequenceGRAVY", "GRAVYdifference", "GRAVYdifference2"]
subset = table1diff.dropna(subset=["UPID"]).drop_duplicates("UPID")[gravy_columns]
table2diff = pd.merge(data2, subset, on=["UPID"], how="left")

# Derive the output name from the real extension rather than assuming it is 4 characters long.
output_path = os.path.splitext(table2Path)[0] + "_GRAVY.csv"
table2diff.drop("UPID", axis=1).to_csv(output_path, index=False)

# Get Peptide Sequence Percentage
- PeptidePercentage = total length of all PeptideSequences of a protein / length of its full ProteinSequence (stored as a fraction, not multiplied by 100)
- input file should have "ProteinSequence" and "PeptideSequence" with no blank sequences
- if it does not, then it must have columns labeled "ProteinID" and "PeptideSequence", so ProteinSequence can be acquired

In [None]:
from scripts.gravy_diff import *

filename = "labled proteins data set.csv"  # set a file name
filepath = os.path.join("data", filename)  # replace "data" with a custom path if the files are located in another folder

fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"
fastaPath = os.path.join("data", fasta)
data = pd.read_csv(filepath)

# NOTE(review): process() is expected to supply the missing sequence column(s)
# from the FASTA file -- confirm against scripts.gravy_diff.
if "ProteinSequence" not in data.columns or "PeptideSequence" not in data.columns:
    data = process(data, fastaPath)

# For every ProteinID: summed length of its peptide sequences divided by the
# length of the first ProteinSequence in that group.
percentages = {}
for protein_id, group in data.groupby("ProteinID"):
    peptide_total = sum(len(seq) for seq in group["PeptideSequence"].values)
    protein_length = len(group["ProteinSequence"].values[0])
    percentages[protein_id] = peptide_total / protein_length
data['PeptidePercentage'] = data.ProteinID.map(percentages)

# Create the output folder on first use so to_csv does not fail on a fresh checkout,
# and strip the real extension instead of assuming it is 4 characters long.
os.makedirs("output", exist_ok=True)
output_name = os.path.splitext(filename)[0] + "_percentage.csv"
data.to_csv(os.path.join("output", output_name), index=False)

# Get PeptideSequence Distances

#### Set file and Fasta name and run this cell to get peptide distances from their overall center of mass
- Initial dataset should include a "ProteinID" and "PeptideSequence" column
- When completed, the results will be saved as a new csv file in the output folder
- The output only keeps ProteinIDs that have results in Protein Data Bank
- This process takes a long time to complete (some hours)

In [None]:
from protds.peptide_distances import *

# Set file names.
table1 = "labled proteins data set.csv"
table2 = "NA"  # if GRAVYdiff2 is needed, set Table2's name
fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path if the files are located in another folder (in quotations).
table1Path = os.path.join("data", table1)
table2Path = os.path.join("data", table2)
fastaPath = os.path.join("data", fasta)

data = pd.read_csv(table1Path)
if "PepMid" in data.columns:
    # The table already carries a PepMid column, so it is passed to pepDistances as-is.
    df = data
else:
    # Otherwise the table is first run through getGRAVYdiffs.
    # NOTE(review): the meaning of the trailing False argument is not visible here --
    # confirm against scripts.gravy_diff.
    from scripts.gravy_diff import *
    df = getGRAVYdiffs(fastaPath, table1Path, table2Path, False)

results = pepDistances(df)

# Single write path for both branches. The folder is created if missing so the
# long-running computation is not lost to a failed to_csv at the very end.
output_dir = os.path.join(os.getcwd(), "output")  # replace "output" with custom path if desired
os.makedirs(output_dir, exist_ok=True)
results.to_csv(os.path.join(output_dir, os.path.splitext(table1)[0] + '_distances.csv'))

results