# View Structure Subsequences 

#### Set file name of the dataset, and run this cell once to initialize the program.
- Dataset columns should have "ProteinID" and "PeptideSequence"

In [None]:
from scripts.sequence_viewer import *

# Name of the dataset file; it is looked up inside the "data" folder.
filename = 'labled proteins data set.csv'

# Build the path first, then load the dataset that getPepView receives below.
datasetPath = os.path.join("data", filename)
data = pd.read_csv(datasetPath)

#### Choose any ProteinID to view selected sequences
- default color order is: red | yellow | blue | green | (repeated if there are more than 4 columns of intensities)
- color is determined by the maximum value among those columns
- if no numbers exist for the subsequence (only overnight digestion), it is colored cyan   

#### Set custom colors by modifying the colors list
- the last color is used when no value is present for any time
- use hexadecimal color codes: https://www.color-hex.com/

In [None]:
# ProteinID whose subsequences should be displayed.
proteinID = 'P07339'
# One color per intensity column; the last entry is the fallback color
# used when no value is present for any time.
colors = ['red', 'yellow', 'blue', 'green', 'cyan']
# Set to False to hide the table preview.
showTable = True

getPepView(proteinID, data, colors=colors, table=showTable)

# Get GRAVY Differences

- By default, the program will assume that Table1, Table2, and the FASTA files are in the "data" folder
    - these locations can be customized to specify other paths
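- Output will be saved in the same folder location as Table 1
- The Table 2 results (Table 2 with the GRAVY columns appended) are saved next to Table 2, with `_GRAVY.csv` replacing the `.csv` extension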
- Alternatively, the same results can be obtained by running gravy_diff.py in the scripts folder
- Table1 and Table2 should have columns named "ProteinID" and "PeptideSequence"

In [None]:
from scripts.gravy_diff import *

# Names of the two peptide tables and of the FASTA file.
table1 = "Table 1_Peptides sequence_human urine solution digestion .csv"
table2 = "Table 2_20211204_HUVariousDigestionTime_pr_matrix.csv"
fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path (in quotations) if the files are located in another folder.
table1Path = os.path.join("data", table1)
table2Path = os.path.join("data", table2)
fastaPath = os.path.join("data", fasta)

table1diff = getGRAVYdiffs(fastaPath, table1Path, table2Path)

# Append the results to table2. The raw table is renamed to the
# "ProteinID"/"PeptideSequence" column names when it does not already use them.
data2 = pd.read_csv(table2Path)
if "ProteinID" not in data2.columns:
    data2 = data2.rename(
        columns={"PG.UniProtIds": "ProteinID", "Stripped.Sequence": "PeptideSequence"}
    )

# A ProteinID entry may hold several ';'-separated IDs; only the first one
# is used as the join key (stored in a temporary "UPID" column).
for frame in (table1diff, data2):
    frame["UPID"] = [entry.split(';')[0] for entry in frame["ProteinID"].values]

# One row of protein-level GRAVY values per UPID, left-joined onto table2.
gravyColumns = ["UPID", "ProteinSequence", "SequenceGRAVY", "GRAVYdifference", "GRAVYdifference2"]
subset = table1diff.drop_duplicates("UPID")[gravyColumns]
table2diff = pd.merge(data2, subset, on=["UPID"], how='left')

# Written next to table2; the last four characters (".csv") are replaced by "_GRAVY.csv".
gravyOutputPath = table2Path[:-4] + '_GRAVY.csv'
table2diff.drop('UPID', axis=1).to_csv(gravyOutputPath, index=False)

# Get Peptide Sequence Percentage

In [None]:
from scripts.gravy_diff import *

# Names of the dataset and of the FASTA file.
filename = "labled proteins data set.csv"
fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path if the files are located in another folder.
filepath = os.path.join("data", filename)
fastaPath = os.path.join("data", fasta)

data = pd.read_csv(filepath)
# process() is only run when one of the two sequence columns is missing.
# NOTE(review): presumably it derives them from the FASTA file - confirm in scripts/gravy_diff.py.
if "ProteinSequence" not in data.columns or "PeptideSequence" not in data.columns:
    data = process(data, fastaPath)
data = peptidePercentage(data)

# Make sure the output folder exists; to_csv fails on a missing directory.
os.makedirs("output", exist_ok=True)
# The last four characters of the file name (".csv") are replaced by "_Percentage.csv".
data.to_csv(os.path.join("output", filename[:-4]) + "_Percentage.csv", index=False)

# Get PeptideSequence Distances

#### Set the data file and FASTA file names, then run this cell to get peptide distances from their overall center of mass
- Initial dataset should include a "ProteinID" and "PeptideSequence" column
- When completed, the results will be saved as csv files in the output folder
- The output only keeps ProteinIDs that have results in Protein Data Bank
- This process takes a long time to complete (some hours)

In [None]:
from protds.peptide_distances import *
from scripts.gravy_diff import process, pepPositions, peptidePercentage

filename = "labled proteins data set.csv"  # set file name
fastaname = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"  # set fasta name

# If the files are not in the "data" folder, replace "data" with the full path to that location.
fastapath = os.path.join("data", fastaname)
filepath = os.path.join("data", filename)

df = pepPositions(process(pd.read_csv(filepath), fastapath))

# Change to match the data's column names.
times = ['1st 15min', '2nd 15min', '3rd 15min', '4th 15min', 'OvernightDigestion']

try:
    getCenterDistByTime(filename, df, times)
except TypeError:
    # Recovery path: remove every entry of `proteins` whose `structures` is None,
    # then retry once. A second TypeError is not caught and propagates.
    # NOTE(review): presumably the TypeError comes from such structure-less entries - confirm.
    # The keys are collected first so the mapping is not modified while being iterated.
    invalid = [pid for pid, protein in proteins.items() if protein.structures is None]
    for pid in invalid:
        del proteins[pid]
    getCenterDistByTime(filename, df, times)

# Plane 1:
"Use the sequence in table 2 to define plane 1, calculate the distance of each sequenced peptide to this plane. (The sequence with the highest intensity at 1st 15min should be on this plane.) ... the distance of the other sequences to this plane should be minimum.  "

In [None]:
from protds.peptide_distances import *
from scripts.gravy_diff import process, pepPositions, filterSequences

# Set names for the data and FASTA files.
filename = "Table 2_20211204_HUVariousDigestionTime_pr_matrix.csv"
fastaname = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path (in quotations) if the files are located in another folder.
filepath = os.path.join("data", filename)
fastapath = os.path.join("data", fastaname)

# Results: process() is told which raw columns hold the protein IDs and the
# peptide sequences; filterSequences() returns a pair whose first item is
# used for the plane and whose second item is saved as the excluded rows.
data2 = process(pd.read_csv(filepath), fastapath, "PG.UniProtIds", "Stripped.Sequence")
filtered = filterSequences(data2)
table2 = pepPositions(filtered[0])

try:
    planeDists = pepPlaneDist(table2, True).rename(columns={"DistanceToPlane": "DistanceToPlane1"})
except TypeError:
    # Recovery path: remove every entry of `proteins` whose `structures` is None,
    # then retry once. A second TypeError is not caught and propagates.
    # NOTE(review): presumably the TypeError comes from such structure-less entries - confirm.
    invalid = [pid for pid, protein in proteins.items() if protein.structures is None]
    for pid in invalid:
        del proteins[pid]
    planeDists = pepPlaneDist(table2, True).rename(columns={"DistanceToPlane": "DistanceToPlane1"})

# Make sure both destination folders exist; to_csv fails on a missing directory.
os.makedirs("output", exist_ok=True)
os.makedirs("filtered", exist_ok=True)

# The "PepMid" column is left out of the saved file only; planeDists itself keeps it.
planeDists.drop("PepMid", axis=1).to_csv(os.path.join("output", filename[:-4]+'_Plane1.csv'))
filtered[1].to_csv(os.path.join("filtered", filename[:-4]+'_Plane1_Excluded.csv'))

### Angle between Plane1 and Plane2

In [None]:
# This cell reuses fastapath, filename, data2 and planeDists from the Plane 1 cell above.
table1 = "Table 1_Peptides sequence_human urine solution digestion .csv"  # used for plane2
table1Path = os.path.join("data", table1)

# table1-table2 sequences: rows of table 1 whose (UPID, PeptideSequence) pair
# does not occur in table 2. UPID is the first ';'-separated ID of ProteinID.
data1 = pepPositions(filterSequences(process(pd.read_csv(table1Path), fastapath))[0])
data1["UPID"] = [entry.split(';')[0] for entry in data1["ProteinID"].values]
data2["UPID"] = [entry.split(';')[0] for entry in data2["ProteinID"].values]
merged = pd.merge(
    data1,
    data2,
    on=["UPID", "PeptideSequence"],
    how="outer",
    suffixes=('', '_2'),
    indicator=True,
)
# Keep the table-1-only rows, restricted to table 1's own columns.
data1Minus2 = merged.query('_merge=="left_only"')[data1.columns]

try:
    angles = getAngles(planeDists, data1Minus2)
except TypeError:
    # Recovery path: remove every entry of `proteins` whose `structures` is None,
    # then retry once. A second TypeError is not caught and propagates.
    # NOTE(review): presumably the TypeError comes from such structure-less entries - confirm.
    invalid = [pid for pid, protein in proteins.items() if protein.structures is None]
    for pid in invalid:
        del proteins[pid]
    angles = getAngles(planeDists, data1Minus2)

# Make sure the output folder exists; to_csv fails on a missing directory.
os.makedirs("output", exist_ok=True)
# NOTE(review): this is the same path the Plane 1 cell writes to, so that file
# is overwritten here - confirm this is intended.
angles.drop("PepMid", axis=1).to_csv(os.path.join("output", filename[:-4]+'_Plane1.csv'))

# Plane 2: 
"For each protein in table 1 (human urine solution digestion), after removing the peptides sequences found in table 2, use the left sequences to define plane 2, calculate the distance of each sequenced peptide to this plane. "

In [None]:
from protds.peptide_distances import *
from scripts.gravy_diff import process, pepPositions, filterSequences

# Set file names.
table1 = "Table 1_Peptides sequence_human urine solution digestion .csv"
table2 = "Table 2_20211204_HUVariousDigestionTime_pr_matrix.csv"
fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path (in quotations) if the files are located in another folder.
table1Path = os.path.join("data", table1)
table2Path = os.path.join("data", table2)
fastaPath = os.path.join("data", fasta)

# table1-table2 sequences: filterSequences() returns a pair whose first item
# is used for the plane and whose second item is saved as the excluded rows.
filtered1 = filterSequences(process(pd.read_csv(table1Path), fastaPath))
data1 = pepPositions(filtered1[0])
data2 = pd.read_csv(table2Path)
if "ProteinID" not in data2.columns:
    data2 = data2.rename(
        columns={"PG.UniProtIds": "ProteinID", "Stripped.Sequence": "PeptideSequence"}
    )

# UPID is the first ';'-separated ID of ProteinID and serves as the join key.
data1["UPID"] = [entry.split(';')[0] for entry in data1["ProteinID"].values]
data2["UPID"] = [entry.split(';')[0] for entry in data2["ProteinID"].values]

# Keep the rows of table 1 whose (UPID, PeptideSequence) pair is absent from
# table 2, restricted to table 1's own columns.
merged = pd.merge(
    data1,
    data2,
    on=["UPID", "PeptideSequence"],
    how="outer",
    suffixes=('', '_2'),
    indicator=True,
)
df = merged.query('_merge=="left_only"')[data1.columns]

try:
    diffdist = pepPlaneDist(df, False).drop(["PepMid", "UPID"], axis=1).rename(columns={"DistanceToPlane": "DistanceToPlane2"})
except TypeError:
    # Recovery path: remove every entry of `proteins` whose `structures` is None,
    # then retry once. A second TypeError is not caught and propagates.
    # NOTE(review): presumably the TypeError comes from such structure-less entries - confirm.
    invalid = [pid for pid, protein in proteins.items() if protein.structures is None]
    for pid in invalid:
        del proteins[pid]
    diffdist = pepPlaneDist(df, False).drop(["PepMid", "UPID"], axis=1).rename(columns={"DistanceToPlane": "DistanceToPlane2"})

# Make sure both destination folders exist; to_csv fails on a missing directory.
os.makedirs("output", exist_ok=True)
os.makedirs("filtered", exist_ok=True)

diffdist.to_csv(os.path.join("output", table1[:-4]+'_Plane2.csv'))
filtered1[1].to_csv(os.path.join("filtered", table1[:-4]+'_Plane2_Excluded.csv'))