# View Structure Subsequences 

#### Set file name of the dataset, and run this cell once to initialize the program.
- Dataset columns should have "ProteinID" and "PeptideSequence"

In [None]:
from scripts.sequence_viewer import *

# Name of the dataset file; it is looked up inside the "data" folder.
filename = 'labled proteins data set.csv'

# Build the path first, then load the table into `data`.
dataset_path = os.path.join("data", filename)
data = pd.read_csv(dataset_path)

#### Choose any ProteinID to view selected sequences
- default color order is: red | yellow | blue | green | (repeated if there are more than 4 columns of intensities)
- color is determined by the maximum value among those columns
- if no numbers exist for the subsequence (only overnight digestion), it is colored cyan   

#### Set custom colors by modifying the colors list
- the last color is used when no value is present for any time
- use hexadecimal color codes: https://www.color-hex.com/

In [None]:
# ProteinID whose selected sequences should be displayed.
proteinID = 'P07339'

# One color per intensity column, in order; the last entry is used when a
# subsequence has no value for any time.
colors = ['red', 'yellow', 'blue', 'green', 'cyan']

# Pass table=False to hide the table preview.
getPepView(proteinID, data, table=True, colors=colors)

# Get GRAVY Differences

- By default, the program will assume that Table1, Table2, and the FASTA files are in the "data" folder
    - these locations can be customized to specify other paths
- Output will be saved in the same folder location as Table 1
- Alternatively, the same results can be obtained by running gravy_diff.py in the scripts folder
- Table1 and Table2 should have columns named "ProteinID" and "PeptideSequence"

In [None]:
from scripts.gravy_diff import *

# Set file names of the two peptide tables and the FASTA file.
table1 = "Table 1_Peptides sequence_human urine solution digestion .csv"
table2 = "Table 2_20211204_HUVariousDigestionTime_pr_matrix.csv"
fasta = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"

# Replace "data" with a custom path (in quotations) if the files are located in another folder.
table1Path = os.path.join("data", table1)
table2Path = os.path.join("data", table2)
fastaPath = os.path.join("data", fasta)

table1diff = getGRAVYdiffs(fastaPath, table1Path, table2Path)

# Append the results to table 2.
data2 = pd.read_csv(table2Path)
if "ProteinID" not in data2.columns:
    # Map the alternative column names onto the names used by the rest of the cell.
    data2 = data2.rename(columns={"PG.UniProtIds": "ProteinID", "Stripped.Sequence": "PeptideSequence"})

# A ProteinID may hold several ';'-separated IDs (the original comment calls this
# "handling isomers"); the tables are matched on the first ID only.
# The .str accessor keeps a missing ProteinID as NaN instead of raising on a float.
table1diff["UPID"] = table1diff["ProteinID"].str.split(';').str[0]
data2["UPID"] = data2["ProteinID"].str.split(';').str[0]

# One row of GRAVY results per UPID. Rows without a UPID are dropped so that a
# missing key in table 2 cannot be matched to a missing key in table 1.
gravy_columns = ["UPID", "ProteinSequence", "SequenceGRAVY", "GRAVYdifference", "GRAVYdifference2"]
subset = table1diff.dropna(subset=["UPID"]).drop_duplicates("UPID")[gravy_columns]
table2diff = pd.merge(data2, subset, on=["UPID"], how='left')

# Derive the output name from the path without its extension, so it does not
# depend on the extension being exactly four characters long.
outputPath = os.path.splitext(table2Path)[0] + '_GRAVY.csv'
table2diff.drop('UPID', axis=1).to_csv(outputPath, index=False)

# Get PeptideSequence Distances

#### Set the dataset file name and the FASTA file name, then run this cell to get peptide distances from their overall center of mass
- Initial dataset should include a "ProteinID" and "PeptideSequence" column
- When completed, the results will be saved as csv files in the output folder
- The output only keeps ProteinIDs that have results in the Protein Data Bank
- This process takes a long time to complete (some hours)

In [None]:
from protds.peptide_distances import *
from scripts.gravy_diff import process, pepPositions, peptidePercentage

filename = "labled proteins data set.csv"  # set file name
fastaname = "uniprot-human+taxonomy__Homo+sapiens+(Human)+[9606]_-filtered-revi--.fasta"  # set fasta name
# If the files are not in the "data" folder, replace "data" with the full path to that location.
fastapath = os.path.join("data", fastaname)
filepath = os.path.join("data", filename)

# All results are written to this folder; create it up front so that a missing
# folder does not abort the run at the first to_csv call.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Columns copied into the summary table for every digestion time.
summary_cols = ['GRAVYdifference', 'PeptidePercentage', 'MeanDistances', 'StdDistances']

data = pd.read_csv(filepath)
df = pepPositions(process(data, fastapath))

# Overnight digestion:
alltimes = pepCenterDist(peptidePercentage(gravyDiff(df)), True)
overnight = alltimes[0]
overnight.drop("PepMid", axis=1).to_csv(os.path.join(output_dir, "Overnight" + filename))
summary = pd.DataFrame(pd.unique(overnight["ProteinID"]), columns=["ProteinID"])

# Separate times:
times = ['1st 15min', '2nd 15min', '3rd 15min', '4th 15min']  # change to match the data's column names
# Skip the ProteinIDs listed in the second item returned by pepCenterDist
# (presumably the ones without usable results -- confirm against pepCenterDist).
df = df[~df['ProteinID'].isin(alltimes[1])]
for label in times:
    # Only rows that have a value in this time's column.
    subset = pepCenterDist(peptidePercentage(gravyDiff(df[df[label].notna()])))
    other_times = [t for t in times if t != label]
    subset.drop(other_times + ["PepMid"], axis=1).to_csv(os.path.join(output_dir, label + filename))

    # Summary table: one row per ProteinID. The original call discarded the result
    # of drop_duplicates; keep='last' reproduces what the dict built below already
    # did for repeated ProteinIDs, so the summary values are unchanged.
    per_protein = subset.drop_duplicates(subset=['ProteinID'], keep='last')
    for col in summary_cols:
        summary[label + col] = summary.ProteinID.map(dict(zip(per_protein["ProteinID"], per_protein[col].values)))

# Overnight columns are added after the per-time columns.
for col in summary_cols:
    summary["Overnight" + col] = summary.ProteinID.map(dict(zip(overnight["ProteinID"], overnight[col].values)))
summary.to_csv(os.path.join(output_dir, "Summary" + filename))