# Alignment

## Setup

In [None]:
#| default_exp align

In [None]:
#| export
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO

import subprocess
from pathlib import Path
import pandas as pd

## Alignment

In [None]:
#| export
def get_fasta(df,seq_col='kd_seq',id_col='kd_ID',path='out.fasta'):
    """Generate fasta file from sequences.

    Each row of `df` becomes one FASTA record: the value in `seq_col` is the
    sequence and the value in `id_col` is the record id (both are converted
    with `str`). The record description is left empty. The records are
    written to `path` and the number of records is printed.
    """
    # Create the destination folder up front, like run_clustalo does, so that
    # a nested target such as 'raw/kinase_domains.fasta' works on a fresh
    # checkout. Anything that is not a filesystem path (e.g. an open handle)
    # is passed through to SeqIO.write untouched.
    if isinstance(path, (str, Path)):
        Path(path).parent.mkdir(parents=True, exist_ok=True)

    # Walk the two columns directly rather than df.iterrows(): iterrows builds
    # a full Series for every row, which is slow and unnecessary here.
    records = [
        SeqRecord(Seq(str(seq)), id=str(seq_id), description="")
        for seq, seq_id in zip(df[seq_col], df[id_col])
    ]
    SeqIO.write(records, path, "fasta")
    print(len(records))

```python
get_fasta(kd,seq_col='kd_seq',id_col='kd_ID',path='raw/kinase_domains.fasta')
```

To run a Clustal Omega (`clustalo`) alignment, either use the terminal commands below or call the `run_clustalo` function.

```bash
sudo apt-get update
sudo apt-get install clustalo
clustalo -i kinase_domains.fasta -o kinase_domains.aln --force --outfmt=clu
```

In [None]:
#| export
def run_clustalo(input_fasta,  # .fasta fname
                 output_aln, # .aln output fname
                 outfmt="clu"): # value passed to clustalo's --outfmt option
    """Run Clustal Omega to perform multiple sequence alignment.

    Invokes the `clustalo` executable with `input_fasta` as input and
    `output_aln` as output, overwriting an existing output (`--force`).
    The parent directory of `output_aln` is created when missing.

    Raises `subprocess.CalledProcessError` when clustalo exits with a
    non-zero status (because of `check=True`).
    """
    # if the output directory does not exist, create one
    output_aln = Path(output_aln)
    output_aln.parent.mkdir(parents=True, exist_ok=True)

    # run clustalo; honour the caller's `outfmt` instead of always using "clu"
    subprocess.run([
        "clustalo", "-i", str(input_fasta),
        "-o", str(output_aln),
        "--force", f"--outfmt={outfmt}"
    ], check=True)

```python
run_clustalo("kinase_domains.fasta", "raw/kinase_domains.aln")
```

In [None]:
#| export
def aln2df(fname):
    "Read a Clustal-format alignment file into a DataFrame: one row per record, one column per alignment position."
    records = list(AlignIO.read(fname, "clustal"))
    # one list of single characters per aligned sequence, labelled by record id
    frame = pd.DataFrame(
        [list(str(rec.seq)) for rec in records],
        index=[rec.id for rec in records],
    )
    # shift the default 0-based column labels so positions start at 1
    frame.columns = frame.columns + 1
    return frame

```python
df = aln2df("raw/kinase_domains.aln")
```

In [None]:
#| export
def get_aln_freq(df):
    "Return, for every column of the aln2df output, the fraction of rows holding each distinct value."
    # count occurrences of every value column by column; values that never
    # appear in a given column come back as NaN and are set to 0
    per_column_counts = df.apply(pd.Series.value_counts, axis=0)
    per_column_counts = per_column_counts.fillna(0)
    # normalise each column by its own total so it sums to 1
    column_totals = per_column_counts.sum(axis=0)
    return per_column_counts.div(column_totals, axis=1)

```python
freq_df = get_aln_freq(df)
```

## End

In [None]:
#| hide
# Run nbdev's export step (nbdev.nbdev_export) whenever this cell is executed.
import nbdev; nbdev.nbdev_export()