# Comparative genomic analysis of *Arothron* species

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import pandas as pd

%matplotlib inline

In [None]:
# Per-sample metadata table; it is reused by the plotting cells below.
df = pd.read_csv("Aro_samples.csv")

# Preview only the identifying columns (the notebook displays the last expression).
df.loc[:, ['sample_name', 'sci_name', 'label']]

## Principal component analysis
#### using [PCAngsd](https://github.com/Rosemeis/pcangsd) v0.95 ([Meisner & Albrechtsen 2018](https://doi.org/10.1534/genetics.118.301336))

In [None]:
# Figure styling for the PCA panels: "talk" context on a white grid with dashed lines.
sns.set_context('talk')
sns.set_style("whitegrid", {'grid.linestyle': '--'})

# Header-less matrix read from the PCAngsd ".cov" output file.
dfcov = pd.read_table("Aro_PCAngsd_v0.95.cov", header=None)
C = dfcov.values

# Eigendecomposition, then reorder the components by decreasing eigenvalue.
eigVals, eigVecs = np.linalg.eigh(C)
idx = np.argsort(eigVals)[::-1]
eigVals, eigVecs = eigVals[idx], eigVecs[:, idx]

# Share of the eigenvalue sum carried by each of the first five components.
pc1, pc2, pc3, pc4, pc5 = (eigVals[k] / eigVals.sum() for k in range(5))

for k, frac in enumerate((pc1, pc2, pc3, pc4, pc5), start=1):
    print("pc" + str(k) + ":" + str(frac))

# Store each sample's coordinate on the first five components next to its metadata.
for k in range(1, 6):
    df["PC" + str(k)] = eigVecs[:, k - 1]

def pca_plot(pcx, pcy):
    """Draw one annotated PCA scatter panel on a new 5x5-inch figure.

    Every row of the notebook-level ``df`` is drawn as a single point at
    (``row[pcx]``, ``row[pcy]``) and annotated with its ``label`` value,
    set in italics through matplotlib mathtext.  Per-sample styling is read
    from the ``color``, ``marker``, ``markersize``, ``linewidth``, ``alpha``
    and ``h_align`` columns of ``df``; the ``xx`` / ``yy`` columns give the
    annotation offset from the point, in points.

    Axis labels, limits and saving are left to the caller, which acts on the
    current figure after this function returns.

    Parameters
    ----------
    pcx, pcy : str
        Names of the ``df`` columns plotted on the x and y axes.

    Returns
    -------
    bool
        Always ``True``.
    """
    plt.figure(figsize=(5, 5))

    for _, row in df.iterrows():
        # Raw string: "\i" is not a valid Python escape sequence, so the
        # non-raw spelling triggers an invalid-escape warning.  The resulting
        # text is unchanged.
        label = r"$\it{" + row['label'] + "}$"
        point = (row[pcx], row[pcy])

        plt.scatter(point[0], point[1],
                    color=row['color'],
                    marker=row['marker'],
                    s=row['markersize'],
                    linewidth=row['linewidth'],
                    alpha=row['alpha'],
                    label=label)
        plt.annotate(label,
                     xy=point,
                     xytext=(row['xx'], row['yy']),
                     textcoords='offset points',
                     horizontalalignment=row['h_align'],
                     verticalalignment='bottom',
                     alpha=1.0,
                     fontsize=12)

    return True

# Explained-variance fraction of every component that appears on an axis.
pc_fraction = {'PC1': pc1, 'PC2': pc2, 'PC3': pc3, 'PC4': pc4}

# One entry per saved panel:
# (x component, y component, x limits, y limits, explicit x ticks, explicit y ticks).
# A tick entry of None keeps the automatic tick positions and only sets the font size.
pca_panels = [
    ('PC1', 'PC2', (-0.35, 0.65), (-0.4, 0.4),
     [-0.2, 0.0, 0.2, 0.4, 0.6], None),
    ('PC2', 'PC3', (-0.4, 0.4), (-0.3, 1.1),
     None, [-0.2, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0]),
    ('PC3', 'PC4', (-0.3, 1.1), (-0.3, 0.75),
     None, None),
]

for pc_x, pc_y, x_range, y_range, x_ticks, y_ticks in pca_panels:
    pca_plot(pc_x, pc_y)

    # Axis titles carry the percentage of variance, one decimal place.
    plt.xlabel("{} ({:.1f}%)".format(pc_x, pc_fraction[pc_x] * 100), fontsize=18)
    plt.ylabel("{} ({:.1f}%)".format(pc_y, pc_fraction[pc_y] * 100), fontsize=18)
    plt.xlim(*x_range)
    plt.ylim(*y_range)

    if x_ticks is None:
        plt.xticks(fontsize=12)
    else:
        plt.xticks(x_ticks, fontsize=12)

    if y_ticks is None:
        plt.yticks(fontsize=12)
    else:
        plt.yticks(y_ticks, fontsize=12)

    plt.savefig("Aro_PCAngsd_{}-{}.svg".format(pc_x, pc_y), bbox_inches='tight')

plt.show()

## Admixture analysis
#### using [NGSadmix](http://www.popgen.dk/software/index.php/NgsAdmix) v32 ([Skotte et al. 2013](https://doi.org/10.1534/genetics.113.154138)) and [ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) v0.918 ([Korneliussen et al. 2014](https://doi.org/10.1186/s12859-014-0356-4))

In [None]:
# Stacked bar plots of the NGSadmix ancestry proportions, one figure per K.
sns.set_context("talk")
sns.set_style("white")

basename = "Aro_NGSadmix"

# Named colours picked from the seaborn palettes.
sns_deep_blue = sns.color_palette('deep').as_hex()[0]
sns_deep_green = sns.color_palette('deep').as_hex()[2]
sns_deep_red = sns.color_palette('deep').as_hex()[3]
sns_deep_lightblue = sns.color_palette('deep').as_hex()[9]
sns_muted_yellow = sns.color_palette('muted').as_hex()[8]

# Left-to-right order of the samples along the x axis.
sample_order = ['AR0001',
                'AR0002',
                'AR0003',
                'AR0004',
                'AR0005',
                'AR0006',
                'AR0007',
                'AR0008',
                'AR0009',
                'AR0014',
                'AR0015',
                'AR0016',
                'AR0017',
                'AR0010',
                'AR0018',
                'AR0019',
                'AR0020',
                'AR0021',
                'AR0011',
                'AR0012',
                'AR0013']

# For each K: the .qopt columns (1-based) and their colours, listed from the
# bottom of the stack to the top.  The column order differs between K values
# and is chosen per K together with the colours.
stack_layout = {
    3: [(1, 'orange'),
        (2, sns_deep_red),
        (3, sns_deep_green)],
    4: [(1, sns_deep_blue),
        (2, 'orange'),
        (4, sns_deep_red),
        (3, sns_deep_green)],
    5: [(3, 'orange'),
        (4, sns_deep_lightblue),
        (2, sns_deep_red),
        (1, sns_deep_green),
        (5, sns_deep_blue)],
    6: [(1, sns_muted_yellow),
        (6, 'orange'),
        (3, sns_deep_lightblue),
        (4, sns_deep_blue),
        (5, sns_deep_red),
        (2, sns_deep_green)],
}

for i in [3, 4, 5, 6]:
    plt.figure(figsize=(8, 2.5))

    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True.
    df_k = pd.read_csv(basename+"_K"+str(i)+".qopt",
                       header=None,
                       sep=r"\s+")
    df_k.columns = list(range(1, i+1))

    # Rows are matched to the sample table by position; the categorical
    # dtype makes sort_values follow sample_order instead of alphabetical
    # order.
    df_k['sample_name'] = pd.Categorical(df.sample_name, sample_order)
    df_k['label'] = df.label

    df_k_sort = df_k.sort_values(by='sample_name')

    r = np.arange(len(df_k_sort))

    # Stack the components bottom-up, carrying the running top of the stack
    # as the baseline of the next segment.
    layout = stack_layout.get(i)
    baseline = np.zeros(len(df_k_sort))
    if layout is not None:
        for component, colour in layout:
            heights = df_k_sort[component].values
            plt.bar(r, heights, 0.8, linewidth=0, color=colour,
                    align='center', bottom=baseline)
            baseline = baseline + heights
    else:
        # Fallback for a K without a hand-picked layout: natural column
        # order and matplotlib's default colours.
        for component in range(1, i+1):
            heights = df_k_sort[component].values
            plt.bar(r, heights, 0.8, bottom=baseline)
            baseline = baseline + heights

    # No x tick marks or labels on the individual panels.
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)

    for side in ('right', 'top', 'bottom'):
        plt.gca().spines[side].set_visible(False)

    plt.xlim(-0.8, len(df_k_sort)-0.5)
    plt.ylabel("K="+str(i), fontsize=16)
    plt.yticks((0.0, 0.5, 1.0), fontsize=12)

    plt.savefig("Aro_NGSadmix_K"+str(i)+".svg", bbox_inches='tight')

# Only the last panel gets the italic sample labels on its x axis; it is then
# saved again over the unlabeled version written inside the loop.  The raw
# string avoids the invalid "\i" escape sequence.
plt.xticks(r, [r"$\it{"+label+"}$" for label in df_k_sort['label']], fontsize=16, rotation=90)
plt.tick_params(
    axis='x',
    labelbottom=True)

plt.savefig("Aro_NGSadmix_K"+str(i)+".svg", bbox_inches='tight')

## Hybrid index and heterozygosity

In [None]:
# Read the table used for the hybrid-index / heterozygosity plot from Aro_Het.csv.
df_het=pd.read_csv("Aro_Het.csv")
# Bare expression: the notebook renders the whole table as the cell output.
df_het

In [None]:
# Hybrid index (x) against heterozygosity (y) for the first three rows of df_het.
sns.set_context('talk')
sns.set_style('whitegrid')

sns_deep_green = sns.color_palette('deep').as_hex()[2]

fig, ax = plt.subplots(figsize=(5, 5))

# One cross per sample: rows 0 and 1 share a colour, row 2 has its own.
for row, colour in enumerate(('chocolate', 'chocolate', 'seagreen')):
    ax.scatter(df_het['r_ste'][row], df_het['het'][row],
               color=colour, marker='x', linewidth=4.0, s=200, zorder=10)

# Fixed reference markers at hybrid index 0 and 1 with zero heterozygosity
# (presumably the two parental species -- confirm against the manuscript).
for x_ref, colour, shape in ((0.0, 'orange', 'o'),
                             (1.0, sns_deep_green, 's')):
    ax.scatter([x_ref], [0.0], color=colour, marker=shape, s=250, zorder=10)

# Shaded triangle with corners (0, 0), (0.5, 1) and (1, 0), drawn beneath the markers.
t1 = mpatches.Polygon([[0.0, 0.0], [0.5, 1.0], [1.0, 0.0]],
                      color='lightblue', linewidth=0, alpha=0.3, zorder=1)
ax.add_patch(t1)

# Both axes use the same tick positions.
unit_ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

plt.xlim(-0.1, 1.1)
plt.xticks(unit_ticks, fontsize=14)
plt.xlabel("Hybrid index", fontsize=18)

plt.ylim(-0.15, 1.15)
plt.yticks(unit_ticks, fontsize=14)
plt.ylabel("Heterozygosity", fontsize=18)
ax.set_aspect(0.9)

plt.savefig("Aro_Het.svg", bbox_inches='tight')