In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the tab-separated project table (no header row in the file) and key it
# by project accession.
df = pd.read_csv(
    'WGS_abstracts.txt',
    sep='\t',
    names=['Project ID', 'Title', 'Abstract', 'Runs'],
)
df = df.set_index('Project ID')

# Regex clean-up rules, applied in this exact order to every string cell.
cleanup_rules = [
    (r'^\s+$', 'None'),          # whitespace-only cell -> placeholder
    (r'none provided', 'None'),  # textual "none provided" -> same placeholder
    (r'â€˜', ''),                # presumably a mis-decoded opening curly quote - TODO confirm
    (r'â€™', ''),                # presumably a mis-decoded closing curly quote - TODO confirm
    (r'\.(?!\d)', ''),           # drop periods unless a digit follows (keeps e.g. "1.5")
    (r',', ''),
    (r'\(', ''),
    (r'\)', ''),
]
for pattern, replacement in cleanup_rules:
    df = df.replace(pattern, replacement, regex=True)

# Missing values become empty strings.
df = df.replace(np.nan, '')
print(df)

# Text corpus used by the vectorizer cells below.
abstracts = df['Abstract']

                                                        Title  \
Project ID                                                      
SRP099026                sludge metagenome Raw sequence reads   
SRP107999   Aqueous microbial communities from the Delawar...   
SRP080517   Grasslands soil microbial communities from the...   
SRP007817   Hoatzin crop microbiome epithelium fraction 12...   
ERP021394   Environmental alkali multi-extreme Diamante La...   
SRP096603   Active sludge microbial communities of municip...   
ERP006694                                      test sequences   
SRP100041   Marine microbial communities from the Southern...   
SRP006444   Community genomic analysis of an extremely aci...   
SRP104952   Marine viral communities from the Global Malas...   
SRP041163                  Bovine rumen microbiome Metagenome   
SRP099977   Freshwater microbial communities from Crystal ...   
SRP059473     Dechlorination Culture CG-3 and SG-1 Metagenome   
SRP081889   Freshwater mi

In [3]:
# Bag-of-words vectorizer over unigrams and bigrams: English stop words are
# removed and terms appearing in more than 25% of the documents are ignored.
#
# max_features caps the vocabulary at the 2000 most frequent terms. The
# recorded output of the transform cell is (5385, 2000) and every vocabulary
# index shown in the notebook is below 2000, which this cell could not
# reproduce without the cap.
# NOTE(review): confirm 2000 is the intended vocabulary size.
cvec = CountVectorizer(
    stop_words='english',
    min_df=1,
    max_df=.25,
    ngram_range=(1, 2),
    max_features=2000,
)

In [4]:
from itertools import islice

# Learn the vocabulary from the abstracts (fit() returns the vectorizer
# itself), then show the first 20 (term, column index) pairs of the mapping.
list(islice(cvec.fit(abstracts).vocabulary_.items(), 20))

[('isolation', 993),
 ('phages', 1355),
 ('water', 1967),
 ('biogeochemical', 239),
 ('cycling', 474),
 ('links', 1064),
 ('terrestrial', 1846),
 ('marine', 1099),
 ('systems', 1819),
 ('biogeochemical cycling', 241),
 ('cycling links', 477),
 ('links terrestrial', 1065),
 ('terrestrial marine', 1847),
 ('marine systems', 1104),
 ('lake', 1024),
 ('multi', 1231),
 ('extreme', 703),
 ('environment', 649),
 ('located', 1073),
 ('developing', 535)]

In [5]:
# Encode every abstract as a row of term counts using the fitted vocabulary.
cvec_counts = cvec.transform(raw_documents=abstracts)

# Matrix dimensions: (number of abstracts, number of vocabulary terms).
cvec_counts.shape

(5385, 2000)

In [6]:
# Total occurrences of each vocabulary term, summed over all abstracts
# (column sums of the count matrix, flattened to a plain list).
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older versions.
if hasattr(cvec, 'get_feature_names_out'):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()

counts_df = pd.DataFrame({'term': terms, 'occurrences': occ})

# The 20 most frequent terms.
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
359,communities,1182
1650,sequencing,1085
374,community,891
1762,study,876
1136,metagenomic,848
1180,microbial communities,809
1605,samples,770
1702,soil,672
284,carbon,613
569,dna,609


In [None]:
# Learn the IDF weights from the raw count matrix, then apply them to the
# same matrix to obtain the TF-IDF representation of every abstract.
transformer = TfidfTransformer()
transformer.fit(cvec_counts)
transformed_weights = transformer.transform(cvec_counts)

In [13]:
# Mean TF-IDF weight of each vocabulary term across all abstracts
# (column means of the TF-IDF matrix, flattened to a plain list).
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older versions.
if hasattr(cvec, 'get_feature_names_out'):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()

weights_df = pd.DataFrame({'term': terms, 'weight': weights})

# The 20 terms with the highest average weight.
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
359,communities,0.024028
1650,sequencing,0.021107
1180,microbial communities,0.019875
1136,metagenomic,0.018313
1762,study,0.017185
374,community,0.017013
1702,soil,0.016432
1131,metagenome,0.016332
1605,samples,0.013851
128,analysis,0.013447


In [None]:
# PCA through an SVD of the mean-centred TF-IDF matrix.
x_train = transformed_weights
mu = x_train.mean(axis=0)      # mean of every column (term) over all rows

# Centre once and reuse the result for both the SVD and the projection.
x_centered = x_train - mu

# np.linalg.svd returns the right singular vectors already transposed, so the
# rows of V are the principal directions.
U, s, V = np.linalg.svd(x_centered, full_matrices=False)

# Scores: coordinates of every row in the principal-direction basis.
Zpca = np.dot(x_centered, V.T)

# Reconstruction from the first two components only.
Rpca = np.dot(Zpca[:, :2], V[:2, :]) + mu

In [None]:
# Mean squared error of the two-component reconstruction: the sum of squared
# residuals divided by the number of rows and then by the number of columns.
n_rows, n_cols = Rpca.shape
residual = np.asarray(x_train - Rpca)
err = np.sum(residual ** 2) / n_rows / n_cols
print(err)

In [None]:
# Scatter plot of the scores on the first two principal components.
# np.asarray(...).ravel() yields flat 1-D coordinates whether Zpca is an
# ndarray or an np.matrix, instead of wrapping the slices in one-element lists
# and relying on implicit flattening.
# NOTE(review): only the first 2000 rows of Zpca are plotted - confirm that
# this subset is intentional.
pc1 = np.asarray(Zpca[:2000, 0]).ravel()
pc2 = np.asarray(Zpca[:2000, 1]).ravel()

fig, ax = plt.subplots()
ax.scatter(pc1, pc2)
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_title('Abstracts projected onto the first two principal components')
plt.show()