In [1]:
from itertools import islice

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the tab-separated project table. Column names are supplied explicitly
# via `names`, and the project identifier becomes the row index.
df = pd.read_csv(
    'WGS_abstracts.txt',
    sep='\t',
    names=['Project ID', 'Title', 'Abstract', 'Runs'],
).set_index('Project ID')

# Regex clean-up passes. They run in exactly this order, each one over the
# whole frame, rewriting matching substrings of string cells.
cleanup_passes = [
    (r'^\s+$', 'None'),           # whitespace-only cell -> 'None' placeholder
    (r'none provided', 'None'),   # literal "none provided" -> same placeholder
    # NOTE(review): the next two patterns look like mojibake of curly single
    # quotes (UTF-8 decoded with a legacy codec) -- confirm against the raw file.
    (r'â€˜', ''),
    (r'â€™', ''),
    (r'\.(?!\d)', ''),            # drop periods unless a digit follows directly
    (r',', ''),                   # drop commas
    (r'\(', ''),                  # drop opening parentheses
    (r'\)', ''),                  # drop closing parentheses
]
for pattern, replacement in cleanup_passes:
    df = df.replace(pattern, replacement, regex=True)

# Missing values become empty strings.
df = df.replace(np.nan, '')

# Text column that the vectoriser is fitted on further down.
abstracts = df['Abstract']

In [3]:
# Bag-of-words vectoriser: English stop words removed, unigrams and bigrams
# extracted. min_df / max_df are given as floats, which scikit-learn treats as
# document-frequency proportions (keep terms found in at least 0.25% and at
# most 50% of the documents).
cvec = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=0.0025,
    max_df=.5,
)

In [4]:
# Learn the vocabulary from the abstracts, then show 20 of the resulting
# (term, column index) pairs. `islice` is imported in the notebook's import
# cell so that every dependency is declared in one place.
cvec.fit(abstracts)
list(islice(cvec.vocabulary_.items(), 20))

[('isolation', 900),
 ('water', 1791),
 ('biogeochemical', 209),
 ('cycling', 426),
 ('links', 961),
 ('terrestrial', 1680),
 ('marine', 991),
 ('systems', 1654),
 ('biogeochemical cycling', 211),
 ('cycling links', 429),
 ('links terrestrial', 962),
 ('terrestrial marine', 1681),
 ('marine systems', 996),
 ('lake', 922),
 ('multi', 1112),
 ('extreme', 637),
 ('environment', 584),
 ('located', 970),
 ('developing', 483),
 ('red', 1349)]

In [5]:
# Encode every abstract as a row of term counts over the fitted vocabulary.
cvec_counts = cvec.transform(abstracts)

# Number of columns, i.e. how many terms the count matrix has.
cvec_counts.shape[-1]

1819

In [6]:
# Total number of occurrences of each vocabulary term, summed over all rows of
# the count matrix and flattened to a plain Python list.
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# accessor only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per term; show the 20 most frequent terms.
counts_df = pd.DataFrame({'term': feature_names, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
1067,microbial,1878
316,communities,1182
1494,sequencing,1085
332,community,891
1599,study,876
1025,metagenomic,848
1069,microbial communities,809
1454,samples,770
1543,soil,672
251,carbon,613


In [7]:
# Re-weight the raw term counts with TF-IDF, using TfidfTransformer's default
# settings: fit the weighting on the count matrix, then apply it to the same
# matrix.
transformer = TfidfTransformer()
transformed_weights = transformer.fit(cvec_counts).transform(cvec_counts)

# (rows, columns) of the weighted matrix.
transformed_weights.shape

(5385, 1819)

In [8]:
# Mean TF-IDF weight of each vocabulary term, averaged over all rows of the
# weighted matrix and flattened to a plain Python list.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# accessor only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per term; show the 20 terms with the highest mean weight.
weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
1067,microbial,0.030555
316,communities,0.023838
1494,sequencing,0.021228
1069,microbial communities,0.019564
1025,metagenomic,0.018494
1599,study,0.017496
332,community,0.017208
1543,soil,0.016518
1020,metagenome,0.01648
1454,samples,0.014354


In [9]:
# PCA by hand: centre the TF-IDF matrix, take its SVD, and project the centred
# data onto the right singular vectors.
x_train = transformed_weights
mu = x_train.mean(axis=0)   # per-term (column) mean

# The centred matrix is built once and reused for both the SVD and the
# projection instead of being recomputed for each.
# NOTE(review): x_train is presumably a SciPy sparse matrix (TfidfTransformer
# output), in which case this subtraction yields a dense rows x terms matrix --
# confirm it fits comfortably in memory for larger corpora.
x_centered = x_train - mu

# np.linalg.svd returns the right singular vectors already transposed, so the
# ROWS of V are the principal axes (V is "V transposed" in textbook notation).
U,s,V = np.linalg.svd(x_centered, full_matrices=False)
Zpca = np.dot(x_centered, V.transpose())   # coordinates of every row on every component

Rpca = np.dot(Zpca[:,:2], V[:2,:]) + mu    # reconstruction from the first two components

In [10]:
# Mean squared reconstruction error per matrix entry: the sum of squared
# differences between the data and its two-component reconstruction, divided
# by the row count and then by the column count.
residual = np.asarray(x_train - Rpca)
n_rows, n_cols = Rpca.shape
err = np.sum(residual ** 2) / n_rows / n_cols
print(err)

0.0003608945813434699


In [None]:
# Scatter the first 2000 rows (abstracts) in the plane of the first two
# principal components. np.asarray gives a plain 2-D array whatever array type
# Zpca is, so each column slice is 1-D and can be passed to scatter directly.
scores = np.asarray(Zpca)[:2000, :2]

fig, ax = plt.subplots()
ax.scatter(scores[:, 0], scores[:, 1])
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('First 2000 abstracts projected onto the first two principal components')
plt.show()