droplet deploy changes
Authored and committed by Rob on Nov 16, 2015
1 parent 90b0ad9 commit d829159
Showing 3 changed files with 46 additions and 11 deletions.
1 change: 1 addition & 0 deletions models/.gitignore
@@ -0,0 +1 @@
+*
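The lone `*` ignores everything under models/, presumably so the trained word2vec model written to models/ by the script below is never committed.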
4 changes: 1 addition & 3 deletions scripts/setup_venv.sh
@@ -1,6 +1,4 @@
-sudo apt-get -y install python3.4-dev libicu-dev mysql-client libmysqlclient-dev
-sudo apt-get -y install build-essential python3-dev libatlas-dev libatlas3gf-base
-sudo apt-get -y install gfortran libopenblas-dev liblapack-dev libhdf5-dev
+sudo apt-get -y install python3.4-dev libicu-dev build-essential libatlas-dev libatlas3gf-base gfortran libopenblas-dev liblapack-dev
 
 python3 -m venv venv/ --without-pip;
 source venv/bin/activate;
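The consolidated apt line drops the MySQL (mysql-client, libmysqlclient-dev) and HDF5 (libhdf5-dev) dev packages, along with the redundant python3-dev (python3.4-dev stays), leaving just the BLAS/LAPACK toolchain that numpy and scipy build against. A quick post-setup smoke test, purely a hypothetical sketch and not part of this commit, could be:

# smoke_test.py -- hypothetical check that the venv's numeric stack built correctly
import numpy
import scipy

numpy.__config__.show()                         # should report the ATLAS/OpenBLAS install
print(numpy.dot(numpy.eye(3), numpy.ones(3)))   # exercises a BLAS-backed routine
print('numpy', numpy.__version__, '| scipy', scipy.__version__)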
52 changes: 44 additions & 8 deletions src/mdsd2w2v.py
@@ -7,6 +7,7 @@
 from numpy.linalg import norm
 from scipy.spatial.distance import cosine
 import spacy.en
+from sklearn.externals import joblib
 nlp = spacy.en.English(load_vectors=False, entity=False)
 
 #### Corpus Generators ####
@@ -28,24 +29,59 @@ def tokenize_doc_map(doc):
     return doc
 
+
-def mdsd_corpus(mdsd_path):
-    p = mp.Pool(2)
+def mdsd_corpus(mdsd_path, p=None):
+    if p is None:
+        p = mp.Pool(mp.cpu_count())
     mdsd_files = os.listdir(mdsd_path)
     for mdsd_file in mdsd_files:
         logging.info('processing {}'.format(mdsd_file))
         f = open(mdsd_path+mdsd_file, 'rb')
-        for doc in p.imap(tokenize_doc_map, f.readlines()):
+        for doc in p.imap_unordered(tokenize_doc_map, f.readlines()):
             for sent in doc['toks']:
                 yield [w.lower() for w in sent]
 
 
+def identity(x):
+    return x
+
+
 if __name__ == '__main__':
+    w2v = False
+    lsi = True
+    tsne = False
+
     mdsd_path = 'data/Multi_Domain_Sentiment_Dataset/json_files/'
-    w2v_model_path = 'models/mdsd_w2v_model.txt'
     logging.basicConfig(filename='word2vec_training.log',
                         level=logging.INFO,
                         format='%(asctime)s %(levelname)s: %(message)s',
                         datefmt='%m/%d/%Y %H:%M:%S')
-    model_w2v = Word2Vec(min_count=5, workers=2)
-    model_w2v.build_vocab(mdsd_corpus(mdsd_path))
-    model_w2v.train(mdsd_corpus(mdsd_path))
-    model_w2v.save(w2v_model_path)
+
+    if w2v:
+        w2v_model_path = 'models/mdsd_w2v_model.txt'
+        model_w2v = Word2Vec(min_count=3, workers=mp.cpu_count())
+        model_w2v.build_vocab(mdsd_corpus(mdsd_path))  # pass 1: vocabulary scan
+        model_w2v.train(mdsd_corpus(mdsd_path))        # pass 2: fresh generator
+        model_w2v.save(w2v_model_path)
+        model_w2v.init_sims()  # syn0norm stays None until the normalized vectors are computed
+        joblib.dump(model_w2v.syn0norm, 'w2v_syn0norm.np')
+
+    if lsi:
+        p = mp.Pool(mp.cpu_count())
+        from sklearn.decomposition import TruncatedSVD
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        lsi = TruncatedSVD(n_components=400)
+        # identity tokenizer/preprocessor: the corpus already yields lowercased token lists
+        vec = TfidfVectorizer(tokenizer=identity,
+                              preprocessor=identity,
+                              ngram_range=(1,5), min_df=3)
+        # fit_transform (not fit) returns the doc-term matrix; transpose it to term-doc
+        term_doc_m = vec.fit_transform(mdsd_corpus(mdsd_path, p)).T
+        p.terminate()
+        term_doc_m_lsi = lsi.fit_transform(term_doc_m)  # fit_transform yields the reduced matrix
+        joblib.dump(term_doc_m_lsi, 'term_doc_m_lsi')
+        joblib.dump(term_doc_m, 'term_doc_m')
+        joblib.dump(vec, 'lsi_vec')
+        joblib.dump(lsi, 'lsi_model')
+
+    if tsne:
+        from sklearn.manifold import TSNE
+        from sklearn.externals import joblib
+        tsne = TSNE(n_components=2)
+        syn0norm = joblib.load('w2v_syn0norm.np')
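A subtlety the w2v branch relies on: mdsd_corpus returns a generator, and a generator is exhausted after a single pass, while Word2Vec needs two passes over the corpus (a vocabulary scan, then training). Reusing one generator would leave train() with an empty iterator, hence the two separate mdsd_corpus(...) calls. The same pattern as a standalone sketch, assuming the usual gensim import:

import multiprocessing as mp
from gensim.models import Word2Vec

mdsd_path = 'data/Multi_Domain_Sentiment_Dataset/json_files/'
model = Word2Vec(min_count=3, workers=mp.cpu_count())
model.build_vocab(mdsd_corpus(mdsd_path))  # pass 1 consumes this generator entirely
model.train(mdsd_corpus(mdsd_path))        # pass 2 gets its own fresh generator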

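In the lsi branch, TfidfVectorizer gets identity as both tokenizer and preprocessor because mdsd_corpus already yields lowercased token lists (and a module-level function, unlike a lambda, keeps the fitted vectorizer picklable for the joblib dump). After fit_transform, transposing gives a term-by-document matrix, so TruncatedSVD produces a 400-dimensional vector per term. One way the dumped artifacts might be queried later, with the most_similar helper being hypothetical:

import numpy as np
from sklearn.externals import joblib

vec = joblib.load('lsi_vec')
term_vecs = joblib.load('term_doc_m_lsi')        # shape: (n_terms, 400)

def most_similar(term, topn=5):
    # hypothetical helper: cosine-nearest neighbours in LSI space
    q = term_vecs[vec.vocabulary_[term]]
    sims = term_vecs.dot(q)
    sims /= np.linalg.norm(term_vecs, axis=1) * np.linalg.norm(q) + 1e-12
    inv = {i: t for t, i in vec.vocabulary_.items()}
    return [(inv[i], float(sims[i])) for i in np.argsort(-sims)[1:topn + 1]]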
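The tsne branch is still a stub in this commit: it builds the TSNE estimator and loads the normalized word vectors but never projects them. Running t-SNE over an entire vocabulary is slow, so a common follow-up, sketched here as an assumption rather than anything the commit does, is to project only the most frequent words:

from sklearn.manifold import TSNE
from sklearn.externals import joblib

syn0norm = joblib.load('w2v_syn0norm.np')
subset = syn0norm[:1000]     # gensim orders rows by descending word frequency
coords = TSNE(n_components=2).fit_transform(subset)   # (1000, 2) layout for plotting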