A web media content analysis framework.
The word list (`words.txt`) is taken from https://github.com/dwyl/english-words, which is in turn based on the word list from http://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable.
- JDK, Maven
- Python 3
- RabbitMQ
cd /path/to/coal/repository
mvn compile
cd /path/to/coal/repository
pip install -r python-worker/requirements.txt
sudo rabbitmq-server
cd /path/to/coal/repository
mvn exec:java@main
cd /path/to/coal/repository
mvn exec:java@downloadworker
cd /path/to/coal/repository
mvn exec:java@updateworker
cd /path/to/coal/repository
./python-worker/pdf_author_extraction_worker.py
cd /path/to/coal/repository
./python-worker/pdf_image_extraction_worker.py
cd /path/to/coal/repository
CLARIFAI_APP_ID=<APP-ID> CLARIFAI_APP_SECRET=<APP-SECRET> ./python-worker/clarifai_worker.py
cd /path/to/coal/repository
./python-worker/pdf_metadata_extraction_worker.py
cd /path/to/coal/repository
./python-worker/pdf_text_extraction_worker.py
cd /path/to/coal/repository
./python-worker/pdf_text_formatting_worker.py
cd /path/to/coal/repository
./python-worker/pdf_text_keyword_extraction_worker.py
cd /path/to/coal/repository
./python-worker/pdf_text_langdetect_worker.py
cd /path/to/coal/repository
./python-worker/pdf_text_named_entity_linking_worker.py