diff --git a/.circleci/config.yml b/.circleci/config.yml index 0ae446e..e341f0e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,28 +1,37 @@ version: 2 jobs: - build: - machine: - image: ubuntu-1604:201903-01 - docker_layer_caching: true + test: + docker: + - image: preethamkamidi/verifytweet-base:latest steps: - checkout + - restore_cache: + key: v1-py-cache-{{ .Branch }}-{{ checksum "requirements-dev.txt" }}-{{ checksum "requirements.txt" }} - run: - name: Install Dependencies + name: Setup venv and install requirements command: | - echo 'export TAG=$(grep -oE "\"(.*?)\"" verifytweet/__init__.py)' >> $BASH_ENV - echo 'export TAG=${TAG:1:5}' >> $BASH_ENV - echo 'export IMAGE_NAME=verifytweet' >> $BASH_ENV + python3 -m venv ~/.venv + echo ". ~/.venv/bin/activate" >> $BASH_ENV + source $BASH_ENV + pip install -r requirements.txt -r requirements-dev.txt + - save_cache: + name: Save Python dependencies cache + key: v1-py-cache-{{ .Branch }}-{{ checksum "requirements-dev.txt" }}-{{ checksum "requirements.txt" }} + paths: + - ~/.venv - run: - name: Build and push Docker image - command: | - docker build -t preethamkamidi/$IMAGE_NAME:$TAG . 
- echo $DOCKER_PWD | docker login -u $DOCKER_USER --password-stdin - docker push preethamkamidi/$IMAGE_NAME:$TAG + name: Run tests + command: pytest + workflows: version: 2 - build_and_push: + build_and_test: jobs: - - build: + - test: filters: tags: only: /^v.*/ + branches: + only: + - master + - develop diff --git a/Pipfile b/Pipfile index ea149f9..4b04c7d 100644 --- a/Pipfile +++ b/Pipfile @@ -9,6 +9,10 @@ yapf = "*" sphinx = "*" pytest = "*" twine = "*" +bandit = "*" +hypothesis = "*" +coverage = "*" +pytest-cov = "*" [packages] certifi = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 48b5dee..b9498ce 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b2689adf5ee4eeef3aa7a0642af1af465111e5286ea5721d236e971d476984fc" + "sha256": "903fdfd11ad5a79834909d89a568183647da3179b13bf03ab550b2425e4513ed" }, "pipfile-spec": 6, "requires": { @@ -211,11 +211,11 @@ }, "flask": { "hashes": [ - "sha256:ad7c6d841e64296b962296c2c2dabc6543752985727af86a975072dea984b6f3", - "sha256:e7d32475d1de5facaa55e3958bc4ec66d3762076b074296aa50ef8fdc5b9df61" + "sha256:a31adc27de06034c657a8dc091cc5fcb0227f2474798409bff0e9674de31a026", + "sha256:b5ae63812021cb04174fcff05d560a98387a44d9cccd4652a2bfa131ba4e4c9b" ], "index": "pypi", - "version": "==1.0.3" + "version": "==1.1.0" }, "flask-cors": { "hashes": [ @@ -437,10 +437,10 @@ }, "nltk": { "hashes": [ - "sha256:12d7129aea0972840419499411d3aa815c6ad66336a51131e120d35a25d953b2" + "sha256:764c20a5f8532a681c261af3c7d1a54768a35df6f3603df75e615cbd34e47cb5" ], "index": "pypi", - "version": "==3.4.3" + "version": "==3.4.4" }, "numpy": { "hashes": [ @@ -498,35 +498,35 @@ }, "pillow": { "hashes": [ - "sha256:15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", - "sha256:1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", - "sha256:1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", - "sha256:1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", - 
"sha256:24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", - "sha256:258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", - "sha256:33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", - "sha256:367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", - "sha256:3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", - "sha256:44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", - "sha256:46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", - "sha256:74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", - "sha256:809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", - "sha256:85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", - "sha256:8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", - "sha256:9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", - "sha256:9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", - "sha256:9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", - "sha256:b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", - "sha256:c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", - "sha256:c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", - "sha256:cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", - "sha256:cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", - "sha256:dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", - "sha256:de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", - "sha256:e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062" + "sha256:0804f77cb1e9b6dbd37601cee11283bba39a8d44b9ddb053400c58e0c0d7d9de", + "sha256:0ab7c5b5d04691bcbd570658667dd1e21ca311c62dcfd315ad2255b1cd37f64f", + "sha256:0b3e6cf3ea1f8cecd625f1420b931c83ce74f00c29a0ff1ce4385f99900ac7c4", + "sha256:365c06a45712cd723ec16fa4ceb32ce46ad201eb7bbf6d3c16b063c72b61a3ed", 
+ "sha256:38301fbc0af865baa4752ddae1bb3cbb24b3d8f221bf2850aad96b243306fa03", + "sha256:3aef1af1a91798536bbab35d70d35750bd2884f0832c88aeb2499aa2d1ed4992", + "sha256:3fe0ab49537d9330c9bba7f16a5f8b02da615b5c809cdf7124f356a0f182eccd", + "sha256:45a619d5c1915957449264c81c008934452e3fd3604e36809212300b2a4dab68", + "sha256:49f90f147883a0c3778fd29d3eb169d56416f25758d0f66775db9184debc8010", + "sha256:571b5a758baf1cb6a04233fb23d6cf1ca60b31f9f641b1700bfaab1194020555", + "sha256:5ac381e8b1259925287ccc5a87d9cf6322a2dc88ae28a97fe3e196385288413f", + "sha256:6153db744a743c0c8c91b8e3b9d40e0b13a5d31dbf8a12748c6d9bfd3ddc01ad", + "sha256:6fd63afd14a16f5d6b408f623cc2142917a1f92855f0df997e09a49f0341be8a", + "sha256:70acbcaba2a638923c2d337e0edea210505708d7859b87c2bd81e8f9902ae826", + "sha256:70b1594d56ed32d56ed21a7fbb2a5c6fd7446cdb7b21e749c9791eac3a64d9e4", + "sha256:76638865c83b1bb33bcac2a61ce4d13c17dba2204969dedb9ab60ef62bede686", + "sha256:7b2ec162c87fc496aa568258ac88631a2ce0acfe681a9af40842fc55deaedc99", + "sha256:7cee2cef07c8d76894ebefc54e4bb707dfc7f258ad155bd61d87f6cd487a70ff", + "sha256:7d16d4498f8b374fc625c4037742fbdd7f9ac383fd50b06f4df00c81ef60e829", + "sha256:b50bc1780681b127e28f0075dfb81d6135c3a293e0c1d0211133c75e2179b6c0", + "sha256:bd0582f831ad5bcad6ca001deba4568573a4675437db17c4031939156ff339fa", + "sha256:cfd40d8a4b59f7567620410f966bb1f32dc555b2b19f82a91b147fac296f645c", + "sha256:e3ae410089de680e8f84c68b755b42bc42c0ceb8c03dbea88a5099747091d38e", + "sha256:e9046e559c299b395b39ac7dbf16005308821c2f24a63cae2ab173bd6aa11616", + "sha256:ef6be704ae2bc8ad0ebc5cb850ee9139493b0fc4e81abcc240fb392a63ebc808", + "sha256:f8dc19d92896558f9c4317ee365729ead9d7bbcf2052a9a19a3ef17abbb8ac5b" ], "index": "pypi", - "version": "==6.0.0" + "version": "==6.1.0" }, "pycares": { "hashes": [ @@ -725,7 +725,7 @@ "twint": { "editable": true, "git": "https://github.com/twintproject/twint.git", - "ref": "c5c6f1d60554cd0ee64ba223850b070553a17e74" + "ref": "ad27650fbc0bf8c3f2c78449088a5ede7239f53a" }, 
"typing": { "hashes": [ @@ -822,6 +822,14 @@ ], "version": "==2.7.0" }, + "bandit": { + "hashes": [ + "sha256:336620e220cf2d3115877685e264477ff9d9abaeb0afe3dc7264f55fa17a3952", + "sha256:41e75315853507aa145d62a78a2a6c5e3240fe14ee7c601459d0df9418196065" + ], + "index": "pypi", + "version": "==1.6.2" + }, "bleach": { "hashes": [ "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", @@ -845,6 +853,43 @@ "index": "pypi", "version": "==3.0.4" }, + "coverage": { + "hashes": [ + "sha256:3684fabf6b87a369017756b551cef29e505cb155ddb892a7a29277b978da88b9", + "sha256:39e088da9b284f1bd17c750ac672103779f7954ce6125fd4382134ac8d152d74", + "sha256:3c205bc11cc4fcc57b761c2da73b9b72a59f8d5ca89979afb0c1c6f9e53c7390", + "sha256:465ce53a8c0f3a7950dfb836438442f833cf6663d407f37d8c52fe7b6e56d7e8", + "sha256:48020e343fc40f72a442c8a1334284620f81295256a6b6ca6d8aa1350c763bbe", + "sha256:5296fc86ab612ec12394565c500b412a43b328b3907c0d14358950d06fd83baf", + "sha256:5f61bed2f7d9b6a9ab935150a6b23d7f84b8055524e7be7715b6513f3328138e", + "sha256:68a43a9f9f83693ce0414d17e019daee7ab3f7113a70c79a3dd4c2f704e4d741", + "sha256:6b8033d47fe22506856fe450470ccb1d8ba1ffb8463494a15cfc96392a288c09", + "sha256:7ad7536066b28863e5835e8cfeaa794b7fe352d99a8cded9f43d1161be8e9fbd", + "sha256:7bacb89ccf4bedb30b277e96e4cc68cd1369ca6841bde7b005191b54d3dd1034", + "sha256:839dc7c36501254e14331bcb98b27002aa415e4af7ea039d9009409b9d2d5420", + "sha256:8f9a95b66969cdea53ec992ecea5406c5bd99c9221f539bca1e8406b200ae98c", + "sha256:932c03d2d565f75961ba1d3cec41ddde00e162c5b46d03f7423edcb807734eab", + "sha256:988529edadc49039d205e0aa6ce049c5ccda4acb2d6c3c5c550c17e8c02c05ba", + "sha256:998d7e73548fe395eeb294495a04d38942edb66d1fa61eb70418871bc621227e", + "sha256:9de60893fb447d1e797f6bf08fdf0dbcda0c1e34c1b06c92bd3a363c0ea8c609", + "sha256:9e80d45d0c7fcee54e22771db7f1b0b126fb4a6c0a2e5afa72f66827207ff2f2", + "sha256:a545a3dfe5082dc8e8c3eb7f8a2cf4f2870902ff1860bd99b6198cfd1f9d1f49", + 
"sha256:a5d8f29e5ec661143621a8f4de51adfb300d7a476224156a39a392254f70687b", + "sha256:aca06bfba4759bbdb09bf52ebb15ae20268ee1f6747417837926fae990ebc41d", + "sha256:bb23b7a6fd666e551a3094ab896a57809e010059540ad20acbeec03a154224ce", + "sha256:bfd1d0ae7e292105f29d7deaa9d8f2916ed8553ab9d5f39ec65bcf5deadff3f9", + "sha256:c62ca0a38958f541a73cf86acdab020c2091631c137bd359c4f5bddde7b75fd4", + "sha256:c709d8bda72cf4cd348ccec2a4881f2c5848fd72903c185f363d361b2737f773", + "sha256:c968a6aa7e0b56ecbd28531ddf439c2ec103610d3e2bf3b75b813304f8cb7723", + "sha256:df785d8cb80539d0b55fd47183264b7002077859028dfe3070cf6359bf8b2d9c", + "sha256:f406628ca51e0ae90ae76ea8398677a921b36f0bd71aab2099dfed08abd0322f", + "sha256:f46087bbd95ebae244a0eda01a618aff11ec7a069b15a3ef8f6b520db523dcf1", + "sha256:f8019c5279eb32360ca03e9fac40a12667715546eed5c5eb59eb381f2f501260", + "sha256:fc5f4d209733750afd2714e9109816a29500718b32dd9a5db01c0cb3a019b96a" + ], + "index": "pypi", + "version": "==4.5.3" + }, "docutils": { "hashes": [ "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", @@ -853,6 +898,28 @@ ], "version": "==0.14" }, + "gitdb2": { + "hashes": [ + "sha256:83361131a1836661a155172932a13c08bda2db3674e4caa32368aa6eb02f38c2", + "sha256:e3a0141c5f2a3f635c7209d56c496ebe1ad35da82fe4d3ec4aaa36278d70648a" + ], + "version": "==2.0.5" + }, + "gitpython": { + "hashes": [ + "sha256:563221e5a44369c6b79172f455584c9ebbb122a13368cc82cb4b5addff788f82", + "sha256:8237dc5bfd6f1366abeee5624111b9d6879393d84745a507de0fda86043b65a8" + ], + "version": "==2.1.11" + }, + "hypothesis": { + "hashes": [ + "sha256:936cdfd8c4db60c0d86bd57c9381e59c3c2b73bc00796f13d2e29af71513d77c", + "sha256:ad2797130be83ff374c1ed2781fb591b4152ae28abda28dd57b2a84a3fc1f5d4" + ], + "index": "pypi", + "version": "==4.26.4" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -931,6 +998,13 @@ ], "version": "==19.0" }, + "pbr": { + "hashes": [ + 
"sha256:36ebd78196e8c9588c972f5571230a059ff83783fabbbbedecc07be263ccd7e6", + "sha256:5a03f59455ad54f01a94c15829b8b70065462b7bd8d5d7e983306b59127fc841" + ], + "version": "==5.4.0" + }, "pkginfo": { "hashes": [ "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb", @@ -977,11 +1051,19 @@ }, "pytest": { "hashes": [ - "sha256:2878de8ae1c79a62c012da6186b88ff0562ea96ce29c4208d2a9b11d9f607df1", - "sha256:95b700cf21ed5b7e91bce7a6b5a573b2e3ef7b3643d00f681d8f9c4672f9fbdf" + "sha256:6ef6d06de77ce2961156013e9dff62f1b2688aa04d0dc244299fe7d67e09370d", + "sha256:a736fed91c12681a7b34617c8fcefe39ea04599ca72c608751c31d89579a3f77" ], "index": "pypi", - "version": "==5.0.0" + "version": "==5.0.1" + }, + "pytest-cov": { + "hashes": [ + "sha256:2b097cde81a302e1047331b48cadacf23577e431b61e9c6f49a1170bbe3d3da6", + "sha256:e00ea4fdde970725482f1f35630d12f074e121a23801aabf2ae154ec6bdd343a" + ], + "index": "pypi", + "version": "==2.7.1" }, "pytz": { "hashes": [ @@ -991,6 +1073,22 @@ "index": "pypi", "version": "==2019.1" }, + "pyyaml": { + "hashes": [ + "sha256:57acc1d8533cbe51f6662a55434f0dbecfa2b9eaf115bede8f6fd00115a0c0d3", + "sha256:588c94b3d16b76cfed8e0be54932e5729cc185caffaa5a451e7ad2f7ed8b4043", + "sha256:68c8dd247f29f9a0d09375c9c6b8fdc64b60810ebf07ba4cdd64ceee3a58c7b7", + "sha256:70d9818f1c9cd5c48bb87804f2efc8692f1023dac7f1a1a5c61d454043c1d265", + "sha256:86a93cccd50f8c125286e637328ff4eef108400dd7089b46a7be3445eecfa391", + "sha256:a0f329125a926876f647c9fa0ef32801587a12328b4a3c741270464e3e4fa778", + "sha256:a3c252ab0fa1bb0d5a3f6449a4826732f3eb6c0270925548cac342bc9b22c225", + "sha256:b4bb4d3f5e232425e25dda21c070ce05168a786ac9eda43768ab7f3ac2770955", + "sha256:cd0618c5ba5bda5f4039b9398bb7fb6a317bb8298218c3de25c47c4740e4b95e", + "sha256:ceacb9e5f8474dcf45b940578591c7f3d960e82f926c707788a570b51ba59190", + "sha256:fe6a88094b64132c4bb3b631412e90032e8cfe9745a58370462240b8cb7553cd" + ], + "version": "==5.1.1" + }, "readme-renderer": { "hashes": [ 
"sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", @@ -1021,6 +1119,13 @@ "index": "pypi", "version": "==1.12.0" }, + "smmap2": { + "hashes": [ + "sha256:0555a7bf4df71d1ef4218e4807bbf9b201f910174e6e08af2e138d4e517b4dde", + "sha256:29a9ffa0497e7f2be94ca0ed1ca1aa3cd4cf25a1f6b4f5f87f74b46ed91d609a" + ], + "version": "==2.0.5" + }, "snowballstemmer": { "hashes": [ "sha256:9f3b9ffe0809d174f7047e121431acf99c89a7040f0ca84f94ba53a498e6d0c9" @@ -1077,6 +1182,13 @@ ], "version": "==1.1.3" }, + "stevedore": { + "hashes": [ + "sha256:7be098ff53d87f23d798a7ce7ae5c31f094f3deb92ba18059b1aeb1ca9fec0a0", + "sha256:7d1ce610a87d26f53c087da61f06f9b7f7e552efad2a7f6d2322632b5f932ea2" + ], + "version": "==1.30.1" + }, "tqdm": { "hashes": [ "sha256:14a285392c32b6f8222ecfbcd217838f88e11630affe9006cd0e94c7eff3cb61", @@ -1124,10 +1236,10 @@ }, "zipp": { "hashes": [ - "sha256:8c1019c6aad13642199fbe458275ad6a84907634cc9f0989877ccc4a2840139d", - "sha256:ca943a7e809cc12257001ccfb99e3563da9af99d52f261725e96dfe0f9275bc3" + "sha256:4970c3758f4e89a7857a973b1e2a5d75bcdc47794442f2e2dd4fe8e0466e809a", + "sha256:8a5712cfd3bb4248015eb3b0b3c54a5f6ee3f2425963ef2a0125b8bc40aafaec" ], - "version": "==0.5.1" + "version": "==0.5.2" } } } diff --git a/ext/CurrentDataFlow.svg b/ext/CurrentDataFlow.svg deleted file mode 100644 index aef6d90..0000000 --- a/ext/CurrentDataFlow.svg +++ /dev/null @@ -1,2 +0,0 @@ - -
Gunicorn+Flask
Gunicorn+Flask
POST request
Form Data:
data: fileobj/string
type: image/link

[Not supported by viewer]
Tesseract + pytesseract
(Image processor service)
[Not supported by viewer]
Save to Disk
Save to Disk
Regex parsing
(NLP service)
[Not supported by viewer]
Extracted
Text from Image
[Not supported by viewer]
Search Service
Search Service
Parsed tweet text, username, datetime
Parsed tweet text, username, datetime
Scikit cosine similarity
(NLP service)
[Not supported by viewer]
Same day tweets
Same day tweets
Result
Result

Current Data Flow

<h1><span style="font-weight: normal"><font style="font-size: 55px">Current Data Flow</font></span></h1>
\ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..3db5e22 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,54 @@ +-i https://pypi.org/simple +alabaster==0.7.12 +atomicwrites==1.3.0 +attrs==19.1.0 +autopep8==1.4.4 +babel==2.7.0 +bandit==1.6.2 +bleach==3.1.0 +certifi==2019.6.16 +chardet==3.0.4 +coverage==4.5.3 +docutils==0.14 +gitdb2==2.0.5 +gitpython==2.1.11 +hypothesis==4.26.4 +idna==2.8 +imagesize==1.1.0 +importlib-metadata==0.18 +jinja2==2.10.1 +markupsafe==1.1.1 +more-itertools==7.1.0 +packaging==19.0 +pbr==5.4.0 +pkginfo==1.5.0.1 +pluggy==0.12.0 +py==1.8.0 +pycodestyle==2.5.0 +pygments==2.4.2 +pyparsing==2.4.0 +pytest-cov==2.7.1 +pytest==5.0.1 +pytz==2019.1 +pyyaml==5.1.1 +readme-renderer==24.0 +requests-toolbelt==0.9.1 +requests==2.22.0 +six==1.12.0 +smmap2==2.0.5 +snowballstemmer==1.9.0 +sphinx==2.1.2 +sphinxcontrib-applehelp==1.0.1 +sphinxcontrib-devhelp==1.0.1 +sphinxcontrib-htmlhelp==1.0.2 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.2 +sphinxcontrib-serializinghtml==1.1.3 +stevedore==1.30.1 +tqdm==4.32.2 +twine==1.13.0 +urllib3==1.25.3 +wcwidth==0.1.7 +webencodings==0.5.1 +yapf==0.27.0 +zipp==0.5.2 diff --git a/requirements.txt b/requirements.txt index 05a741d..3e9c88b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,91 +1,62 @@ +-i https://pypi.org/simple +-e git+https://github.com/twintproject/twint.git@ad27650fbc0bf8c3f2c78449088a5ede7239f53a#egg=twint aiodns==2.0.0 -aiohttp==3.5.4 aiohttp-socks==0.2.2 -alabaster==0.7.12 +aiohttp==3.5.4 async-timeout==3.0.1 -atomicwrites==1.3.0 attrs==19.1.0 -autopep8==1.4.4 -Babel==2.7.0 beautifulsoup4==4.7.1 -bleach==3.1.0 cchardet==2.1.4 certifi==2019.6.16 cffi==1.12.3 chardet==3.0.4 -Click==7.0 +click==7.0 cycler==0.10.0 decorator==4.4.0 dnspython==1.16.0 -docutils==0.14 elasticsearch==7.0.2 eventlet==0.25.0 fake-useragent==0.1.11 -Flask==1.0.3 -Flask-Cors==3.0.8 +flask-cors==3.0.8 +flask==1.1.0 geographiclib==1.49 
geopy==1.20.0 greenlet==0.4.15 gunicorn==19.9.0 +idna-ssl==1.1.0 ; python_version < '3.7' idna==2.8 -idna-ssl==1.1.0 imageio==2.5.0 -imagesize==1.1.0 -importlib-metadata==0.18 itsdangerous==1.1.0 -Jinja2==2.10.1 +jinja2==2.10.1 joblib==0.13.2 kiwisolver==1.1.0 -MarkupSafe==1.1.1 +markupsafe==1.1.1 monotonic==1.5 -more-itertools==7.1.0 multidict==4.5.2 networkx==2.3 -nltk==3.4.3 +nltk==3.4.4 numpy==1.16.4 -packaging==19.0 pandas==0.24.2 -Pillow==6.0.0 -pkginfo==1.5.0.1 -pluggy==0.12.0 -py==1.8.0 +pillow==6.1.0 pycares==3.0.0 pycodestyle==2.5.0 pycparser==2.19 -Pygments==2.4.2 pyparsing==2.4.0 -PySocks==1.7.0 +pysocks==1.7.0 pytesseract==0.2.7 -pytest==5.0.0 python-dateutil==2.8.0 pytz==2019.1 -PyWavelets==1.0.3 -readme-renderer==24.0 +pywavelets==1.0.3 regex==2019.6.8 requests==2.22.0 -requests-toolbelt==0.9.1 schedule==0.6.0 scikit-learn==0.21.2 scipy==1.3.0 six==1.12.0 -snowballstemmer==1.9.0 soupsieve==1.9.2 -Sphinx==2.1.2 -sphinxcontrib-applehelp==1.0.1 -sphinxcontrib-devhelp==1.0.1 -sphinxcontrib-htmlhelp==1.0.2 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.2 -sphinxcontrib-serializinghtml==1.1.3 -tqdm==4.32.2 -twine==1.13.0 --e git+https://github.com/twintproject/twint.git@c5c6f1d60554cd0ee64ba223850b070553a17e74#egg=twint -typing==3.7.4 -typing-extensions==3.7.4 +typing-extensions==3.7.4 ; python_version < '3.7' +typing==3.7.4 ; python_version < '3.7' urllib3==1.25.3 -wcwidth==0.1.7 -webencodings==0.5.1 -Werkzeug==0.15.4 +werkzeug==0.15.4 yapf==0.27.0 yarl==1.3.0 -zipp==0.5.1 diff --git a/setup.py b/setup.py index 758caec..8899294 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ install_requires=[ "click>=5.1", "Pillow==6.0.0", "pytesseract==0.2.6", "requests==2.22.0", "scikit-learn==0.21.2", "nltk>=3.4.3", - "python-dateutil==2.8.0", + "python-dateutil==2.8.0", "werkzeug==0.15.4", "twint @ git+https://github.com/twintproject/twint.git" ], entry_points={ diff --git a/temp_profile b/temp_profile deleted file mode 100644 index 6687b81..0000000 
--- a/temp_profile +++ /dev/null @@ -1,2 +0,0 @@ -export TAG=$(grep -oE "\"(.*?)\"" verifytweet/__init__.py) -export TAG=${TAG:1:5} diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..9ea8087 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,22 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. +# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import os +import sys +sys.path.insert(0, + os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..df0d479 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,38 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. +# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+import os
+
+
+@pytest.fixture
+def file_path():
+    """Absolute path (relative to the CWD) of the sample screenshot."""
+    return os.path.abspath('./tests/static/real-tweet.png')
+
+
+@pytest.fixture
+def tweet_data():
+    """Tweet column (index 10) of tests/static/tweets.csv, minus header."""
+    import csv
+
+    # tweets.csv contains emoji: decode as UTF-8, not the platform default.
+    with open(os.path.abspath('./tests/static/tweets.csv'),
+              newline='', encoding='utf-8') as csvfile:
+        tweet_list = [row[10] for row in csv.reader(csvfile)]
+    return tweet_list[1:]
diff --git a/tests/static/real-tweet.png b/tests/static/real-tweet.png
new file mode 100644
index 0000000..9a3eaaf
Binary files /dev/null and b/tests/static/real-tweet.png differ
diff --git a/tests/static/tweets.csv b/tests/static/tweets.csv
new file mode 100644
index 0000000..84b8a59
--- /dev/null
+++ b/tests/static/tweets.csv
@@ -0,0 +1,6 @@
+id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,user_rt_id,near,geo
+1147438570430296064,1147437584450301952,1562405648000,2019-07-06,15:04:08,IST,44196397,elonmusk,Elon Musk,,Sharknado is real,['discovermag'],[],[],178,708,9985,[],[],https://twitter.com/elonmusk/status/1147438570430296064,,,0,,,
+1147436241501085698,1147434181082800129,1562405092000,2019-07-06,14:54:52,IST,44196397,elonmusk,Elon Musk,,When sheets come loose all dignity is lost,['daddydiaz3'],[],[],74,265,6624,[],[],https://twitter.com/elonmusk/status/1147436241501085698,,,0,,,
+1147434859217870848,1147434859217870848,1562404763000,2019-07-06,14:49:23,IST,44196397,elonmusk,Elon Musk,,Model X as it should be shown https://www.instagram.com/p/BzhWjnZgQSj/?igshid=ly4ynqxne44p …,[],['https://www.instagram.com/p/BzhWjnZgQSj/?igshid=ly4ynqxne44p'],[],709,2039,23236,[],[],https://twitter.com/elonmusk/status/1147434859217870848,,,0,,,
+1147434181082800129,1147434181082800129,1562404601000,2019-07-06,14:46:41,IST,44196397,elonmusk,Elon Musk,,What ants must feel like pic.twitter.com/NSsBZXnEvp,[],[],['https://pbs.twimg.com/media/D-yBTguUcAAjOCm.jpg'],1139,11287,107724,[],[],https://twitter.com/elonmusk/status/1147434181082800129,,,0,,, +1147433167860592640,1147405286262628353,1562404360000,2019-07-06,14:42:40,IST,44196397,elonmusk,Elon Musk,,🤣🤣,"['mundanemun', 'cryosphear']",[],[],16,14,507,[],[],https://twitter.com/elonmusk/status/1147433167860592640,,,0,,, diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000..7c84b29 --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,55 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. +# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+
+import pytest
+
+from verifytweet import common
+from verifytweet import result
+
+def test_extract_and_parse_empty_input():
+    """Test extract_and_parse for empty input
+    """
+    with pytest.raises(TypeError):
+        common.extract_and_parse()
+
+
+def test_extract_and_parse_invalid_type_input():
+    """Test extract_and_parse for invalid input type"""
+    # One raises-context per input: a shared block stops at the first raise.
+    for invalid_input in (1234, None):
+        with pytest.raises(TypeError):
+            common.extract_and_parse(invalid_input)
+
+
+def test_extract_and_parse_invalid_input():
+    """Test extract_and_parse for invalid file path
+    """
+    with pytest.raises(ValueError):
+        common.extract_and_parse('')
+    _, result_status = common.extract_and_parse('123')
+    assert result_status == result.ResultStatus.MODULE_FAILURE
+
+
+def test_extract_and_parse_valid_input(file_path):
+    """Test extract_and_parse for valid file path
+    """
+    module_result, result_status = common.extract_and_parse(file_path)
+    assert result_status == result.ResultStatus.ALL_OKAY
+    assert isinstance(module_result, dict)
+    assert module_result['user_id'] == 'pewdiepie'
+    assert module_result['tweet'] == 'ey send me stolen pdp wave designs'
diff --git a/tests/test_controller.py b/tests/test_controller.py
new file mode 100644
index 0000000..bf23146
--- /dev/null
+++ b/tests/test_controller.py
@@ -0,0 +1,67 @@
+# Verify Tweet verifies tweets of a public user
+# from tweet screenshots: real or generated from
+# tweet generators.
+# Copyright (C) 2019 Preetham Kamidi
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from verifytweet import controller
+from verifytweet import result
+
+non_api_approach = controller.NonAPIApproach()
+
+
+def test_exec_empty_input():
+    """Test exec for empty input
+    """
+    with pytest.raises(TypeError):
+        non_api_approach.exec()
+
+
+def test_exec_invalid_type_input():
+    """Test exec for invalid type input
+    """
+    # One raises-context per input: a shared block stops at the first raise.
+    for invalid_input in (None, 123, ['/home/']):
+        with pytest.raises(TypeError):
+            non_api_approach.exec(invalid_input)
+
+
+def test_exec_invalid_input():
+    """Test exec for valid type invalid input
+    """
+    with pytest.raises(ValueError):
+        non_api_approach.exec('')
+    assert non_api_approach.exec(
+        '123')[1] == result.ResultStatus.MODULE_FAILURE
+    assert non_api_approach.exec(
+        '/home')[1] == result.ResultStatus.MODULE_FAILURE
+    assert non_api_approach.exec(
+        'tmp.png')[1] == result.ResultStatus.MODULE_FAILURE
+
+
+def test_exec_valid_input(file_path):
+    """Test exec for valid input
+    """
+    from twint.tweet import tweet
+
+    test_result_tweet = 'ey send me stolen pdp wave designs'
+    test_result_username = 'pewdiepie'
+    module_result, module_status = non_api_approach.exec(file_path)
+    assert module_status == result.ResultStatus.ALL_OKAY
+    assert isinstance(module_result, tweet)
+    assert test_result_tweet in module_result.tweet
+    assert test_result_username == module_result.username
diff --git a/tests/test_date_checker.py b/tests/test_date_checker.py
new file mode 100644
index 0000000..a010c74
--- /dev/null
+++ b/tests/test_date_checker.py
@@ -0,0 +1,82 @@
+# Verify Tweet verifies tweets of a public user
+# from tweet screenshots: real or generated from
+# tweet generators.
+# Copyright (C) 2019 Preetham Kamidi
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+import datetime
+
+from verifytweet import date_checker
+
+
+def test_valid_date_empty_input():
+    """Test for empty input in valid_date
+    """
+    with pytest.raises(TypeError):
+        date_checker.valid_date()
+
+
+def test_valid_date_invalid_type_input():
+    """Test for invalid type input in valid_date
+    """
+    assert date_checker.valid_date(None) == False
+    assert date_checker.valid_date('2018/02/23') == False
+    assert date_checker.valid_date(2018) == False
+
+
+def test_valid_date_invalid_input():
+    """Test for invalid input in valid_date
+    """
+    test_date_str = 'Jan 01 1970 7:40AM'
+    test_date_obj = datetime.datetime.strptime(
+        test_date_str,
+        '%b %d %Y %I:%M%p').replace(tzinfo=datetime.timezone.utc)
+    assert date_checker.valid_date(test_date_obj) == False
+
+
+def test_valid_date_valid_input():
+    """Test for valid date in valid_date
+    """
+    test_date_obj = datetime.datetime.now(datetime.timezone.utc)
+    assert date_checker.valid_date(test_date_obj) == True
+
+
+def test_format_for_date_empty_input():
+    """Test for empty input in format_for_date
+    """
+    with pytest.raises(TypeError):
+        date_checker.format_for_date()
+
+
+def test_format_for_date_invalid_type_input():
+    """Test for invalid type input in format_for_date
+    """
+    # One raises-context per input: a shared block stops at the first raise.
+    for invalid_input in (None, '2018/02/23', 2018):
+        with pytest.raises(TypeError):
+            date_checker.format_for_date(invalid_input)
+
+
+def test_format_for_date_valid_input():
+    """Test for valid input in format_for_date
+    """
+    test_date_obj = datetime.datetime.now(datetime.timezone.utc)
+    test_date_str = date_checker.format_for_date(test_date_obj)
+    formatted_date_obj = datetime.datetime.strptime(
+        test_date_str, '%Y-%m-%d').replace(tzinfo=datetime.timezone.utc)
+    assert test_date_obj.year == formatted_date_obj.year
+    assert test_date_obj.month == formatted_date_obj.month
+    assert test_date_obj.day == formatted_date_obj.day
diff --git a/tests/test_image_service.py b/tests/test_image_service.py
new file mode 100644
index 0000000..d9b687d
--- /dev/null
+++ b/tests/test_image_service.py
@@ -0,0 +1,58 @@
+# Verify Tweet verifies tweets of a public user
+# from tweet screenshots: real or generated from
+# tweet generators.
+# Copyright (C) 2019 Preetham Kamidi
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from verifytweet import image as image_service
+from verifytweet import result
+
+extractor = image_service.Extractor()
+
+def test_get_text_empty_input():
+    """Test get_text for empty input
+    """
+    with pytest.raises(TypeError):
+        extractor.get_text()
+
+
+def test_get_text_invalid_type_input():
+    """Test get_text for invalid type input
+    """
+    # One raises-context per input: a shared block stops at the first raise.
+    for invalid_input in (None, 123, ['123']):
+        with pytest.raises(TypeError):
+            extractor.get_text(invalid_input)
+
+
+def test_get_text_invalid_input():
+    """Test get_text for valid type invalid input
+    """
+    with pytest.raises(ValueError):
+        extractor.get_text('')
+    assert extractor.get_text('123')[1] == result.ResultStatus.MODULE_FAILURE
+    assert extractor.get_text('/home')[1] == result.ResultStatus.MODULE_FAILURE
+    assert extractor.get_text('tmp.')[1] == result.ResultStatus.MODULE_FAILURE
+
+
+def test_get_text_valid_input(file_path):
+    """Test get_text for valid input
+    """
+    test_result = "ey send me stolen pdp wave designs"
+    module_result, module_status = extractor.get_text(file_path)
+    assert module_status == result.ResultStatus.ALL_OKAY
+    assert test_result in module_result
diff --git a/tests/test_object_mapper.py b/tests/test_object_mapper.py
new file mode 100644
index 0000000..a5f044c
--- /dev/null
+++ b/tests/test_object_mapper.py
@@ -0,0 +1,86 @@
+# Verify Tweet verifies tweets of a public user
+# from tweet screenshots: real or generated from
+# tweet generators.
+# Copyright (C) 2019 Preetham Kamidi
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import pytest +import datetime + +from twint.tweet import tweet + +from verifytweet import object_mapper + +def test_map_keys_empty_input(): + """Test for empty input in map_keys + """ + with pytest.raises(TypeError): + object_mapper.map_keys() + +def test_map_keys_invalid_type_input(): + """Test for invalid type input in map_keys + """ + with pytest.raises(TypeError): + object_mapper.map_keys(None) + object_mapper.map_keys(dict({ + "id": "dummy", + "tweet": "dummy" + })) + object_mapper.map_keys(["id", "dummy"]) + +def test_map_keys_invalid_input(): + """Test for valid type, invalid input in map_keys + """ + test_tweet_obj = tweet() + with pytest.raises(ValueError): + object_mapper.map_keys(test_tweet_obj) + +def test_map_keys_valid_input(): + """Test for valid type, valid input in map_keys + """ + test_tweet_obj = tweet() + test_id = "1234" + test_username = "twitter" + test_tweet = "Hello World!" 
+ test_result = dict({ + "id": test_id, + "username": test_username, + "tweet": test_tweet + }) + test_tweet_obj.id = test_id + test_tweet_obj.conversation_id = str() + test_tweet_obj.username = test_username + test_tweet_obj.datetime = datetime.datetime.now() + test_tweet_obj.datestamp = str() + test_tweet_obj.timestamp = str() + test_tweet_obj.user_id = str() + test_tweet_obj.name = str() + test_tweet_obj.place = None + test_tweet_obj.timezone = str() + test_tweet_obj.mentions = list() + test_tweet_obj.urls = list() + test_tweet_obj.photos = list() + test_tweet_obj.video = list() + test_tweet_obj.tweet = test_tweet + test_tweet_obj.hashtags = list() + test_tweet_obj.replies_count = str() + test_tweet_obj.retweets_count = str() + test_tweet_obj.likes_count = str() + test_tweet_obj.link = str() + test_tweet_obj.retweet = str() + result, module_status = object_mapper.map_keys(test_tweet_obj) + assert result["id"] == test_result["id"] + assert result["username"] == test_result["username"] + assert result["tweet"] == test_result["tweet"] diff --git a/tests/test_search_service.py b/tests/test_search_service.py new file mode 100644 index 0000000..a54ee72 --- /dev/null +++ b/tests/test_search_service.py @@ -0,0 +1,71 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. +# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import pytest +import datetime + +from verifytweet import search as search_service +from verifytweet import result + +twitter_api_search = search_service.TwitterAPISearch() +twint_search = search_service.TwintSearch() + + +def test_search_empty_input(): + """Test search for empty input + """ + with pytest.raises(TypeError): + twint_search.search() + + +def test_search_invalid_type_input(): + """Test search for invalid type input + """ + with pytest.raises(TypeError): + twint_search.search(123) + twint_search.search('123') + twint_search.search(None, None, None) + twint_search.search(['123'], [123], ['123']) + twint_search.search('123', None, None) + twint_search.search('123', '2019-07-06', 123) + + +def test_search_invalid_input(): + """Test search for valid type invalid input + """ + with pytest.raises(ValueError): + twint_search.search( + '', + '', + datetime.datetime.now()) + + +def test_search_valid_input(): + """Test search for valid input + """ + test_user_id = 'elonmusk' + test_datetime = datetime.datetime.strptime('2019-07-06', '%Y-%m-%d') + test_tweet_snippet = 'Sharknado' + test_tweet = 'Sharknado is real' + module_result, module_status = twint_search.search(test_user_id, + test_tweet_snippet, + test_datetime) + assert module_status == result.ResultStatus.ALL_OKAY + assert len(module_result) > 0 + assert isinstance(module_result[0].tweet, str) + assert test_tweet in module_result[0].tweet diff --git a/tests/test_text_service.py b/tests/test_text_service.py new file mode 100644 index 0000000..d45062f --- /dev/null +++ b/tests/test_text_service.py @@ -0,0 +1,157 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. 
+# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import pytest +import datetime + +from verifytweet import text as text_service +from verifytweet import result + +data_parser = text_service.DataParser() +text_processor = text_service.TextProcessor() + + +def test_get_entities_empty_input(): + """Test get_entities for empty input + """ + with pytest.raises(TypeError): + data_parser.get_entities() + + +def test_get_entities_invalid_type_input(): + """Test get_entities for invalid type input + """ + with pytest.raises(TypeError): + data_parser.get_entities({1}) + data_parser.get_entities(None) + data_parser.get_entities(['123']) + + +def test_get_entities_invalid_input(): + """Test get_entities for valid type invalid input + """ + with pytest.raises(ValueError): + data_parser.get_entities('') + + +def test_get_entities_valid_input(): + """Test get_entities for valid type valid extracted string + """ + test_extracted_text = """ + + Elon Musk @ + © @elonmusk CC ¥ + Ms. Tree caught the Falcon fairing!! + + 1:21 AM - 25 Jun 2019 + + + + 2,174 Retweets 42,613 Likes oO ome C wo + + © 10K fT) 22K © 48K M4 + + """ + test_result_user_id = 'elonmusk' + test_result_tweet = 'Ms. Tree caught the Falcon fairing!!' 
+ test_result_datetime = datetime.datetime(2019, + 6, + 25, + 1, + 21, + tzinfo=datetime.timezone.utc) + module_result, module_status = data_parser.get_entities( + test_extracted_text) + assert module_status == result.ResultStatus.ALL_OKAY + assert test_result_user_id == module_result['user_id'] + assert test_result_tweet in module_result['tweet'] + assert test_result_datetime == module_result['date'] + + +def test_clean_text_empty_input(): + """Test clean_text for empty input + """ + with pytest.raises(TypeError): + data_parser.clean_text() + + +def test_clean_text_invalid_type_input(): + """Test clean_text for invalid type input + """ + with pytest.raises(TypeError): + data_parser.clean_text(None) + data_parser.clean_text(123) + data_parser.clean_text([1, '2', '3']) + + +def test_clean_text_invalid_input(): + """Test clean_text for valid type invalid input + """ + with pytest.raises(ValueError): + data_parser.clean_text('') + + +def test_clean_text_valid_input(): + """Test clean_text for valid input + """ + test_str = "Ms. Tree caught the Falcon fairing!!" 
+ module_result, module_status = data_parser.clean_text(test_str) + assert module_status == result.ResultStatus.ALL_OKAY + assert module_result == "caught Falcon fairing" + + +def test_get_similarity_empty_input(): + """Test get_similarity for empty input + """ + with pytest.raises(TypeError): + text_processor.get_similarity() + + +def test_get_similarity_invalid_type_input(): + """Test get_similarity for invalid type input + """ + with pytest.raises(TypeError): + text_processor.get_similarity(123, 123) + text_processor.get_similarity(None, None) + text_processor.get_similarity(None, 123) + text_processor.get_similarity([], '') + + +def test_get_similarity_invalid_input(): + """Test get_similarity for valid type invalid input + """ + with pytest.raises(ValueError): + text_processor.get_similarity('', []) + + +def test_get_similarity_valid_input(tweet_data): + """Test get_similarity for valid input + """ + from numpy import allclose, array + + test_extracted_text = "What ants must feel like pic.twitter.com/NSsBZXnEvp" + test_result = array([[1., 0., 0., 0.09245003, 1., 0.], + [0., 1., 0.20412415, 0., 0., 0.], + [0., 0.20412415, 1., 0., 0., 0.], + [0.09245003, 0., 0., 1., 0.09245003, 0.], + [1., 0., 0., 0.09245003, 1., 0.], + [0., 0., 0., 0., 0., 0.]]) + module_result, module_status = text_processor.get_similarity( + test_extracted_text, tweet_data) + assert module_status == result.ResultStatus.ALL_OKAY + assert allclose(module_result, test_result) diff --git a/tests/test_uploader.py b/tests/test_uploader.py new file mode 100644 index 0000000..b3c3fe5 --- /dev/null +++ b/tests/test_uploader.py @@ -0,0 +1,61 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. 
+# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import pytest + +from werkzeug.datastructures import FileStorage + +from verifytweet import uploader +from verifytweet import settings + + +def test_save_to_disk_empty_input(): + """Test save to disk for empty input + """ + with pytest.raises(TypeError): + uploader.save_to_disk() + + +def test_save_to_disk_invalid_type_input(): + """Test save to disk for invalid input type + """ + with pytest.raises(TypeError): + uploader.save_to_disk('') + uploader.save_to_disk('123') + uploader.save_to_disk(None) + uploader.save_to_disk(123) + + +def test_save_to_disk_invalid_input(): + """Test save to disk for invalid input of valid type + """ + test_file_obj = FileStorage('123') + with pytest.raises(ValueError): + uploader.save_to_disk(test_file_obj) + + +def test_save_to_disk_valid_input(file_path): + """Test save to disk for valid file object + """ + app_config = settings.app_config + app_config.ALLOWED_EXTENSIONS = set(["png", "jpg", "jpeg"]) + + with open(file_path, 'rb') as f: + test_file_obj = FileStorage(f) + test_file_name = uploader.save_to_disk(test_file_obj) + assert isinstance(test_file_name, str) diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 0000000..58324ad --- /dev/null +++ b/tests/test_validator.py @@ -0,0 +1,68 @@ +# Verify Tweet verifies tweets of a 
public user +# from tweet screenshots: real or generated from +# tweet generators. +# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import pytest +import numpy + +from verifytweet import validator +from verifytweet import settings +from verifytweet import result + +app_config = settings.app_config +app_config.SIMILARITY_THRESHOLD = 0.6 + +def test_validator_empty_input(): + """Test verify validity for empty input + """ + with pytest.raises(TypeError): + validator.verify_validity() + + +def test_validator_invalid_type_input(): + """Test verify validity for invalid type input + """ + with pytest.raises(TypeError): + validator.verify_validity(None) + validator.verify_validity(list()) + validator.verify_validity([[]]) + + +def test_validator_invalid_input(): + """Test verify validity for invalid input + """ + test_numpy_array = numpy.array([[None, None], [None, None]]) + with pytest.raises(ValueError): + validator.verify_validity(test_numpy_array) + + +def test_validator_valid_similarity_matrix(): + """Test verfiy validity for valid similarity matrix + """ + test_numpy_array = numpy.array([[1., 0.7, 0.6], [0.5, 0.1, 1.]]) + module_result, match_index, result_status = validator.verify_validity(test_numpy_array) + assert result_status == result.ResultStatus.ALL_OKAY + assert module_result == True + + +def test_validator_invalid_similarity_matrix(): + 
"""Test verfiy validity for valid similarity matrix + """ + test_numpy_array = numpy.array([[0.1, 0.1], [0.1, 0.1]]) + module_result, match_index, result_status = validator.verify_validity(test_numpy_array) + assert result_status == result.ResultStatus.ALL_OKAY + assert module_result == False diff --git a/verifytweet/__init__.py b/verifytweet/__init__.py index 420488e..185667b 100644 --- a/verifytweet/__init__.py +++ b/verifytweet/__init__.py @@ -16,4 +16,17 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -__version__ = "0.5.0" \ No newline at end of file +from .config import settings +from .services import controller +from .services import image +from .services import search +from .services import text +from .util import date_checker +from .util import logging +from .util import object_mapper +from .util import result +from .util import uploader +from .util import validator +from .util import common + +__version__ = "0.5.1" diff --git a/verifytweet/app.py b/verifytweet/app.py index 69f8d61..9490793 100644 --- a/verifytweet/app.py +++ b/verifytweet/app.py @@ -21,14 +21,13 @@ from flask import Flask, jsonify, request from flask_cors import CORS -import verifytweet.services.controller as controller -import verifytweet.services.image as image_service -import verifytweet.util.uploader as image_uploader -import verifytweet.util.object_mapper as object_mapper - -from verifytweet.util.logging import logger -from verifytweet.config.settings import app_config -from verifytweet.util.result import ResultStatus +from .services import controller +from .services import image as image_service +from .util import uploader as image_uploader +from .util import object_mapper +from .util.logging import logger +from .config.settings import app_config +from .util.result import ResultStatus router = Flask(__name__, static_folder=app_config.FILE_DIRECTORY) router.config['MAX_CONTENT_LENGTH'] = 
app_config.MAX_CONTENT_LENGTH @@ -74,26 +73,17 @@ def verify_tweet(): return "Missing form fields", 400 try: file_path = image_uploader.save_to_disk(request_image) - rest_controller = controller.NonAPIApproach(file_path) + rest_controller = controller.NonAPIApproach() + result, controller_status = rest_controller.exec(file_path) except Exception as e: logger.exception(e) return jsonify({ 'status': ResultStatus.MODULE_FAILURE.value, 'result': None }) - result, controller_status = rest_controller.exec() if controller_status != ResultStatus.ALL_OKAY: - return jsonify({ - 'status': controller_status.value, - 'result': result - }) + return jsonify({'status': controller_status.value, 'result': result}) tweet_dict, mapper_status = object_mapper.map_keys(result) if mapper_status != ResultStatus.ALL_OKAY: - return jsonify({ - 'status': mapper_status.value, - 'result': tweet_dict - }) - return jsonify({ - 'status': mapper_status.value, - 'result': tweet_dict - }) + return jsonify({'status': mapper_status.value, 'result': tweet_dict}) + return jsonify({'status': mapper_status.value, 'result': tweet_dict}) diff --git a/verifytweet/cli.py b/verifytweet/cli.py index 2aecf86..46d329a 100644 --- a/verifytweet/cli.py +++ b/verifytweet/cli.py @@ -17,11 +17,8 @@ # along with this program. If not, see . 
import os - import click -os.environ["VERIFYTWEET_RUN_FROM_CLI"] = "true" - from .services import controller from .config.settings import app_config from .util.logging import logger @@ -53,16 +50,16 @@ def run_as_command(filepath): """ try: - verify_controller = controller.NonAPIApproach(filepath) + verify_controller = controller.NonAPIApproach() + tweet_obj, controller_status = verify_controller.exec(filepath) + if controller_status == ResultStatus.MODULE_FAILURE: + print(f"Something went wrong, Please try again!") + elif controller_status == ResultStatus.NO_RESULT: + print(f"Fake Tweet!") + else: + print(f"\nVerified Tweet!") + print( + f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****" + ) except Exception as e: logger.exception(e) - tweet_obj, controller_status = verify_controller.exec() - if controller_status == ResultStatus.MODULE_FAILURE: - print(f"Something went wrong, Please try again!") - elif controller_status == ResultStatus.NO_RESULT: - print(f"Fake Tweet!") - else: - print(f"\nVerified Tweet!") - print( - f"**** Username: {tweet_obj.username} ****\n**** Tweet: {tweet_obj.tweet} ****\n**** Likes: {tweet_obj.likes_count} ****\n**** Retweets: {tweet_obj.retweets_count} ****\n**** Link: {tweet_obj.link} ****" - ) diff --git a/verifytweet/config/__init__.py b/verifytweet/config/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/verifytweet/config/settings.py b/verifytweet/config/settings.py index edb8dbc..3d4dcde 100644 --- a/verifytweet/config/settings.py +++ b/verifytweet/config/settings.py @@ -43,7 +43,8 @@ class Config(object): FILE_DIRECTORY = tempfile.mkdtemp() TWEET_MAX_STORE = 150 RUN_METHOD = "cli" - LOG_LEVEL = logging.INFO + LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG') else logging.INFO + SIMILARITY_THRESHOLD = 0.6 class TwitterAPIConfig(Config): @@ -60,7 +61,6 @@ class 
TwitterAPIConfig(Config): TWEET_COUNT_KEY = "count" TWEET_MAX_OLD = 7 TWEET_TEXT_KEY = "text" - SIMILARITY_THRESHOLD = 0.6 class WebConfig(Config): @@ -77,7 +77,7 @@ class WebConfig(Config): ALLOWED_EXTENSIONS = set(["png", "jpg", "jpeg"]) -run_method = "cli" if "VERIFYTWEET_RUN_FROM_CLI" in os.environ else "web" +run_method = "web" if "VERIFYTWEET_RUN_FOR_WEB" in os.environ else "cli" Config.RUN_METHOD = run_method configurations = {"web": WebConfig, "cli": Config} diff --git a/verifytweet/services/__init__.py b/verifytweet/services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/verifytweet/services/controller.py b/verifytweet/services/controller.py index 94aa861..0266fa2 100644 --- a/verifytweet/services/controller.py +++ b/verifytweet/services/controller.py @@ -16,11 +16,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import os + import verifytweet.services.image as image_service import verifytweet.services.text as text_service import verifytweet.services.search as search_service import verifytweet.util.date_checker as date_checker -import verifytweet.util.validator as validator +import verifytweet.util.common as common from verifytweet.util.logging import logger from verifytweet.config.settings import app_config @@ -29,19 +31,12 @@ class APIApproach(object): """Use Twitter API to verify tweet - - Attributes: - file_path: A string denoting a twitter username. 
""" - def __init__(self, file_path: str): - if not isinstance(file_path, str): - raise TypeError('File path must be type str') - if not file_path: - raise ValueError('File path must be a valid string') - self.file_path = file_path + def __init__(self): + pass - def exec(self): + def exec(self, file_path: str): """Executes controller flow Controller uses image service to extract text from @@ -50,64 +45,51 @@ def exec(self): to retrieve same day tweets, text service to find similar tweet and finally verifying the tweet. + Attributes: + file_path: A string denoting a twitter username. + Returns: valid_tweet: A tweet object status: Enum ResultStatus representing result status """ - entities, preprocess_status = preprocess(self.file_path) + if not isinstance(file_path, str): + raise TypeError('File path must be type str') + if not file_path: + raise ValueError('File path must be a valid string') + entities, preprocess_status = common.extract_and_parse(file_path) if preprocess_status != ResultStatus.ALL_OKAY: return (None, ResultStatus.MODULE_FAILURE) try: - search_controller = search_service.TwitterAPISearch( + search_controller = search_service.TwitterAPISearch() + same_day_tweets, search_status = search_controller.aggregate_tweets( entities['user_id'], entities['date']) except Exception as e: logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) - same_day_tweets, search_status = search_controller.aggregate_tweets() if search_status != ResultStatus.ALL_OKAY: return (None, search_status) - - try: - text_processor = text_service.TextProcessor( - entities['tweet'], same_day_tweets) - except Exception as e: - logger.exception(e) - return (None, ResultStatus.MODULE_FAILURE) - similarity_matrix, processor_status = text_processor.get_similarity() - if processor_status != ResultStatus.ALL_OKAY: - return (None, processor_status) - - try: - valid_tweet, validator_status = validator.verify_validity( - similarity_matrix) - except Exception as e: - logger.exception(e) - 
return (None, ResultStatus.MODULE_FAILURE) + validity, match_index, validator_status = common.calculate_and_validate( + entities=entities, same_day_tweets=same_day_tweets) if validator_status != ResultStatus.ALL_OKAY: - return (None, validator_status) - logger.info('Tweet Validity: ' + str(valid_tweet)) - return (valid_tweet, ResultStatus.ALL_OKAY) + return (None, ResultStatus.MODULE_FAILURE) + return (same_day_tweets[match_index], ResultStatus.ALL_OKAY) class NonAPIApproach(object): """Use a non-api approach to verify tweet - - Attributes: - file_path: A string denoting a twitter username. """ - def __init__(self, file_path: str): - if not isinstance(file_path, str): - raise TypeError('File path must be type str') - if not file_path: - raise ValueError('File path must be a valid string') - self.file_path = file_path + def __init__(self): + pass - def exec(self): + def exec(self, file_path): """Executes controller flow + Attributes: + file_path: A string denoting a twitter username. + Controller uses image service to extract text from image, passes text to text service to parse entities such as username, tweet as well as date, uses search service @@ -118,63 +100,40 @@ def exec(self): status: Enum ResultStatus representing result status """ - entities, preprocess_status = preprocess(self.file_path) + if not isinstance(file_path, str): + raise TypeError('File path must be type str') + if not file_path: + raise ValueError('File path must be a valid string') + entities, preprocess_status = common.extract_and_parse(file_path) if preprocess_status != ResultStatus.ALL_OKAY: return (None, ResultStatus.MODULE_FAILURE) try: - text_processor = text_service.DataParser(entities['tweet']) + text_processor = text_service.DataParser() + tweet_snippet, text_processor_status = text_processor.clean_text( + entities['tweet']) except Exception as e: logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) - tweet_snippet, text_processor_status = text_processor.clean_text() if 
text_processor_status != ResultStatus.ALL_OKAY: return (None, text_processor_status) try: search_controller = search_service.TwintSearch() search_results, search_status = search_controller.search( - entities['user_id'], entities['date'], tweet_snippet) + entities['user_id'], tweet_snippet, entities['date']) except Exception as e: logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) if search_status != ResultStatus.ALL_OKAY: return (None, search_status) - + if not entities['date']: + same_day_tweets = list() + for tweet_obj in search_results: + same_day_tweets.append(tweet_obj.tweet) + validity, match_index, validator_status = common.calculate_and_validate( + entities=entities, same_day_tweets=same_day_tweets) + if validator_status != ResultStatus.ALL_OKAY: + return (None, ResultStatus.MODULE_FAILURE) + return (search_results[match_index], ResultStatus.ALL_OKAY) return (search_results[0], ResultStatus.ALL_OKAY) - - -def preprocess(file_path): - """Preprocesses text - - Extracts text from image using image service, - parses entities from text using text service. - - Args: - file_path: represents path of the image file. - - Returns: - entities: Entities parsed from text such as tweet, user_id and date. 
- status: Enum ResultStatus representing result status - - """ - try: - text_extractor = image_service.Extractor(file_path) - except Exception as e: - logger.exception(e) - return (None, ResultStatus.MODULE_FAILURE) - extracted_text, extractor_status = text_extractor.get_text() - if extractor_status != ResultStatus.ALL_OKAY: - return (None, extractor_status) - logger.debug('Processed text: ' + extracted_text) - - try: - entity_parser = text_service.DataParser(extracted_text) - except Exception as e: - logger.exception(e) - return (None, ResultStatus.MODULE_FAILURE) - entities, parser_status = entity_parser.get_entities() - if parser_status != ResultStatus.ALL_OKAY: - return (None, parser_status) - logger.debug('Entities: ' + str(entities)) - return (entities, parser_status) diff --git a/verifytweet/services/image.py b/verifytweet/services/image.py index b8ce23f..ccf098c 100644 --- a/verifytweet/services/image.py +++ b/verifytweet/services/image.py @@ -16,7 +16,9 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +import os import subprocess +import uuid import PIL import pytesseract @@ -35,22 +37,26 @@ class Extractor(object): file_path: A string indicating file path where the image is stored. 
""" - def __init__(self, file_path: str): + def __init__(self): + pass + + def get_text(self, file_path: str): + """Extracts text from image + """ if not isinstance(file_path, str): raise TypeError('File path must be type string') if not file_path: raise ValueError('File path cannot be empty') - self.file_path = file_path - - def get_text(self): - """Extracts text from image - """ logger.info('Processing Image...') - new_file_path = self.rescale(self.file_path) - logger.info('Extracting text from rescaled image...') try: + new_file_path = self.rescale(file_path) + logger.info('Extracting text from rescaled image...') img = PIL.Image.open(new_file_path) text = pytesseract.image_to_string(image=img) + try: + os.remove(new_file_path) + except Exception as e: + logger.exception(e) if not text: return (None, ResultStatus.NO_RESULT) return (text, ResultStatus.ALL_OKAY) @@ -60,12 +66,18 @@ def get_text(self): @staticmethod def rescale(file_path): + if not isinstance(file_path, str): + raise TypeError('File path must be type string') + if not file_path: + raise ValueError('File path cannot be empty') logger.info('Rescaling Image to 300 dpi...') - new_file_path = file_path.rsplit('.', 1)[0] + '.png' + new_file_path = os.path.join(app_config.FILE_DIRECTORY, + str(uuid.uuid1()) + '.png') cmd = [ 'convert', file_path, '-resample', app_config.UPSCALE_RESOLUTION, '-alpha', 'off', '-colorspace', 'Gray', '-threshold', '75%', new_file_path ] - subprocess.run(cmd) + completed_process = subprocess.run(cmd) + completed_process.check_returncode() return new_file_path diff --git a/verifytweet/services/search.py b/verifytweet/services/search.py index 3f7623c..8db6f8c 100644 --- a/verifytweet/services/search.py +++ b/verifytweet/services/search.py @@ -40,18 +40,10 @@ class TwitterAPISearch(object): date: A datetime object representing the date in question. 
""" - def __init__(self, user_id: str, date: datetime.datetime): - if not isinstance(user_id, str) or not isinstance( - date, datetime.datetime): - raise TypeError( - 'User ID must be type string and date must be type datetime.datetime' - ) - if not user_id or not date: - raise ValueError('User ID or Date cannot be empty') - self.user_id = user_id - self.date = date + def __init__(self): + pass - def aggregate_tweets(self): + def aggregate_tweets(self, user_id: str, date: datetime.datetime): """Aggregates tweets from a single day. Retrieves tweets pertaining to the given username and date using Twitter Search API. @@ -70,9 +62,16 @@ def aggregate_tweets(self): } """ + if not isinstance(user_id, str) or not isinstance( + date, datetime.datetime): + raise TypeError( + 'User ID must be type string and date must be type datetime.datetime' + ) + if not user_id or not date: + raise ValueError('User ID or Date cannot be empty') logger.info('Searching for tweet using Twitter API...') querystring = dict({ - app_config.TWEET_USERNAME_KEY: self.user_id, + app_config.TWEET_USERNAME_KEY: user_id, app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT }) try: @@ -87,10 +86,10 @@ def aggregate_tweets(self): tweet_date = date_parser.parse(entry[app_config.TWEET_DATE_KEY]) if date_checker.format_for_date( tweet_date) == date_checker.format_for_date( - self.date) and date_checker.valid_date(tweet_date): + date) and date_checker.valid_date(tweet_date): logger.debug('Tweet found...: ' + str(entry[app_config.TWEET_TEXT_KEY])) - same_day_tweets.append(entry[app_config.TWEET_TEXT_KEY]) + same_day_tweets.append(entry) if not same_day_tweets: return (same_day_tweets, ResultStatus.NO_RESULT) return (same_day_tweets, ResultStatus.ALL_OKAY) @@ -131,8 +130,8 @@ class TwintSearch(object): def __init__(self): pass - def search(self, user_id: str, date: datetime.datetime, - tweet_snippet: str): + def search(self, user_id: str, tweet_snippet: str, + date: datetime.datetime = None): """Searches for 
tweets Retrieves tweets of given username, date as well as tweet snippet using Twint. @@ -146,25 +145,28 @@ def search(self, user_id: str, date: datetime.datetime, ([], ResultStatus.ALL_OKAY) """ - if not isinstance(user_id, str) or not isinstance( - date, datetime.datetime) or not (tweet_snippet, str): + if not isinstance(user_id, str) or not (tweet_snippet, str): raise TypeError( 'User ID and tweet_snippet must be type string, date must be type datetime.datetime' ) - if not user_id or not date or not tweet_snippet: + if not user_id or not tweet_snippet: raise ValueError('User ID, Tweet or Date cannot be empty') + results = list() twint_config = twint.Config() twint_config.Username = user_id - twint_config.Search = tweet_snippet - twint_config.Since = date_checker.format_for_date(date) + if date: + twint_config.Since = date_checker.format_for_date(date) + twint_config.Until = date_checker.format_for_date(date + datetime.timedelta(days=1)) + else: + twint_config.Search = tweet_snippet twint_config.Limit = app_config.TWEET_MAX_STORE twint_config.Store_object = True + twint_config.Store_object_tweets_list = results try: twint.run.Search(twint_config) except Exception as e: logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) - results = twint.output.tweets_object if not results: return (results, ResultStatus.NO_RESULT) logger.debug(f'Search results: {results}\n') diff --git a/verifytweet/services/text.py b/verifytweet/services/text.py index 4dea408..07ddcde 100644 --- a/verifytweet/services/text.py +++ b/verifytweet/services/text.py @@ -35,26 +35,26 @@ count_vectorizer = CountVectorizer() stopwords = set(nltk.corpus.stopwords.words('english')) +USERNAME_REGEX = r'@(\w{1,15})\b' +DATETIME_REGEX = r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}' +ALPHANUM_REGEX = r'[^A-Za-z0-9]+' + class DataParser(object): """Parses data from extracted text - - Attributes: - extracted_text: A string denoting extracted text from image. 
""" - def __init__(self, extracted_text: str): - if not isinstance(extracted_text, str): - raise TypeError('Extracted text must be type string') - if not extracted_text: - raise ValueError('Extracted text cannot be empty') - self.text = extracted_text + def __init__(self): + pass - def get_entities(self): + def get_entities(self, extracted_text: str): """Parses entities from extracted text. Parses username (denoted by user_id), tweet as well as date from extracted text. + Attributes: + extracted_text: A string denoting extracted text from image. + Returns: A tuple contaning a dictionary: a mapping of user_id, tweet and date as well as Enum ResultStatus which gives out result status. @@ -67,47 +67,64 @@ def get_entities(self): } """ + if not isinstance(extracted_text, str): + raise TypeError('Extracted text must be type string') + if not extracted_text: + raise ValueError('Extracted text cannot be empty') logger.info('Parsing data out of extracted text...') - username_match = re.search(r'@(\w{1,15})\b', self.text) - datetime_match = re.search( - r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}', - self.text) - if not username_match or not datetime_match: + username_match = re.search(USERNAME_REGEX, extracted_text) + datetime_match = re.search(DATETIME_REGEX, extracted_text) + if not username_match: return (dict({ 'user_id': None, 'tweet': None, 'datetime': None }), ResultStatus.NO_RESULT) user_id = username_match.group()[1:] + tweet_start_index = username_match.end() + tweet_end_index = len( + extracted_text + ) - 1 if not datetime_match else datetime_match.start() + tweet = extracted_text[tweet_start_index:tweet_end_index].strip() + if not datetime_match: + return (dict({ + 'user_id': user_id, + 'tweet': tweet, + 'date': None + }), ResultStatus.ALL_OKAY) date_str = datetime_match.group().replace('-', '') processed_datetime = date_parser.parse(date_str).replace( tzinfo=datetime.timezone.utc) - username_end_index = username_match.end() - 
date_start_index = datetime_match.start() - tweet = self.text[username_end_index + 5:date_start_index].strip() return (dict({ 'user_id': user_id, 'tweet': tweet, 'date': processed_datetime }), ResultStatus.ALL_OKAY) - def clean_text(self): + def clean_text(self, extracted_text: str): """Removes stop words and samples words out of tweet to create a snippet. + Attributes: + extracted_text: A string denoting extracted text from image. + Returns: A tuple contaning a tweet snippet as well as Enum ResultStatus which gives out result status. """ + if not isinstance(extracted_text, str): + raise TypeError('Extracted text must be type string') + if not extracted_text: + raise ValueError('Extracted text cannot be empty') try: - non_punc_tweet = self.text.translate( + non_punc_tweet = extracted_text.translate( str.maketrans('', '', string.punctuation)) word_tokens = nltk.tokenize.word_tokenize(non_punc_tweet) except Exception as e: logger.exception(e) return (None, ResultStatus.MODULE_FAILURE) filtered_sentence = [w for w in word_tokens if not w in stopwords] - picked_words = filtered_sentence[0:min([len(filtered_sentence), 4])] + picked_words = filtered_sentence[2:min([len(filtered_sentence), 6])] tweet_snippet = " ".join(picked_words) if not tweet_snippet: return (tweet_snippet, ResultStatus.NO_RESULT) @@ -117,32 +134,22 @@ def clean_text(self): class TextProcessor(object): """Processes extracted tweet and aggregated tweets - - Attributes: - extracted_tweet: A string denoting extracted tweet from image. 
- same_day_tweets: A list contaning tweets of target date """ - def __init__(self, extracted_tweet: str, same_day_tweets: list): - if not isinstance(extracted_tweet, str) or not isinstance( - same_day_tweets, list): - raise TypeError( - 'Extracted tweet must be type str and Same day tweets must be type list' - ) - if not extracted_tweet or not same_day_tweets: - raise ValueError( - 'Extracted tweet must be a valid string and same day tweets must be a valid list' - ) - self.extracted_tweet = extracted_tweet - self.same_day_tweets = same_day_tweets + def __init__(self): + pass - def get_similarity(self): + def get_similarity(self, extracted_tweet: str, same_day_tweets: list): """Calculates a similarity matrix. Calculates a similarity matrix of the corpus containing extracted tweet and tweets aggregated from Twitter Search API using consine similarity approach. + Attributes: + extracted_tweet: A string denoting extracted tweet from image. + same_day_tweets: A list contaning tweets of target date + Returns: A tuple contaning a similarity matrix, which is a numpy array as well as Enum ResultStatus which gives out result status. 
@@ -153,11 +160,20 @@ def get_similarity(self): """ + if not isinstance(extracted_tweet, str) or not isinstance( + same_day_tweets, list): + raise TypeError( + 'Extracted tweet must be type str and Same day tweets must be type list' + ) + if not extracted_tweet or not same_day_tweets: + raise ValueError( + 'Extracted tweet must be a valid string and same day tweets must be a valid list' + ) logger.info('Processing similarity of two tweets...') corpus = list() - corpus.append(self.extracted_tweet) - corpus.extend(self.same_day_tweets) - logger.info('Corpus: ' + str(corpus)) + corpus.append(extracted_tweet) + corpus.extend(same_day_tweets) + logger.debug('Corpus: ' + str(corpus)) try: sparse_matrix = count_vectorizer.fit_transform(corpus) similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix) diff --git a/verifytweet/util/__init__.py b/verifytweet/util/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/verifytweet/util/common.py b/verifytweet/util/common.py new file mode 100644 index 0000000..b5c5f80 --- /dev/null +++ b/verifytweet/util/common.py @@ -0,0 +1,103 @@ +# Verify Tweet verifies tweets of a public user +# from tweet screenshots: real or generated from +# tweet generators. +# Copyright (C) 2019 Preetham Kamidi + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +import verifytweet.services.image as image_service +import verifytweet.services.text as text_service +import verifytweet.util.validator as validator + +from verifytweet.util.logging import logger +from verifytweet.util.result import ResultStatus + + +def extract_and_parse(file_path: str): + """Preprocess text from image + + Extracts text from image using image service, + parses entities from text using text service. + + Args: + file_path: represents path of the image file. + + Returns: + entities: Entities parsed from text such as tweet, user_id and date. + status: Enum ResultStatus representing result status + + """ + if not isinstance(file_path, str): + raise TypeError('File path must be type string') + if not file_path: + raise ValueError('File path must be a valid path') + try: + text_extractor = image_service.Extractor() + extracted_text, extractor_status = text_extractor.get_text(file_path) + except Exception as e: + logger.exception(e) + return (None, ResultStatus.MODULE_FAILURE) + if extractor_status != ResultStatus.ALL_OKAY: + return (None, extractor_status) + logger.debug('Processed text: ' + extracted_text) + + try: + entity_parser = text_service.DataParser() + entities, parser_status = entity_parser.get_entities(extracted_text) + except Exception as e: + logger.exception(e) + return (None, ResultStatus.MODULE_FAILURE) + if parser_status != ResultStatus.ALL_OKAY: + return (None, parser_status) + logger.debug('Entities: ' + str(entities)) + return (entities, parser_status) + + +def calculate_and_validate(entities: dict, same_day_tweets: list): + """Calculates similarity matrix and validates tweet + + Calculates a similarity matrix from same day tweet + corpus using text service and validates tweet + using validator + + Args: + entities: represents dictionary of entities extracted from text + same_day_tweets: list of strings representing same day tweets + + Returns: + valid_tweet: Validity status of tweet + status: Enum ResultStatus representing 
result status + + """ + try: + text_processor = text_service.TextProcessor() + similarity_matrix, processor_status = text_processor.get_similarity( + entities['tweet'], same_day_tweets) + except Exception as e: + logger.exception(e) + return (None, None, ResultStatus.MODULE_FAILURE) + if processor_status != ResultStatus.ALL_OKAY: + return (None, None, processor_status) + + try: + valid_tweet, match_index, validator_status = validator.verify_validity( + similarity_matrix) + except Exception as e: + logger.exception(e) + return (None, None, ResultStatus.MODULE_FAILURE) + if validator_status != ResultStatus.ALL_OKAY: + return (None, None, validator_status) + logger.debug('Tweet Validity: ' + str(valid_tweet)) + return (valid_tweet, match_index-1, ResultStatus.ALL_OKAY) diff --git a/verifytweet/util/date_checker.py b/verifytweet/util/date_checker.py index bb7ea23..73ea735 100644 --- a/verifytweet/util/date_checker.py +++ b/verifytweet/util/date_checker.py @@ -31,14 +31,18 @@ def valid_date(processed_date): Returns: A Boolean indicating if tweet can be futher processed or not. 
""" - if not processed_date: + if not processed_date or not isinstance(processed_date, datetime): return False curr_date = datetime.now(timezone.utc) datetime_diff = curr_date - processed_date - if datetime_diff.days > app_config.TWEET_MAX_OLD: + if datetime_diff.days > 7: return False return True def format_for_date(tweet_datetime: datetime): + if not isinstance(tweet_datetime, datetime): + raise TypeError('Tweet date has to be type datetime') + if not tweet_datetime: + raise ValueError('Tweet date has to be a valid datetime object') return tweet_datetime.strftime('%Y-%m-%d') diff --git a/verifytweet/util/logging.py b/verifytweet/util/logging.py index 8fa08b8..63d51ab 100644 --- a/verifytweet/util/logging.py +++ b/verifytweet/util/logging.py @@ -22,10 +22,10 @@ from verifytweet.config.settings import app_config logger = logging.getLogger() -logger.setLevel(logging.INFO) +logger.setLevel(app_config.LOG_LEVEL) handler = logging.StreamHandler(sys.stdout) -handler.setLevel(logging.INFO) +handler.setLevel(app_config.LOG_LEVEL) web_formatter = logging.Formatter(u'%(asctime)s -- %(levelname)s -- %(message)s') cli_formatter = logging.Formatter(u'%(message)s') diff --git a/verifytweet/util/object_mapper.py b/verifytweet/util/object_mapper.py index 6c1a756..265aa7f 100644 --- a/verifytweet/util/object_mapper.py +++ b/verifytweet/util/object_mapper.py @@ -16,6 +16,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
+from twint.tweet import tweet from verifytweet.config.settings import app_config from verifytweet.util.logging import logger from verifytweet.util.result import ResultStatus @@ -29,8 +30,12 @@ def map_keys(tweet_obj): Returns: A dictionary contaning a mapping of members of tweet object """ - if not tweet_obj: - return (None, ResultStatus.MODULE_FAILURE) + if not isinstance(tweet_obj, tweet): + raise TypeError('Tweet object must be of type twint.tweet') + try: + id = tweet_obj.id + except AttributeError: + raise ValueError('Tweet object must be valid') return (dict({ "id": tweet_obj.id, "conversation_id": tweet_obj.conversation_id, diff --git a/verifytweet/util/uploader.py b/verifytweet/util/uploader.py index 37a17b8..aaca7ff 100644 --- a/verifytweet/util/uploader.py +++ b/verifytweet/util/uploader.py @@ -37,7 +37,7 @@ def save_to_disk(file_obj): raise ValueError('file obj cannot be empty') filename = secure_filename(file_obj.filename) if file_obj and allowed_file(filename): - saved_file_name = str(uuid.uuid4()) + '.' + \ + saved_file_name = str(uuid.uuid1()) + '.' + \ filename.rsplit('.', 1)[1].lower() saved_file_path = os.path.join(app_config.FILE_DIRECTORY, saved_file_name) diff --git a/verifytweet/util/validator.py b/verifytweet/util/validator.py index 95f9c8c..5473234 100644 --- a/verifytweet/util/validator.py +++ b/verifytweet/util/validator.py @@ -16,12 +16,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import numpy +from numpy import ndarray from verifytweet.config.settings import app_config from verifytweet.util.result import ResultStatus -def verify_validity(similarity_matrix): +def verify_validity(similarity_matrix: ndarray): """Verifies validity of a tweet in similarity matrix. Verifies validity of a tweet in similarity matrix, if it crosses @@ -33,8 +33,12 @@ def verify_validity(similarity_matrix): Returns: A Boolean representing validity of the tweet. 
""" - for row in similarity_matrix: - for column in row: - if column > app_config.SIMILARITY_THRESHOLD: - return (True, ResultStatus.ALL_OKAY) - return (False, ResultStatus.ALL_OKAY) \ No newline at end of file + if not isinstance(similarity_matrix, ndarray): + raise TypeError('Similarity matrix must type numpy.ndarray') + if not similarity_matrix.all(): + raise ValueError('Similarity matrix must be a valid numpy array') + row = similarity_matrix[0] + for column_index in range(1, row.shape[0]): + if row[column_index] > app_config.SIMILARITY_THRESHOLD: + return (True, column_index, ResultStatus.ALL_OKAY) + return (False, None, ResultStatus.ALL_OKAY) diff --git a/wsgi.py b/wsgi.py index 627cdf2..7fa6e39 100644 --- a/wsgi.py +++ b/wsgi.py @@ -24,6 +24,8 @@ import gunicorn.app.base from gunicorn.six import iteritems +os.environ["VERIFYTWEET_RUN_FOR_WEB"] = "true" + from verifytweet.config.settings import app_config from verifytweet.app import router