In [1]:
! pip install flask-ngrok
! pip install Flask-RESTful
! pip install torch==1.5.0
! pip install transformers==2.8.0



In [2]:
! pip install gdown



In [3]:
import gdown
from pathlib import Path
import requests

Path("assets").mkdir(exist_ok=True)

gdown.download(
    "https://drive.google.com/uc?id=1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA",
    "assets/model_state_dict.bin", quiet=False
)

Downloading...
From: https://drive.google.com/uc?id=1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA
To: /content/assets/model_state_dict.bin
100%|██████████| 433M/433M [00:01<00:00, 239MB/s]


'assets/model_state_dict.bin'

In [4]:
import json
from torch import nn
from transformers import BertModel

config = {
    "BERT_MODEL": "bert-base-cased",
    "PRE_TRAINED_MODEL": "assets/model_state_dict.bin",
    "CLASS_NAMES": [
        "negative",
        "neutral",
        "positive"
    ],
    "MAX_SEQUENCE_LEN": 160
}


class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(config["BERT_MODEL"])
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(pooled_output)
        return self.out(output)


In [5]:
import json
import torch
import torch.nn.functional as F
from transformers import BertTokenizer

class Model:
    def __init__(self):

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.tokenizer = BertTokenizer.from_pretrained(config["BERT_MODEL"])

        classifier = SentimentClassifier(len(config["CLASS_NAMES"]))
        classifier.load_state_dict(
            torch.load(config["PRE_TRAINED_MODEL"], map_location=self.device)
        )
        classifier = classifier.eval()
        self.classifier = classifier.to(self.device)

    def predict(self, text):
        encoded_text = self.tokenizer.encode_plus(
            text,
            max_length=config["MAX_SEQUENCE_LEN"],
            add_special_tokens=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids = encoded_text["input_ids"].to(self.device)
        attention_mask = encoded_text["attention_mask"].to(self.device)

        with torch.no_grad():
            probabilities = F.softmax(self.classifier(input_ids, attention_mask), dim=1)
        confidence, predicted_class = torch.max(probabilities, dim=1)
        predicted_class = predicted_class.cpu().item()
        probabilities = probabilities.flatten().cpu().numpy().tolist()
        return (
            config["CLASS_NAMES"][predicted_class],
            confidence,
            dict(zip(config["CLASS_NAMES"], probabilities)),
        )


model = Model()


def get_model():
    return model

In [6]:
! pip install mongoengine==0.23.1
! pip install dnspython==2.1.0
! pip install pymongo==3.12.1



In [7]:
from mongoengine import connect, disconnect

In [8]:
database_connection_params = {
    'connection_string': 'mongodb+srv://dataminingadmin:fall2021project@cluster1.ngjps.mongodb.net/cnbc',
    'db_name': 'cnbc',
    'user_name': 'dataminingadmin',
    'password': 'fall2021project',
    'collection_name': 'news_articles'}

In [9]:
# connect to database connection through mongoengine
connect(db=database_connection_params['db_name'],
        username=database_connection_params['user_name'],
        password=database_connection_params['password'],
        host=database_connection_params['connection_string'])

MongoClient(host=['cluster1-shard-00-00.ngjps.mongodb.net:27017', 'cluster1-shard-00-02.ngjps.mongodb.net:27017', 'cluster1-shard-00-01.ngjps.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-aoug1h-shard-0', ssl=True, read_preference=Primary())

In [10]:
from mongoengine import Document, StringField, \
    URLField, ListField, DictField


# News Article Document Class
class NewsArticles(Document):
    source_name = StringField(required=True)
    article_title = StringField(required=True)
    article_authors = ListField()
    article_published_date = StringField()
    article_text = StringField(required=True)
    images_link = StringField()
    video_link = ListField()
    article_summary = StringField(required=True)
    article_keywords = ListField()
    article_url = URLField(required=True)
    meta = {'allow_inheritance': True}


# Processed News Article Document Class
class ProcessedNewsArticle(Document):
    cleaned_source_name = StringField(required=True)
    cleaned_article_title = StringField(required=True)
    cleaned_article_authors = ListField()
    cleaned_article_published_date = StringField()
    cleaned_article_text = StringField(required=True)
    cleaned_images_link = StringField()
    cleaned_video_link = ListField()
    cleaned_article_summary = StringField(required=True)
    cleaned_article_keywords = ListField()
    cleaned_article_url = URLField(required=True)
    cleaned_recognized_entity = DictField()
    text_blob_sentiment = DictField()
    vader_sentiment = DictField()
    flair_sentiment = DictField()
    transformers_sentiment = DictField()
    overall_sentiment = StringField()
    overall_article_keywords = ListField()
    overall_article_keywords_dict = DictField()
    meta = {'allow_inheritance': True}



In [11]:
import time
import os
import json
import concurrent

def sentiment_funtion(document):
  current_document = ProcessedNewsArticle.objects(cleaned_article_url=document['article_url']).first()
  if len(current_document.transformers_sentiment) == 0:
    model = Model()
    sentiment, confidence, probabilities = model.predict(document['article_text'])
    
    prediction = {
                  "sentiment": sentiment,
                  "confidence": str(confidence),
                  "probabilities": probabilities
                  }

    current_document.transformers_sentiment = prediction
    current_document.save()  
    print(prediction) 

In [12]:
request_data = []

for document in NewsArticles.objects[45000:50000]:
  request_data.append(document)
  

In [None]:
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
  result = executor.map(sentiment_funtion, request_data)

{'sentiment': 'positive', 'confidence': "tensor([0.9999], device='cuda:0')", 'probabilities': {'negative': 1.829457687563263e-05, 'neutral': 5.882987170480192e-05, 'positive': 0.9999228715896606}}
{'sentiment': 'negative', 'confidence': "tensor([0.9997], device='cuda:0')", 'probabilities': {'negative': 0.9997181296348572, 'neutral': 0.00017941888654604554, 'positive': 0.000102474499726668}}
{'sentiment': 'negative', 'confidence': "tensor([0.9999], device='cuda:0')", 'probabilities': {'negative': 0.9998505115509033, 'neutral': 9.369401959702373e-05, 'positive': 5.583728852798231e-05}}
{'sentiment': 'positive', 'confidence': "tensor([0.9859], device='cuda:0')", 'probabilities': {'negative': 0.013880875892937183, 'neutral': 0.00020820698409806937, 'positive': 0.9859108924865723}}
{'sentiment': 'positive', 'confidence': "tensor([0.9999], device='cuda:0')", 'probabilities': {'negative': 2.5938388716895133e-05, 'neutral': 5.468544259201735e-05, 'positive': 0.9999192953109741}}
{'sentiment': 