# MSCA 32018 Natural Language Processing and Cognitive Computing
## Final Project - Targeted (Entity) Sentiment Identification


Shijia Huang

-----

In [1]:
#!pip install -r requirements.txt

In [2]:
# Import basic libraries
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [4]:
# Import NLP libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from pprint import pprint
import string
from rake_nltk import Rake

import eli5
import pickle

import spacy
from spacy import displacy
from spacy.util import minibatch, compounding
spacy.prefer_gpu()
print(spacy.__version__)

import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim as gensimvis
#import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

2023-05-21 19:51:12.240133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.5.2


In [5]:
# Import sklearn libraries
import sklearn
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [6]:
import multiprocessing as mp

num_processors = mp.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 12


## Read Data

In [7]:
%%time

# read in the sentiment data
path = "gs://nlp-final-project-data/data/"
df_news = pd.read_parquet(path + 'news_sentiment.parquet', engine='pyarrow')
df_news.shape

CPU times: user 1min 15s, sys: 30.8 s, total: 1min 46s
Wall time: 2min 57s


(154283, 11)

In [8]:
df_news.head(2)

Unnamed: 0,id,date,cleaned title,cleaned text,title_tokens,title_lemmatized,text_tokens,text_lemmatized,title_keywords,text_keywords,sentiment
0,1,2020-02-27,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot,"Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot admin Latest posts by admin see all Mansplaining in conferences: How can we get him to forestall February 27, 2020 Coronavirus Could Explode in the U.S. Overnight Like it Did in Italy February 27, 2020 Levi Strauss marks the next phase in corporate paid leave policies February 27, 2020 Scientists who designed an artificially clever robotic that helped youngsters with autism spice up their ...","[children, autism, saw, learning, social, skills, boosted, playing, ai, robot, children, autism, saw, learning, social, skills, boosted, playing, ai, robot, children, autism, saw, learning, social, skills, boosted, playing, ai_robot]","[child, autism, see, learn, social, skill, boost, play, robot, child, autism, see, learn, social, skill, boost, play, robot, child, autism, see, learn, social, skill, boost, play, ai_robot]","[children, autism, saw, learning, social, skills, boosted, playing, ai, robot, admin, latest, posts, admin, see, mansplaining, conferences, get, forestall, february, coronavirus, could, explode, overnight, like, italy, february, levi, strauss, marks, next, phase, corporate, paid, leave, policies, february, scientists, designed, artificially, clever, robotic, helped, youngsters, autism, spice, studying, social, talents, hope, era, may, future, help, others, developmental, dysfunction, learn, ...","[child, autism, see, learn, social, skill, boost, play, robot, late, post, admin, see, mansplaining, conference, get, explode, overnight, mark, next, phase, corporate, pay, leave, policy, scientist, design, artificially, clever, robotic, help, youngster, autism, spice, study, social, talent, era, future, help, other, developmental, dysfunction, learn, notice, youngster, gentle, average, autism, take, domestic, s, refer, socially, assistive, robotic, name, kiwi, month, accord, commentary, way...","[social, skill, see, play, learn, child, boost, autism, robot, ai_robot]","[robotic, youngster, kid, child, kiwi, market, autism, learn, crew, talent]",5
1,2,2021-03-26,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus","Forget ML, AI and Industry 4.0 obsolescence should be your focus The world entered a new era of accelerated transformation in the last eighteen months that will continue to evolve and press forward for years to come. Most businesses are playing catchup trying to make sense of a new timeline where the ten years that had been set aside for careful planning and implementation of what was coming up next no longer exists. The next is happening now and, regardless of your industry or seniority, t...","[forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus]","[forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus, forget, ml, ai, industry, obsolescence, focus]","[forget, ml, ai, industry, obsolescence, focus, world, entered, new, era, accelerated, transformation, last, eighteen, months, continue, evolve, press, forward, years, come, businesses, playing, catchup, trying, make, sense, new, timeline, ten, years, set, aside, careful, planning, implementation, coming, next, longer, exists, next, happening, regardless, industry, seniority, status, quo, shifted, better, face, back, invited, attend, pompous, meeting, london, brazilian, embassy, along, selec...","[forget, ai, industry, obsolescence, focus, world, enter, new, era, accelerate, transformation, last, month, continue, evolve, press, forward, year, come, business, play, catchup, try, make, sense, new, timeline, year, set, aside, careful, planning, implementation, come, next, long, exist, next, happen, regardless, industry, seniority, status, quo, shift, well, face, back, invite, attend, pompous, meeting, brazilian, embassy, select, lead, name, oil, energy, industry, get, update, go, happen...","[obsolescence, ml, industry, forget, focus, ai]","[electronic, come, card, industry, repair, new, system, require, test, business]",4


In [9]:
# remove links and special characters
def remove_links(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'&\S+', '', text)
    text =  re.sub(r'\S+.com', '', text)
    text =  re.sub(r'\S+.ca', '', text)
    text =  re.sub(r'\S+.org', '', text)
    text = re.sub(r'[^a-zA-Z0-9 @ . , : - _]', '', text)
    return text

In [10]:
# clean the text
df_news['cleaned text'] = df_news['cleaned text'].apply(lambda x: remove_links(x))

In [11]:
### SAMPLE DATA
df_news = df_news.sample(frac=0.01, random_state=42)
df_news.shape

(1543, 11)

In [12]:
%%time

# read in the entities data
path = "gs://nlp-final-project-data/result/"
AI_entities = pd.read_csv(path + "AI_entities.csv")
ORG_entities = pd.read_csv(path + "ORG_entities.csv")

CPU times: user 17.6 ms, sys: 61.4 ms, total: 79 ms
Wall time: 684 ms


In [13]:
AI_entities

Unnamed: 0,Entities,Labels,count
0,ChatGPT,ORG,136885
1,ML,ORG,24115
2,Machine Learning,ORG,17099
3,Bard,ORG,14528
4,GPT3,ORG,8427
5,GPT,ORG,7565
6,AIgenerated,ORG,6998
7,Bing,PERSON,8556
8,Vectorspace AI,PERSON,1784
9,Ernie Bot,PERSON,1705


In [14]:
ORG_entities

Unnamed: 0,Entities,Labels,count
0,Google,ORG,80971
1,Microsoft,ORG,67475
2,OpenAI,ORG,32015
3,IBM,ORG,25276
4,Facebook,ORG,19353
5,NVIDIA,ORG,19219
6,Amazon,ORG,17679
7,Googles,ORG,14072
8,Apple,ORG,12477
9,Twitter,ORG,11906


In [12]:
import spacy.cli
spacy.cli.download("en_core_web_lg")

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 587.7/587.7 MB 1.8 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [24]:
from utils import process_text_sen_sentiment
nlp = spacy.load('en_core_web_lg')

In [25]:
# function to extract entities from news text with sentence segmentation and sentiment
def extract_entities_sentiment(df):
    with mp.Pool() as pool:
        results = pool.imap_unordered(process_text_sen_sentiment, df['cleaned text'], chunksize=2000)
        for i, ents in enumerate(results):
            for ent in ents:
                yield df.iloc[i]['id'], ent[0], ent[1], ent[2], ent[3]

In [26]:
%%time

ner_sentiment_df = pd.DataFrame(extract_entities_sentiment(df_news), columns=['id', 'Entities', 'Labels', 'Sentiment', 'Subjectivity'])
ner_sentiment_df

CPU times: user 7.72 s, sys: 409 ms, total: 8.13 s
Wall time: 6min 47s


Unnamed: 0,id,Entities,Labels,Sentiment,Subjectivity
0,6559,Social GoodDeepali KhannaContributorOpinions,LOC,0.053333,0.091667
1,6559,Forbes Contributors,ORG,0.053333,0.091667
2,6559,AsiaI,ORG,0.053333,0.091667
3,6559,FacebookShare,ORG,0.053333,0.091667
4,6559,TwitterShare,ORG,0.053333,0.091667
...,...,...,...,...,...
120901,185558,ChatGPT,ORG,-0.031250,0.435417
120902,185558,Italys,GPE,-0.031250,0.435417
120903,185558,FBI,ORG,-0.031250,0.435417
120904,185558,headquartersCanada,PERSON,-0.031250,0.435417


In [27]:
%%time

# save dataframe to csv
path = "gs://nlp-final-project-data/result/"
ner_sentiment_df.to_csv(path + 'ner_sentiment_df.csv', index=False)

CPU times: user 524 ms, sys: 82.1 ms, total: 606 ms
Wall time: 5.67 s


## ORG Targeted Sentiment Analysis

## AI Solutions Targeted Sentiment Analysis