# Part 1 (Sentiment analysis):

In [None]:
#vader sentiment analysis package, transformers, pytorch
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
import torch
import nltk

In [None]:
def sentiment_sent(sentence):
    VSA = SentimentIntensityAnalyzer()
    sentiment = VSA.polarity_scores(sentence)
    label ='P' if sentiment['compound'] >= 0.05 else ('N' if sentiment['compound'] <= -0.05 else None) #when -0.05<compound<0.05, it is neutral
    if label:
        with open('Sentiment Analysis.txt','a') as f:
            f.write(f'{sentence} {label}\n')

In [None]:
#prepare a list of 100 sentences,compare the results of Naive Bayes/Vader and the finetuned BERT model
with open('en.txt','r',encoding='utf8') as f:
    content = f.read()
content = (' '.join(content.split())).replace('\n',' ')
sent = nltk.sent_tokenize(content)
sample_sent = sent[1:101]

# BERT Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sentiment_pipeline = pipeline("sentiment-analysis",model="distilbert-base-uncased-finetuned-sst-2-english",device=device)
def BERT_sentiment(text):
    sentiment = sentiment_pipeline(text)
    label = sentiment[0]['label']
    return 'P' if label == 'POSITIVE' else 'N'

#Vader sentiment
def Vader_sentiment(text):
    VSA = SentimentIntensityAnalyzer()
    sentiment = VSA.polarity_scores(text)
    return 'P' if sentiment['compound'] >= 0.05 else ('N' if sentiment['compound'] <= -0.05 else 'Neutral')


In [50]:
BERT_analysis = [BERT_sentiment(sentence) for sentence in sample_sent]
Vader_analysis = [Vader_sentiment(sentence) for sentence in sample_sent]
with open('sent_comp.txt','a') as f:
    f.write('Sentence, ResNB/Vader, ResBERT\n')
    for i in range(len(sample_sent)):
        f.write(f'"{sample_sent[i]}",{Vader_analysis[i]},{BERT_analysis[i]}\n')

In [None]:
#Calculate how often the two models agree
agree, total = 0, 0
for i in range(len(sample_sent)):
    if Vader_analysis[i] == 'Neutral':
        continue
    if Vader_analysis[i] == BERT_analysis[i]:
        agree += 1
    total += 1

print(f'Number of agreement: {agree}')
print(f'Ratio of Agreement: {agree/len(sample_sent)}')
print(f'Ratio of Agreement(Excluding Neutral Results): {agree/total}')


Number of agreement: 51
Ratio of Agreement: 0.51
Ratio of Agreement(Excluding Neutral Results): 0.6375


In [None]:
#check which model provide the right response
agree_cases = []
diff_cases = []
for i in range(len(sample_sent)):
    if Vader_analysis[i] == 'Neutral':
        continue
    if Vader_analysis[i] == BERT_analysis[i]:
        agree_cases.append((sample_sent[i],Vader_analysis[i],BERT_analysis[i]))
    else:
        diff_cases.append((sample_sent[i],Vader_analysis[i],BERT_analysis[i]))

In [17]:
import random
print('Samples of Agree Case: ')
random.sample(agree_cases,5)

Samples of Agree Case: 


[('Lucy Graham was not looking at Sir Michael, but straight out into the misty twilight and dim landscape far away beyond the little garden.',
  'P',
  'P'),
 ('To the right there were the kitchen gardens, the fish-pond, and an orchard bordered by a dry moat, and a broken ruin of a wall, in some places thicker than it was high, and everywhere overgrown with trailing ivy, yellow stonecrop, and dark moss.',
  'N',
  'N'),
 ('But this reference was so satisfactory that none other was needed, and Miss Lucy Graham was received by the surgeon as the instructress of his daughters.',
  'P',
  'P'),
 ('What had been his love for his first wife but a poor, pitiful, smoldering spark, too dull to be extinguished, too feeble to burn?',
  'N',
  'N'),
 ('Often in the cool of the evening Sir Michael Audley would stroll up and down smoking his cigar, with his dogs at his heels, and his pretty young wife dawdling by his side; but in about ten minutes the baronet and his companion would grow tired of th

In [20]:
print('Samples of Different Judgement Case: ')
random.sample(diff_cases,5)

Samples of Different Judgement Case: 


[('The truth was that Lady Audley had, in becoming the wife of Sir Michael, made one of those apparently advantageous matches which are apt to draw upon a woman the envy and hatred of her sex.',
  'N',
  'P'),
 ('"I scarcely think there is a greater sin, Lucy," he said, solemnly, "than that of a woman who marries a man she does not love.',
  'N',
  'P'),
 ('It pained him too much to believe for a moment that any one so lovely and innocent could value herself against a splendid house or a good old title.',
  'P',
  'N'),
 ('"You unlucky, my dear!"', 'P', 'N'),
 ('There was nothing whatever in her manner that betrayed the shallow artifices employed by a woman who wishes to captivate a rich man.',
  'P',
  'N')]

##### Analysis
1. 'Of course it would be a magnificent match; he has a splendid income, and is one of the most generous of men.'  
    
    When the sentence is simple and straightforward, both models make the right analysis that it is positive. The sentence is straight-forward and contains words explicitly express compliment. So it would be easy for both models to make a right judgement.

Even if the models agree, it does not mean they provide the right analysis. For example, in sentence:

2. 'A spot in which peace seemed to have taken up her abode, setting her soothing hand on every tree and flower, on the still ponds and quiet alleys, the shady corners of the old-fashioned rooms, the deep window-seats behind the painted glass, the low meadows and the stately avenues—ay, even upon the stagnant well, which, cool and sheltered as all else in the old place, hid itself away in a shrubbery behind the gardens, with an idle handle that was never turned and a lazy rope so rotten that the pail had broken away from it, and had fallen into the water.'  
    
    Both models think it is negative, but the sentiment of this sentence is positive. Both models have limitations in dealing with complex sentences in abstract or ambiguous context.

3. 'Of course, in such a house there were secret chambers; the little daughter of the present owner, Sir Michael Audley, had fallen by accident upon the discovery of one.'   
    
    Both models think this sentence is negative. However, it is only a neutral description. They might have mistakenly associated 'secret chambers' with 
some negative associations.

When the two models make a different judgement:

4. ‘It had been of good service in its time, no doubt; and busy nuns have perhaps drawn the cool water with their own fair hands; but it had fallen into disuse now, and scarcely any one at Audley Court knew whether the spring had dried up or not.’  
    
    The Vader thinks the sentence is positive while the finetuned BERT model thinks it is negative. In this case, the finetuned BERT model makes the right judgement. The Vader model mainly relies on the emotions of each words in the sentences with semantic rules.Therefore, it may decide the sentence is positive when the sentence contains more obviously positive words in the sentence. The finetuned BERT model analyse sentiment based on the context of the whole sentence and thus identify the emotional change in the sentence.

5. ‘The truth was that Lady Audley had, in becoming the wife of Sir Michael, made one of those apparently advantageous matches which are apt to draw upon a woman the envy and hatred of her sex.’

    The Vader thinks the sentence is negative while the finetuned BERT model thinks it is positive. From my perspective, the Vader made the right judgement. As the sentence contains 'envy' and 'hatred', which clearly connected to strong negative emotions, thus the Vader tends to judge it as negative. The BERT model judged it as positive as it understand context and its judgement is influenced by the 'apparently advantageous matches'.

# Part 2 (Text classification)
perform text classification using scikit-learn
check whether named entities can positively contribute to a better classification performance.

In [21]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

cats = ['alt.atheism', 'sci.space', 'comp.graphics']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)

len(newsgroups_test['data']),len(newsgroups_train['data'])

(1102, 1657)

In [22]:
#This whole cell is just to get a smaller subset of the corpus

whole_train_instances = newsgroups_train['data']
whole_y_train = [newsgroups_train['target_names'][el] for el in newsgroups_train['target']]

train_instances, _, y_train, _ = train_test_split(whole_train_instances, whole_y_train, train_size=0.15)

whole_test_instances = newsgroups_test['data']
whole_y_test = [newsgroups_test['target_names'][el] for el in newsgroups_test['target']]

test_instances, _, y_test, _ = train_test_split(whole_test_instances, whole_y_test, train_size=0.15)

len(train_instances), len(y_train), len(test_instances), len(y_test)

(248, 248, 165, 165)

In [None]:
#get a smaller subset of the corpus, as this part can be quite computationally expensive.

from sklearn.model_selection import train_test_split

whole_train_instances = newsgroups_train['data']
whole_y_train = [newsgroups_train['target_names'][el] for el in newsgroups_train['target']]

train_instances, _, y_train, _ = train_test_split(whole_train_instances, whole_y_train, train_size=0.25)

whole_test_instances = newsgroups_test['data']
whole_y_test = [newsgroups_test['target_names'][el] for el in newsgroups_test['target']]

test_instances, _, y_test, _ = train_test_split(whole_test_instances, whole_y_test, train_size=0.15)

len(train_instances), len(y_train), len(test_instances), len(y_test)

(414, 414, 165, 165)

In [None]:
#Vectorize the news articles and train a Logistic Regression model on the training data.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vectorizer = CountVectorizer()
whole_train_countmatrix = vectorizer.fit_transform(whole_train_instances)
whole_test_countmatrix = vectorizer.transform(whole_test_instances)

lr_model = LogisticRegression(random_state=442).fit(whole_train_countmatrix, whole_y_train)
y_pred = lr_model.predict(whole_test_countmatrix)

print("-----------------Classification Report-----------------")
print(f"{classification_report(whole_y_test, y_pred,target_names = newsgroups_train['target_names'])}")

-----------------Classification Report-----------------
               precision    recall  f1-score   support

  alt.atheism       0.96      0.92      0.94       319
comp.graphics       0.92      0.96      0.94       389
    sci.space       0.95      0.94      0.94       394

     accuracy                           0.94      1102
    macro avg       0.94      0.94      0.94      1102
 weighted avg       0.94      0.94      0.94      1102



In [45]:
#using subset(in order to compare with the third exercise)
vectorizer2 = CountVectorizer()
train_instances_countmatrix = vectorizer.fit_transform(train_instances)
test_instances_countmatrix = vectorizer.transform(test_instances)

lr_model_sub = LogisticRegression(random_state=442).fit(train_instances_countmatrix, y_train)
y_pred2 = lr_model_sub.predict(test_instances_countmatrix)

print("-------------Classification Report(subset)-------------")
print(f"{classification_report(y_test, y_pred2,target_names = newsgroups_train['target_names'])}")

-------------Classification Report(subset)-------------
               precision    recall  f1-score   support

  alt.atheism       0.93      0.88      0.90        48
comp.graphics       0.86      0.89      0.88        55
    sci.space       0.86      0.87      0.86        62

     accuracy                           0.88       165
    macro avg       0.88      0.88      0.88       165
 weighted avg       0.88      0.88      0.88       165



In [None]:
#normalize the frequencies with the inverse document frequency
#using the whole train and test set
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
whole_train_tfidf = tfidf_vec.fit_transform(whole_train_instances)
whole_test_tfidf = tfidf_vec.transform(whole_test_instances)


lr_model2 = LogisticRegression(random_state=442).fit(whole_train_tfidf, whole_y_train)
y_pred_tfidf = lr_model2.predict(whole_test_tfidf)

print("-----------------Classification Report-----------------")
print(f"{classification_report(whole_y_test, y_pred_tfidf,target_names = newsgroups_train['target_names'])}")


-----------------Classification Report-----------------
               precision    recall  f1-score   support

  alt.atheism       0.98      0.92      0.95       319
comp.graphics       0.91      0.96      0.93       389
    sci.space       0.95      0.94      0.95       394

     accuracy                           0.94      1102
    macro avg       0.94      0.94      0.94      1102
 weighted avg       0.94      0.94      0.94      1102



In [47]:
#using subset
tfidf_vec2 = TfidfVectorizer()
train_instances_tfidf = tfidf_vec2.fit_transform(train_instances)
test_instances_tfidf = tfidf_vec2.transform(test_instances)

lr_model_sub2 = LogisticRegression(random_state=442).fit(train_instances_tfidf, y_train)
y_pred_tfidf2 = lr_model_sub2.predict(test_instances_tfidf)

print("-------------Classification Report(subset)-------------")
print(f"{classification_report(y_test, y_pred_tfidf2,target_names = newsgroups_train['target_names'])}")

-------------Classification Report(subset)-------------
               precision    recall  f1-score   support

  alt.atheism       0.96      0.94      0.95        48
comp.graphics       0.92      0.89      0.91        55
    sci.space       0.91      0.95      0.93        62

     accuracy                           0.93       165
    macro avg       0.93      0.93      0.93       165
 weighted avg       0.93      0.93      0.93       165



In [None]:
#extract named entities and check whether this improve results
import spacy
model = spacy.load('en_core_web_sm')

def gen_named_entity(text):
    doc = model(text)
    name_entity = [ent.text.replace(" ","_") for ent in doc.ents if len(ent.text.split())>1]
    return " ".join(name_entity)

train_instances_processed = [doc + " " + gen_named_entity(doc) for doc in train_instances]
test_instances_processed = [doc + " " + gen_named_entity(doc) for doc in test_instances]

vec = CountVectorizer()
train_instances_vec = vec.fit_transform(train_instances_processed)
test_instances_vec = vec.transform(test_instances_processed)

lr_model3 = LogisticRegression(random_state=442).fit(train_instances_vec, y_train)
y_pred_vec = lr_model3.predict(test_instances_vec)

print("-----------------Classification Report-----------------")
print(f"{classification_report(y_test, y_pred_vec,target_names = newsgroups_train['target_names'])}")


-----------------Classification Report-----------------
               precision    recall  f1-score   support

  alt.atheism       0.93      0.90      0.91        48
comp.graphics       0.86      0.89      0.88        55
    sci.space       0.87      0.87      0.87        62

     accuracy                           0.88       165
    macro avg       0.89      0.89      0.89       165
 weighted avg       0.89      0.88      0.89       165

