In [None]:
!pip install plotly==5.8.0
!pip install flair
!pip install pyyaml==5.4.1
!pip install pep8

In [None]:
### Import packages 
import json
import operator
import unicodedata
from urllib.parse import urljoin

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import spacy
from bs4 import BeautifulSoup
from flair.data import Sentence
from flair.models import TextClassifier
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from tqdm import tqdm

nlp=spacy.load('en_core_web_sm')
nltk.download('vader_lexicon')

  defaults = yaml.load(f)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Parameter 

In [None]:
### Parameter
### Landing page whose linked articles are downloaded further down
url = "https://www.aljazeera.com" + "/where/mozambique/"

### Methods

In [None]:
### Get the data from the URL 
def get_url_data(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch with an HTTP GET request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout `requests` can block indefinitely on an
        unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed or times out.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is deliberately not checked here: the body is parsed
    # whatever the status code, exactly as before.
    return BeautifulSoup(response.content, 'html.parser')

### Convert the text data to lower case
def convert_to_lower_case(dataset):
    """Lower-case the 'headline' and 'article' columns of `dataset` in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns 'headline' and 'article'. Both columns are
        overwritten; nothing is returned.
    """
    for column in ('headline', 'article'):
        dataset[column] = dataset[column].apply(lambda text: text.lower())
  
### Remove punctuation
def remove_punctuation(dataset):
    """Strip punctuation from the 'headline' and 'article' columns in place.

    Punctuation characters (Unicode categories starting with "P") are removed
    from the text; dashes (category "Pd") become a space so that hyphenated
    words split into their parts. Afterwards, as before, only purely
    alphabetic words are kept, so numbers and other non-letter tokens are
    still dropped.

    Previously a word was discarded entirely as soon as it touched any
    punctuation (e.g. "rises," vanished instead of becoming "rises").

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns 'headline' and 'article'. Both columns are
        overwritten; nothing is returned.
    """
    def remove_punctuation_from_text(input_text):
        cleaned_chars = []
        for char in input_text:
            category = unicodedata.category(char)
            if category == 'Pd':
                # Dash/hyphen: keep the word boundary it marks.
                cleaned_chars.append(' ')
            elif not category.startswith('P'):
                cleaned_chars.append(char)
        words = ''.join(cleaned_chars).split()
        return ' '.join(word for word in words if word.isalpha())

    for column in ('headline', 'article'):
        dataset[column] = dataset[column].apply(remove_punctuation_from_text)

### Correct the words 
def correct_words(dataset):
    """Spell-correct the 'headline' and 'article' columns of `dataset` in place.

    Every whitespace-separated word is passed through TextBlob's `correct()`
    on its own and the results are re-joined with single spaces.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns 'headline' and 'article'. Both columns are
        overwritten; nothing is returned.
    """
    def correct_text(input_text):
        corrected = (str(TextBlob(token).correct()) for token in input_text.split())
        return ' '.join(corrected)

    for column in ('headline', 'article'):
        dataset[column] = dataset[column].apply(correct_text)

### Reduce each word to its root form (lemma)
def lemmatize(dataset):
    """Replace every token in 'headline' and 'article' with its lemma, in place.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas
    of the resulting tokens are joined with single spaces.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns 'headline' and 'article'. Both columns are
        overwritten; nothing is returned.
    """
    def lematize_text(input_text):
        return ' '.join([token.lemma_ for token in nlp(input_text)])

    for column in ('headline', 'article'):
        dataset[column] = dataset[column].apply(lematize_text)

### removing stop words 
def remove_stopwords(dataset):
    """Drop English stop words from 'headline' and 'article', in place.

    Words are compared against spaCy's English STOP_WORDS set; the literal
    token '-PRON-' is removed as well (presumably a lemmatizer placeholder
    for pronouns - confirm against the spaCy version in use).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns 'headline' and 'article'. Both columns are
        overwritten; nothing is returned.
    """
    def remove_stopwords_from_text(input_text):
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        kept_words = []
        for word in input_text.split():
            if word in stopwords or word == '-PRON-':
                continue
            kept_words.append(word)
        return ' '.join(kept_words)

    for column in ('headline', 'article'):
        dataset[column] = dataset[column].apply(remove_stopwords_from_text)

### Code Flow

In [None]:
### Download and parse the landing page
main_page_soup = get_url_data(url=url)

In [None]:
### Collect every <a> tag carrying the 'u-clickable-card__link' class
News_links = main_page_soup.find_all('a', attrs={'class': 'u-clickable-card__link'})

In [None]:
### Turn each anchor's href into an absolute https://www.aljazeera.com URL.
### urljoin handles root-relative, relative and already-absolute hrefs, whereas
### plain concatenation is only correct for hrefs that start with "/".
### Anchors without a usable href are skipped instead of raising KeyError.
### Despite its name, top_ten_news holds every link found; the cap of ten is
### applied later, while the articles are downloaded.
top_ten_news = [
    urljoin("https://www.aljazeera.com/", link["href"])
    for link in News_links
    if link.get("href")
]

In [None]:
### Download the linked pages and keep the first MAX_ARTICLES that provide both
### an article body and a headline.
MAX_ARTICLES = 10
### NOTE(review): "css-1ck9wyi" looks like a generated class name - confirm it is stable.
ARTICLE_BODY_CLASS = 'wysiwyg wysiwyg--all-content css-1ck9wyi'

article = []
headline = []
count = 0

for link in tqdm(top_ten_news):
    news_data = get_url_data(link)
    info = news_data.find_all('div', class_=ARTICLE_BODY_CLASS)
    if len(info) == 0:
        continue  # no article body on this page: skip it

    # The headline is read from the JSON held in the page's first <script> tag.
    res = news_data.find('script')
    try:
        title = json.loads(res.contents[0])['headline']
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Missing/empty <script>, invalid JSON, or JSON without a 'headline':
        # skip the page so that `article` and `headline` stay the same length.
        continue

    article.append(info[0].get_text())
    headline.append(title)

    count += 1
    if count == MAX_ARTICLES:  # enough working articles collected
        break

 71%|███████▏  | 10/14 [00:04<00:01,  2.47it/s]


In [None]:
### Assemble the scraped headlines and article bodies into one table
df = pd.DataFrame(dict(headline=headline, article=article))

In [None]:
### Write the headline and article columns to News.json
raw_columns = ['headline', 'article']
df[raw_columns].to_json("News.json")

### Data cleaning

In [None]:
### Clean df in place: lower-case, remove punctuation, lemmatize, drop stop words
for cleaning_step in (convert_to_lower_case, remove_punctuation, lemmatize, remove_stopwords):
    cleaning_step(df)

In [None]:
### Join headline and article text, separated by one space, into a new column
df["news_data"] = df["headline"].add(" ").add(df["article"])

### Sentiment Analysis using SentimentIntensityAnalyzer 

In [None]:
### Three independent copies of df, one for each sentiment model used below
df_copy1, df_copy2, df_copy3 = (df.copy() for _ in range(3))

In [None]:
### Score each headline with NLTK's SentimentIntensityAnalyzer (compound score)
### and label it: below 0 -> 'neg', above 0 -> 'pos', anything else -> 'neu'.
### NOTE(review): only `headline` is scored here, while the Flair cell scores
### `news_data` - confirm that the comparison is intended.
sia = SentimentIntensityAnalyzer()
df_copy1["sentiment_score"] = df_copy1["headline"].apply(lambda x: sia.polarity_scores(x)["compound"])
# An explicit string default keeps np.select from mixing its implicit integer
# default (0) with the string choices.
df_copy1["sentiment"] = np.select(
    [df_copy1["sentiment_score"] < 0, df_copy1["sentiment_score"] > 0],
    ['neg', 'pos'],
    default='neu',
)

In [None]:
### Print a caption, then show the frame scored with SentimentIntensityAnalyzer
### through the cell's trailing expression.
print("SentimentIntensityAnalyzer")
df_copy1

SentimentIntensityAnalyzer


Unnamed: 0,headline,article,news_data,sentiment_score,sentiment
0,flood hit south province,heavy rain south africa force people flee home...,flood hit south province heavy rain south afri...,0.0,neu
1,cyclone gombe death toll rise,tropical cyclone gombe kill people hit mozambi...,cyclone gombe death toll rise tropical cyclone...,-0.5994,neg
2,mozambique announce new prime minister cabinet...,mozambique president filipe nyusi appoint new ...,mozambique announce new prime minister cabinet...,0.0,neu
3,african gas replace russian supply,nigeria february german chancellor olaf scholz...,african gas replace russian supply nigeria feb...,0.0,neu
4,dozen dead tropical storm ana southern africa,death toll storm strike southern african count...,dozen dead tropical storm ana southern africa ...,-0.6486,neg
5,southern africa bloc sadc extend mozambique mi...,southern african regional bloc sadc summit ext...,southern africa bloc sadc extend mozambique mi...,0.1779,pos
6,kagame rwandan work,rwandan president paul kagame arrive deploy so...,kagame rwandan work rwandan president paul kag...,0.0,neu
7,mozambique force recapture port city rebel,rwandan force deploy month help army battle re...,mozambique force recapture port city rebel rwa...,-0.1531,neg
8,rwanda deploy soldier cabo delgado,rwanda start deploy force mozambique help comb...,rwanda deploy soldier cabo delgado rwanda star...,0.0,neu
9,southern african nation agree deploy force moz...,southern african development community agree d...,southern african nation agree deploy force moz...,0.3612,pos


### Sentiment Analysis using TextBlob 

In [None]:
### Score each headline with TextBlob's polarity and label it:
### below 0 -> 'neg', above 0 -> 'pos', anything else -> 'neu'.
### NOTE(review): only `headline` is scored here, while the Flair cell scores
### `news_data` - confirm that the comparison is intended.
df_copy2["sentiment_score"] = df_copy2["headline"].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# An explicit string default keeps np.select from mixing its implicit integer
# default (0) with the string choices.
df_copy2["sentiment"] = np.select(
    [df_copy2["sentiment_score"] < 0, df_copy2["sentiment_score"] > 0],
    ['neg', 'pos'],
    default='neu',
)

In [None]:
### Print a caption, then show the frame scored with TextBlob through the
### cell's trailing expression.
print("TextBlob")
df_copy2

TextBlob


Unnamed: 0,headline,article,news_data,sentiment_score,sentiment
0,flood hit south province,heavy rain south africa force people flee home...,flood hit south province heavy rain south afri...,0.0,neu
1,cyclone gombe death toll rise,tropical cyclone gombe kill people hit mozambi...,cyclone gombe death toll rise tropical cyclone...,0.0,neu
2,mozambique announce new prime minister cabinet...,mozambique president filipe nyusi appoint new ...,mozambique announce new prime minister cabinet...,0.136364,pos
3,african gas replace russian supply,nigeria february german chancellor olaf scholz...,african gas replace russian supply nigeria feb...,0.0,neu
4,dozen dead tropical storm ana southern africa,death toll storm strike southern african count...,dozen dead tropical storm ana southern africa ...,-0.1,neg
5,southern africa bloc sadc extend mozambique mi...,southern african regional bloc sadc summit ext...,southern africa bloc sadc extend mozambique mi...,0.0,neu
6,kagame rwandan work,rwandan president paul kagame arrive deploy so...,kagame rwandan work rwandan president paul kag...,0.0,neu
7,mozambique force recapture port city rebel,rwandan force deploy month help army battle re...,mozambique force recapture port city rebel rwa...,0.0,neu
8,rwanda deploy soldier cabo delgado,rwanda start deploy force mozambique help comb...,rwanda deploy soldier cabo delgado rwanda star...,0.0,neu
9,southern african nation agree deploy force moz...,southern african development community agree d...,southern african nation agree deploy force moz...,0.0,neu


In [None]:
### Sentiment analysis using Flair's pre-trained 'en-sentiment' classifier.
### The classifier has its own name so that `sia` keeps referring to the
### SentimentIntensityAnalyzer created earlier.
### NOTE(review): this model is fed `news_data`, whereas the VADER and TextBlob
### cells score `headline` only - confirm that the comparison is intended.
flair_classifier = TextClassifier.load('en-sentiment')


def flair_prediction(x):
    """Classify one text with the Flair sentiment model.

    Parameters
    ----------
    x : str
        Text to classify.

    Returns
    -------
    str
        'pos' for a POSITIVE label, 'neg' for a NEGATIVE label, and 'neu'
        for any other label or when the model attached no label at all.
    """
    sentence = Sentence(x)
    flair_classifier.predict(sentence)
    # Guard against sentences that received no label (presumably the case
    # for empty text - confirm); indexing labels[0] would raise IndexError.
    if not sentence.labels:
        return "neu"
    label_value = sentence.labels[0].value
    return {"POSITIVE": "pos", "NEGATIVE": "neg"}.get(label_value, "neu")


df_copy3["sentiment"] = df_copy3["news_data"].apply(flair_prediction)

2022-06-09 02:04:37,481 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


In [None]:
### Print a caption, then show the frame labelled by flair_prediction through
### the cell's trailing expression.
print("flair_prediction")
df_copy3

flair_prediction


Unnamed: 0,headline,article,news_data,sentiment
0,flood hit south province,heavy rain south africa force people flee home...,flood hit south province heavy rain south afri...,neg
1,cyclone gombe death toll rise,tropical cyclone gombe kill people hit mozambi...,cyclone gombe death toll rise tropical cyclone...,neg
2,mozambique announce new prime minister cabinet...,mozambique president filipe nyusi appoint new ...,mozambique announce new prime minister cabinet...,neg
3,african gas replace russian supply,nigeria february german chancellor olaf scholz...,african gas replace russian supply nigeria feb...,pos
4,dozen dead tropical storm ana southern africa,death toll storm strike southern african count...,dozen dead tropical storm ana southern africa ...,neg
5,southern africa bloc sadc extend mozambique mi...,southern african regional bloc sadc summit ext...,southern africa bloc sadc extend mozambique mi...,pos
6,kagame rwandan work,rwandan president paul kagame arrive deploy so...,kagame rwandan work rwandan president paul kag...,neg
7,mozambique force recapture port city rebel,rwandan force deploy month help army battle re...,mozambique force recapture port city rebel rwa...,pos
8,rwanda deploy soldier cabo delgado,rwanda start deploy force mozambique help comb...,rwanda deploy soldier cabo delgado rwanda star...,pos
9,southern african nation agree deploy force moz...,southern african development community agree d...,southern african nation agree deploy force moz...,neg


### Visualization

In [None]:
### Number of rows per sentiment label assigned by SentimentIntensityAnalyzer
x = df_copy1.loc[:, 'sentiment'].value_counts()

In [None]:
### Bar chart of the label counts held in `x` (set from df_copy1 in the cell above).
### Title and axis labels let the chart be told apart from the other two.
fig = px.bar(x, title="Sentiment distribution - SentimentIntensityAnalyzer (headlines)")
fig.update_layout(xaxis_title="sentiment", yaxis_title="number of articles")
fig.show()

In [None]:
### Bar chart of the number of rows per sentiment label assigned by TextBlob.
### Title and axis labels let the chart be told apart from the other two.
x = df_copy2['sentiment'].value_counts()
fig = px.bar(x, title="Sentiment distribution - TextBlob (headlines)")
fig.update_layout(xaxis_title="sentiment", yaxis_title="number of articles")
fig.show()

In [None]:
### Bar chart of the number of rows per sentiment label assigned by Flair.
### Title and axis labels let the chart be told apart from the other two.
x = df_copy3['sentiment'].value_counts()
fig = px.bar(x, title="Sentiment distribution - Flair (headline + article)")
fig.update_layout(xaxis_title="sentiment", yaxis_title="number of articles")
fig.show()