In [None]:
!pip install newspaper3k
!pip install feedparser
!pip install nltk
!pip install plotly 

### Web Scraping

In [None]:
import newspaper
import feedparser
import pandas as pd

def scrape_news_from_feed(feed_url, keyword="Bitcoin"):
    """Collect the articles of one RSS/Atom feed whose text mentions ``keyword``.

    Every entry of the feed is downloaded and parsed with ``newspaper``.
    Entries that have no link, or whose page cannot be downloaded or parsed,
    are skipped so that a single bad article does not abort the whole scrape.

    Parameters
    ----------
    feed_url : str
        URL of the feed to read.
    keyword : str, optional
        Case-sensitive term that must occur in the article text for the
        article to be kept. Defaults to ``"Bitcoin"``.

    Returns
    -------
    list of dict
        One dict per matching article with the keys ``'title'``, ``'author'``
        (the list from ``article.authors``), ``'publish_date'``, ``'link'``
        and ``'content'``.
    """
    articles = []
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        # some feed entries carry no link at all; there is nothing to download
        link = getattr(entry, "link", None)
        if not link:
            continue

        # create a newspaper article object
        article = newspaper.Article(link)
        try:
            # download and parse the article
            article.download()
            article.parse()
        except newspaper.ArticleException as exc:
            # best effort: report the failure and carry on with the next entry
            print(f"Skipping {link}: {exc}")
            continue

        # keep the article only if it mentions the target term
        if keyword in article.text:
            articles.append({
                'title': article.title,
                'author': article.authors,
                'publish_date': article.publish_date,
                'link': article.url,
                'content': article.text
            })
    return articles

# RSS feed URLs to scrape (the source is noted where the URL does not reveal it)
feed_urls = [
    'https://www.coindesk.com/arc/outboundfeeds/rss/',
    'https://cointelegraph.com/rss',
    'https://rss.app/feeds/tI7GiY7M29sh0nlt.xml',  # coingape
    'https://rss.app/feeds/w2bLoXDSF0AoaQME.xml',  # utoday
    'https://rss.app/feeds/pWnwnLdxVlX273wi.xml',  # coinmarketcap
    'https://rss.app/feeds/GlP7JzjkGqZbR76J.xml',  # dailycoin
    'https://rss.app/feeds/yxAe1V9hz3aUaDhj.xml',  # newsbtc
    'https://rss.app/feeds/swkPy5nTNNcNnzMI.xml',
]

# keys of the dicts returned by scrape_news_from_feed, in display order
article_columns = ['title', 'author', 'publish_date', 'link', 'content']

# scrape every feed and pool the matching articles into one list
articles = []
for feed_url in feed_urls:
    articles.extend(scrape_news_from_feed(feed_url))

# Always create news_df, even when nothing was scraped, so that the cells
# below never fail with a NameError; an empty result still has the columns.
news_df = pd.DataFrame(articles, columns=article_columns)
if news_df.empty:
    print("No articles found.")
else:
    print(news_df.head())

    
'''
The method for searching for the target term has been changed: instead of looking in the title, the term is now looked up in the article text. The target term may be changed.
However, the code leaves room for the method to change: the target term could instead be searched for after the scrape function has run, by filtering the resulting pandas DataFrame.

'''

In [None]:
# preview the first ten scraped articles
news_df.head(10)

In [None]:
# show the article text column (long values are truncated by the default display width)
news_df['content']

In [None]:
# Formatting in Codespaces does not display the text in full (try a Jupyter notebook).
# The column width limit is lifted only inside this block, so the display
# option does not stay changed for the rest of the notebook.
with pd.option_context('display.max_colwidth', None):
    print(news_df['content'])

In [9]:
# restore the default column width so long text is truncated again
pd.reset_option('display.max_colwidth')

### Preprocessing

In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK data used in this notebook (sentence/word tokenizers, POS
# tagger, WordNet, stop words, VADER lexicon). Without it a fresh environment
# fails with LookupError. Both the classic and the newer resource names are
# requested because the required name depends on the installed NLTK version;
# packages that are already up to date are not downloaded again.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'stopwords',
    'vader_lexicon',
):
    nltk.download(_nltk_resource, quiet=True)

# create lemmatizer and stemmer objects
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# POS tags to keep: nouns (NN*), verbs (VB*) and adjectives (JJ*)
keep_pos = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS']

# define a function to filter out words with unwanted POS tags and exclude punctuation marks
def filter_words(text):
    """Reduce ``text`` to its lower-cased nouns, verbs and adjectives.

    The text is split into sentences and words and POS-tagged in its original
    casing, because lower-casing first removes the capitalisation that the
    sentence splitter and the tagger use as a cue. A word is kept when its tag
    is listed in ``keep_pos``, it is purely alphabetic (this drops punctuation
    and numbers) and it is not the word "is". Kept words are lower-cased.

    Parameters
    ----------
    text : str
        Raw article text. Anything that is not a non-blank string (``None``,
        NaN, ``""``) yields an empty result instead of raising.

    Returns
    -------
    str
        The kept words joined by single spaces, or ``''`` when nothing is kept.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    kept_sentences = []
    for sent in sent_tokenize(text):
        tagged = pos_tag(word_tokenize(sent))
        kept = [
            word.lower()
            for word, pos in tagged
            if pos in keep_pos and word.isalpha() and word.lower() != 'is'
        ]
        # skip sentences with nothing kept; joining their empty strings would
        # leave doubled spaces that distort later whitespace-based word counts
        if kept:
            kept_sentences.append(' '.join(kept))
    return ' '.join(kept_sentences)

# run every article body through filter_words and store the result next to the raw text
news_df['clean_content'] = news_df['content'].map(filter_words)

# quick look at the frame with the new column
news_df.head()

### The most frequent word

In [None]:
import nltk
from nltk import FreqDist


def _most_common_word(text):
    """Return ``(word, count)`` for the most frequent token of ``text``.

    Returns ``(None, 0)`` when ``text`` contains no tokens, so an article
    whose cleaned content is empty does not raise an IndexError.
    """
    top = FreqDist(nltk.tokenize.word_tokenize(text)).most_common(1)
    return top[0] if top else (None, 0)


# most frequent word and its count for each row, as (word, count) pairs
top_words = news_df['clean_content'].apply(_most_common_word)

# split the pairs into two columns; naming the columns here keeps the
# assignment valid even when news_df has no rows
news_df[['most_common_word', 'frequency']] = pd.DataFrame(
    top_words.tolist(),
    index=news_df.index,
    columns=['most_common_word', 'frequency'],
)

# print the resulting DataFrame
print(news_df.head())


In [None]:
# show all rows ordered by the count of their most frequent word, highest first
# (returns a sorted copy; news_df itself is not reordered)
news_df.sort_values(by='frequency', ascending=False)

In [None]:
# count fully duplicated rows (this compares row values, not index labels)
# 'author' holds lists, which are unhashable and make duplicated() raise
# TypeError, so the comparison is done on the string form of that column
news_df.astype({'author': str}).duplicated().sum()

In [None]:
# remove duplicate rows, keeping the first occurrence of each
# 'author' holds lists, which are unhashable and make drop_duplicates() raise
# TypeError, so duplicates are detected on the string form of that column
duplicate_mask = news_df.astype({'author': str}).duplicated()
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning
news_df = news_df.loc[~duplicate_mask].copy()
# verify: should now be 0
news_df.astype({'author': str}).duplicated().sum()

In [None]:
# the ten rows whose most frequent word has the highest count
news_df_the_most = news_df.sort_values(by='frequency', ascending=False).head(10)
news_df_the_most

In [None]:
# bar chart of the selected rows: one bar per row, labelled with its count;
# the same word may occur in more than one row
import plotly.express as px
fig = px.bar(
    news_df_the_most,
    x="most_common_word",
    y="frequency",
    color="most_common_word",
    text_auto=True,
)
fig.show()

### The most frequent Bigram and Trigram terms

In [None]:
# Build bigram and trigram counts over the cleaned article text
import numpy as np
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2,3))
# sparse document-term matrix: one row per article, one column per n-gram
ngrams = c_vec.fit_transform(news_df['clean_content'])
# total count of each n-gram across all articles; summing the sparse matrix
# directly avoids materialising the full dense matrix in memory
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# mapping n-gram -> column index in the matrix
vocab = c_vec.vocabulary_
# one row per n-gram, ordered by count (ties by n-gram text), highest first
df_ngram = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'bigram/trigram'],
)

In [None]:
# the twenty most frequent bigrams/trigrams
news_df_the_bigram_trigram = df_ngram.sort_values(by='frequency', ascending=False).head(20)
news_df_the_bigram_trigram

In [None]:
# bar chart of the selected n-grams, one bar per n-gram, labelled with its count
import plotly.express as px
fig = px.bar(
    news_df_the_bigram_trigram,
    x="bigram/trigram",
    y="frequency",
    color="bigram/trigram",
    text_auto=True,
)
fig.show()


### Vader Sentiment Analysis

In [None]:
# import the VADER sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# create the analyzer object
sentiments = SentimentIntensityAnalyzer()

# Score every article once and reuse the result for all four columns.
# NOTE(review): scoring runs on 'clean_content', which keeps only nouns, verbs
# and adjectives in lower case; confirm that this is the intended input
# rather than the raw 'content' text.
vader_scores = pd.DataFrame(
    [sentiments.polarity_scores(text) for text in news_df["clean_content"]],
    index=news_df.index,
    columns=["pos", "neg", "neu", "compound"],
)
news_df["Positive"] = vader_scores["pos"]
news_df["Negative"] = vader_scores["neg"]
news_df["Neutral"] = vader_scores["neu"]
news_df['Compound'] = vader_scores["compound"]

# number of words per article; splitting on any whitespace means an empty
# text counts as 0 words and repeated spaces do not add phantom words
news_df['Word_Count'] = news_df['clean_content'].str.split().str.len()
print(news_df.head())


In [None]:
# label each article from its compound score using the thresholds given in
# https://github.com/cjhutto/vaderSentiment :
# >= 0.05 is Positive, <= -0.05 is Negative, anything in between is Neutral

score = news_df["Compound"].values
sentiment = [
    'Positive' if value >= 0.05 else 'Negative' if value <= -0.05 else 'Neutral'
    for value in score
]
news_df["Sentiment"] = sentiment

news_df.head()

In [None]:
# number of articles per sentiment label, as a two-column frame
import plotly.express as px
sentiment_counts = (
    news_df['Sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)

# bar chart of the label counts using Plotly Express
fig = px.bar(
    sentiment_counts,
    x='Sentiment',
    y='Count',
    color='Sentiment',
    text='Count',
)
fig.show()