In [None]:
!pip install matplotlib scikit-learn transformers pandas newscatcherapi

In [None]:
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from newscatcherapi import NewsCatcherApiClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration


In [None]:
# Initialize the NewsCatcher API client.
# The key is read from the environment so the secret never lives in the notebook
# source or its saved outputs. Set it before launching Jupyter, e.g.
#   export NEWSCATCHER_API_KEY="..."
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and rotated.
NEWSCATCHER_API_KEY = os.environ.get("NEWSCATCHER_API_KEY")
if not NEWSCATCHER_API_KEY:
    raise RuntimeError(
        "NEWSCATCHER_API_KEY is not set. Export your NewsCatcher API key as an "
        "environment variable before running this notebook."
    )
newscatcherapi = NewsCatcherApiClient(x_api_key=NEWSCATCHER_API_KEY)

# Search term used by the article query.
query = "bitcoin"


In [None]:
# Ask NewsCatcher for the first page (up to 100 results, ordered by rank) of
# English-language US finance articles from the last day whose title matches
# `query`.
search_params = {
    "q": query,
    "lang": "en",
    "search_in": "title",
    "from_": "1 days ago",
    "countries": "US",
    "page_size": 100,
    "topic": "finance",
    "sort_by": "rank",
    "page": 1,
}
all_articles = newscatcherapi.get_search(**search_params)


In [None]:
# Extract unique, non-empty article summaries.
# - `.get` is used throughout, so a response without 'articles' (or an article
#   without 'summary') no longer aborts the whole extraction.
# - `dict.fromkeys` de-duplicates while keeping the order returned by the API,
#   so every later cell sees the same ordering on every run (iteration order of
#   a `set` of strings is not stable across kernel restarts).
# - Missing or blank summaries are dropped here because later cells treat every
#   entry as text.
raw_articles = (all_articles or {}).get('articles') or []
if not raw_articles:
    print("Error: no articles found in the API response.")

unique_articles = list(dict.fromkeys(
    summary
    for summary in (article.get('summary') for article in raw_articles)
    if isinstance(summary, str) and summary.strip()
))


In [None]:
# Build the sentiment classifier: FinBERT weights and tokenizer wrapped in a
# Hugging Face pipeline. `truncation=True` is passed so over-long inputs are
# cut to the model's limit instead of failing.
model_name = "ProsusAI/finbert"
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
nlp = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
)


In [None]:
# Analyze sentiment of each article
sentiment_scores = []      # score reported by the pipeline for each prediction
sentiment_labels = []      # predicted label, parallel to sentiment_scores
article_predictions = []   # [summary, predicted label] rows, written to the TSV later

for article in unique_articles:
    try:
        pred = nlp(article)[0]
        # Read both fields before touching any list: if the prediction is
        # malformed nothing is appended, so the parallel lists stay aligned.
        score, label = pred['score'], pred['label']
    except Exception as e:
        # Best effort: report the failure and move on to the next article.
        print(f"Error processing article: {e}")
        continue

    sentiment_scores.append(score)
    sentiment_labels.append(label)
    article_predictions.append([article, label])
    print(f"Article: {article}\nPrediction: {pred}\n")

# Tally once from the recorded labels instead of keeping three counters in sync
# inside the loop. As before, any label other than 'positive'/'negative' is
# counted as neutral.
label_counts = Counter(sentiment_labels)
num_positive = label_counts['positive']
num_negative = label_counts['negative']
num_neutral = len(sentiment_labels) - num_positive - num_negative


In [None]:
# Calculate Stability Index and other statistics
# Each prediction contributes its score with the sign of its label:
# positive -> +score, negative -> -score, anything else (neutral) -> 0.
# Subtracting every non-positive score would make neutral coverage look
# negative and drag the index down.
LABEL_SIGN = {'positive': 1, 'negative': -1}

if sentiment_scores:
    signed_scores = [
        LABEL_SIGN.get(label, 0) * score
        for score, label in zip(sentiment_scores, sentiment_labels)
    ]
    stability_index = sum(signed_scores) / len(signed_scores)
    # NOTE: these two summarise the raw scores regardless of label, i.e. how
    # confident the classifier was, not which direction the sentiment leaned.
    mean_score = np.mean(sentiment_scores)
    std_deviation = np.std(sentiment_scores)
else:
    # No predictions were recorded; report zeros rather than dividing by zero.
    stability_index = mean_score = std_deviation = 0

# Print summary statistics
print(f"Stability Index: {stability_index}")
print(f"Mean Sentiment Score: {mean_score}")
print(f"Sentiment Standard Deviation: {std_deviation}")
print(f"Positive Articles: {num_positive}")
print(f"Negative Articles: {num_negative}")
print(f"Neutral Articles: {num_neutral}")


In [None]:
# Visualize the distribution of sentiment labels
def plot_sentiment_distribution(labels):
    """Draw, save and show a bar chart of how often each sentiment label occurs.

    Args:
        labels: Iterable of sentiment labels (e.g. 'positive', 'negative',
            'neutral').

    Returns:
        None. Writes 'sentiment_distribution.png' to the working directory and
        displays the figure. When `labels` is empty a notice is printed and
        nothing is plotted.
    """
    counts = Counter(labels)
    if not counts:
        # Nothing to draw; unpacking zip(*counts.items()) would fail on an
        # empty counter.
        print("No sentiment labels to plot.")
        return

    # Tie each colour to a label rather than to the order in which labels were
    # first seen, so 'negative' is never drawn green just because it came first.
    palette = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
    ordered = [name for name in palette if name in counts]
    ordered += [name for name in counts if name not in palette]

    plt.bar(
        ordered,
        [counts[name] for name in ordered],
        color=[palette.get(name, 'gray') for name in ordered],
    )
    plt.xlabel("Sentiment")
    plt.ylabel("Count")
    plt.title("Sentiment Distribution")
    plt.savefig('sentiment_distribution.png')  # Save the chart
    plt.show()

# Chart the labels collected by the sentiment-analysis loop.
plot_sentiment_distribution(sentiment_labels)


In [None]:
# Visualize the distribution of sentiment scores
def plot_sentiment_scores(scores):
    """Plot a 10-bin histogram of `scores`, save it and display it.

    Args:
        scores: Sequence of numeric scores to bin.

    Returns:
        None. The figure is written to 'sentiment_scores.png' and shown.
    """
    histogram_style = {"bins": 10, "color": "purple", "alpha": 0.7}
    plt.hist(scores, **histogram_style)

    plt.title("Distribution of Sentiment Scores")
    plt.xlabel("Sentiment Score")
    plt.ylabel("Frequency")

    # Save before show(): the figure may be cleared once it has been displayed.
    plt.savefig("sentiment_scores.png")
    plt.show()

# Chart the scores collected by the sentiment-analysis loop.
plot_sentiment_scores(sentiment_scores)


In [None]:
# Pie chart for sentiment proportions
labels = ['Positive', 'Negative', 'Neutral']
sizes = [num_positive, num_negative, num_neutral]
colors = ['green', 'red', 'blue']

if sum(sizes) > 0:
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.title("Sentiment Proportions")
    plt.savefig('sentiment_proportions.png')  # Save the chart
    plt.show()
else:
    # With no classified articles every slice is 0, so the proportions are
    # undefined (0/0); skip the chart instead of drawing NaN wedges.
    print("No sentiment predictions available; skipping the pie chart.")


In [None]:
# Topic extraction from articles using TF-IDF
# Only real text can be vectorized. A missing summary (None) is not a valid
# document and is not necessarily reported as the ValueError handled below.
documents = [text for text in unique_articles if isinstance(text, str) and text.strip()]

try:
    # max_features keeps the 10 most frequent terms across the corpus. Without
    # a stop-word list those are function words ("the", "to", "of", ...), not
    # topics.
    tfidf = TfidfVectorizer(max_features=10, stop_words='english')
    features = tfidf.fit_transform(documents)
    print("Top Topics:", tfidf.get_feature_names_out())
except ValueError as e:
    # Raised, for example, when no usable terms remain (empty vocabulary).
    print("Error in topic extraction:", e)


In [None]:
# Summarize the articles using T5
summarizer_model_name = "t5-small"
summarizer_tokenizer = T5Tokenizer.from_pretrained(summarizer_model_name)
summarizer_model = T5ForConditionalGeneration.from_pretrained(summarizer_model_name)

for article in unique_articles:
    # Skip missing or blank entries: concatenating the prefix onto None raises
    # TypeError and would stop the loop, whereas the sentiment loop merely
    # reports bad articles and continues.
    if not isinstance(article, str) or not article.strip():
        continue

    # T5 is told which task to perform through a text prefix.
    prompt = "summarize: " + article
    # Inputs longer than 512 tokens are truncated.
    inputs = summarizer_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    # Beam search (5 beams) producing between 15 and 50 tokens.
    outputs = summarizer_model.generate(
        inputs,
        max_length=50,
        min_length=15,
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True
    )
    summary = summarizer_tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Summary:", summary)


In [None]:
# Write the [article, sentiment] rows to a tab-separated file, without the
# DataFrame index column.
prediction_columns = ["Article", "Sentiment"]
df = pd.DataFrame(data=article_predictions, columns=prediction_columns)
df.to_csv('article_predictions.tsv', index=False, sep='\t')


In [None]:
# Hand-labelled evaluation sentences. Each text is kept next to its expected
# label so the two cannot drift apart; the parallel lists used by the rest of
# the notebook are derived from these pairs.
labeled_examples = [
    ("The stock market is hitting new highs, and investors are optimistic about the future.", "positive"),
    ("The new healthcare bill is a disaster and will lead to severe consequences for millions.", "negative"),
    ("This movie is a masterpiece, with brilliant performances from the entire cast.", "positive"),
    ("The economy is in turmoil, and it seems like there is no hope for recovery.", "negative"),
    ("The tech industry is booming, with new innovations emerging every day.", "positive"),
    ("It’s a sad day for the company, as the CEO announced massive layoffs.", "negative"),
    ("The team's victory was incredible, showing their true strength and resilience.", "positive"),
    ("The political situation is becoming increasingly unstable, with widespread protests across the country.", "negative"),
    ("The weather today is neither hot nor cold, it’s just perfectly neutral.", "neutral"),
    ("I can't believe how bad the customer service was today, completely frustrating!", "negative"),
    ("The recent advancements in AI are nothing short of revolutionary, changing the landscape of technology.", "positive"),
    ("Despite all the challenges, the organization managed to deliver exceptional results this quarter.", "positive"),
    ("The new social media app has some unique features, but it's a bit difficult to use at first.", "neutral"),
    ("There has been a significant improvement in the company’s financial performance this year.", "positive"),
    ("The new restaurant in town has great food but terrible service.", "negative"),
    ("I feel indifferent about the movie; it was neither good nor bad.", "neutral"),
    ("The president's speech was inspiring and lifted the spirits of the nation.", "positive"),
    ("There's been a major setback in the project, but the team is working hard to overcome it.", "negative"),
    ("Although the market is struggling, there are still opportunities for savvy investors.", "positive"),
    ("The new smartphone model has great features, but it's too expensive for most people.", "negative"),
]

test_data = [text for text, _ in labeled_examples]
test_labels = [label for _, label in labeled_examples]

# Run the sentiment pipeline on every evaluation sentence and keep only the
# label of its first (top) prediction.
predicted_labels = []
for text in test_data:
    top_prediction = nlp(text)[0]
    predicted_labels.append(top_prediction['label'])


In [None]:
# Score the pipeline's predictions against the hand-assigned labels.
# Precision, recall and F1 use the 'weighted' average across classes.
accuracy = accuracy_score(test_labels, predicted_labels)
precision, recall, f1 = (
    scorer(test_labels, predicted_labels, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")


In [None]:
# One fixed class order shared by the matrix, its axis labels and the report.
class_order = ["positive", "negative", "neutral"]

# Confusion matrix: computed, drawn with a blue colour map, saved and shown.
conf_matrix = confusion_matrix(test_labels, predicted_labels, labels=class_order)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_order)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.savefig('confusion_matrix.png')
plt.show()

# Text report restricted to the same three classes.
report = classification_report(test_labels, predicted_labels, labels=class_order)
print("Classification Report:\n", report)


In [None]:
# Plot the distribution of the classes in test data
def plot_class_distribution(labels):
    """Draw, save and show a bar chart of how many samples each class has.

    Args:
        labels: Iterable of class labels (e.g. 'positive', 'negative',
            'neutral').

    Returns:
        None. Writes 'class_distribution.png' to the working directory and
        displays the figure. When `labels` is empty a notice is printed and
        nothing is plotted.
    """
    counts = Counter(labels)
    if not counts:
        # Nothing to draw; unpacking zip(*counts.items()) would fail on an
        # empty counter.
        print("No class labels to plot.")
        return

    # Colour by class name, not by first-seen position, so a class keeps the
    # same colour whatever order the labels arrive in.
    palette = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
    ordered = [name for name in palette if name in counts]
    ordered += [name for name in counts if name not in palette]

    plt.bar(
        ordered,
        [counts[name] for name in ordered],
        color=[palette.get(name, 'gray') for name in ordered],
    )
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.title("Class Distribution")
    plt.savefig('class_distribution.png')  # Save the chart
    plt.show()

# Chart how the hand-assigned evaluation labels are spread across classes.
plot_class_distribution(test_labels)


In [None]:
# Cross-validation with RandomForestClassifier
# Note: this trains a fresh TF-IDF + random-forest baseline on the hand-labelled
# sentences; it does not evaluate the FinBERT pipeline.
X_train, X_test, y_train, y_test = train_test_split(test_data, test_labels, test_size=0.2, random_state=42)

# A random forest needs numeric features, so the raw sentences are first turned
# into TF-IDF vectors. Keeping the vectorizer inside the pipeline means it is
# re-fitted on the training part of every fold, so no vocabulary leaks from the
# held-out part. The forest is seeded so the scores are reproducible.
text_classifier = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=42),
)

# NOTE(review): with this few samples some classes have fewer members than
# there are folds; scikit-learn may warn about that. Consider a smaller cv or
# more labelled data.
cross_val_scores = cross_val_score(text_classifier, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cross_val_scores}")
print(f"Mean Cross-validation score: {np.mean(cross_val_scores)}")
