In [None]:
import stanfordnlp

# Fetch the English model files used by the pipeline below
stanfordnlp.download('en')

# Build a pipeline that tokenizes the text and then tags named entities.
# NOTE(review): the 'ner' processor and the per-sentence `ents` attribute are
# documented for stanza (the successor to stanfordnlp) — confirm that the
# installed stanfordnlp release actually provides them.
nlp = stanfordnlp.Pipeline(processors='tokenize,ner')

# Example sentence standing in for a full news article
news_article = "According to Apple's CEO, Tim Cook, the company plans to release new products this year."

# Annotate the article
doc = nlp(news_article)

# Gather the surface text of every entity, sentence by sentence
references = [entity.text for sent in doc.sentences for entity in sent.ents]

print("References:", references)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sample data: list of news articles and their corresponding labels (1 for objective, 0 for not)
news_articles = [
    "According to Apple's CEO, Tim Cook, the company plans to release new products this year.",
    "Scientists have discovered a new species of bird in the Amazon rainforest.",
    "Get rich quick with our amazing investment opportunity!"
]
labels = [1, 1, 0]

# Split the RAW texts first. Fitting the vectorizer before the split would let
# the vocabulary of the held-out articles leak into the training features.
# NOTE: with only three samples, test_size=0.2 holds out a single article, so
# the reported accuracy can only be 0.0 or 1.0 — this cell is a smoke test of
# the workflow, not a meaningful evaluation.
articles_train, articles_test, y_train, y_test = train_test_split(
    news_articles, labels, test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary from the training articles only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(articles_train)

# Re-use the fitted vocabulary for the test articles (unseen words are ignored)
X_test = vectorizer.transform(articles_test)

# Train a Logistic Regression classifier on the training data
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict the held-out articles and score the predictions
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Headlines to be turned into TF-IDF vectors
news_titles = [
    "Stock Market Soars to Record Highs",
    "Investors Wary of Economic Uncertainty",
    "Tech Company Reports Strong Q2 Earnings",
    "Breaking: Fake News Affects Market Sentiment"
]

# Learn the vocabulary / IDF weights and vectorize the titles in one step
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(news_titles)

# Dense copy of the sparse result, one row per title
dense_matrix = tfidf_matrix.toarray()

# Vocabulary terms, in the same order as the matrix columns
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print every title next to its vector, followed by the vocabulary
separator = "=" * 40
for title, tfidf_vector in zip(news_titles, dense_matrix):
    print("News Title:", title)
    print("TF-IDF Vector:", tfidf_vector)
    print("Feature Names:", feature_names)
    print(separator)


In [None]:
import numpy as np

# Toy similarity dataset: (first_title, second_title, similarity_label)
dataset = [
    ("Stock Market Soars to Record Highs", "Dow Jones Hits All-Time High", 1),
    ("Investors Wary of Economic Uncertainty", "Tech Stocks Surge Despite Uncertainty", 0),
    ("Tech Company Reports Strong Q2 Earnings", "Apple's Earnings Report Impresses Investors", 1),
    ("Breaking: Fake News Affects Market Sentiment", "Markets React to False Reports", 0),
]

# Flatten the pairs into one interleaved list: even positions hold the first
# title of each pair, odd positions hold the second title.
train_data = [title for first, second, _ in dataset for title in (first, second)]

# One similarity label per pair, as a numpy array
labels = np.array([label for *_, label in dataset])


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize and pad the news titles.
# NOTE(review): assumes `train_data` is the flat, interleaved list of titles
# (first title, second title, first title, ...) and `labels` holds one entry
# per pair — confirm the cell that builds them ran immediately before this one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
sequences = tokenizer.texts_to_sequences(train_data)
# padding='post' appends the padding values after each sequence, so every row
# of `padded_sequences` has the same length.
padded_sequences = pad_sequences(sequences, padding='post')

# Siamese Network architecture: the encoder applied to each title.
# Its input length is the padded sequence length computed above.
input_layer = Input(shape=(padded_sequences.shape[1],))
# Vocabulary size is len(word_index) + 1 because Keras Tokenizer indices start
# at 1, leaving index 0 for the padding value.
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50)(input_layer)
flattened_layer = Flatten()(embedding_layer)
dense_layer = Dense(128, activation='relu')(flattened_layer)

# Siamese model: wraps the encoder so the same instance can be called twice
siamese_model = Model(inputs=input_layer, outputs=dense_layer)

# Create the left and right input branches (one per title of a pair)
input_left = Input(shape=(padded_sequences.shape[1],))
input_right = Input(shape=(padded_sequences.shape[1],))

# Connect both inputs to the siamese model; calling the same `siamese_model`
# object on both inputs means both branches use the same layers.
output_left = siamese_model(input_left)
output_right = siamese_model(input_right)

# Element-wise absolute difference between the two encodings. The result is a
# vector (it is not summed into a single L1 distance); the Dense layer below
# combines its components.
distance = Lambda(lambda x: tf.abs(x[0] - x[1]))([output_left, output_right])

# Final prediction layer: a single sigmoid unit
prediction = Dense(1, activation='sigmoid')(distance)

# Create the siamese network model: two title inputs -> one prediction
siamese_network = Model(inputs=[input_left, input_right], outputs=prediction)

# Compile the model for binary classification
siamese_network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. Even rows ([::2]) feed the left input and odd rows ([1::2])
# feed the right input, which matches an interleaved `train_data` layout.
siamese_network.fit([padded_sequences[::2], padded_sequences[1::2]], labels, epochs=10, batch_size=2)


In [None]:
import numpy as np

# Mock dataset (news_title1, news_title2, similarity_label)
dataset = [
    ("Stock Market Soars to Record Highs", "Dow Jones Hits All-Time High", 1),
    ("Investors Wary of Economic Uncertainty", "Tech Stocks Surge Despite Uncertainty", 0),
    ("Tech Company Reports Strong Q2 Earnings", "Apple's Earnings Report Impresses Investors", 1),
    ("Breaking: Fake News Affects Market Sentiment", "Markets React to False Reports", 0),
]

# Fixed seed so the shuffle — and therefore the train/validation split — is
# identical on every "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Create training data: one (title1, title2) tuple per example plus its label
train_data = []
labels = []

for item in dataset:
    train_data.append((item[0], item[1]))  # Adding both news titles
    labels.append(item[2])

# Convert labels to numpy array so they can be re-ordered by index array
labels = np.array(labels)

# Shuffle the dataset. A locally seeded generator is used instead of the
# global, unseeded np.random state; the same permutation is applied to the
# pairs and to the labels so they stay aligned.
indices = np.random.default_rng(SPLIT_SEED).permutation(len(train_data))
train_data = [train_data[i] for i in indices]
labels = labels[indices]

# Split the dataset into train and validation sets (first 80% / remaining 20%)
split_ratio = 0.8
split_idx = int(len(train_data) * split_ratio)
train_pairs = train_data[:split_idx]
train_labels = labels[:split_idx]
val_pairs = train_data[split_idx:]
val_labels = labels[split_idx:]


In [None]:
!pip install gensim


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Headlines used as the training corpus for the toy Word2Vec model
news_titles = [
    "Stock Market Soars to Record Highs",
    "Dow Jones Hits All-Time High",
    "Investors Wary of Economic Uncertainty",
    "Tech Stocks Surge Despite Uncertainty"
]

# Fetch the NLTK resources needed for tokenization and stop-word filtering
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize ``text`` and return its lower-cased content words.

    Tokens that are not purely alphabetic are discarded, the remaining tokens
    are lower-cased, and any token found in the module-level ``stop_words``
    set is removed.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lower-cased tokens, in their original order.
    """
    # The alphabetic check runs on the raw token, before lower-casing
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return [token for token in lowered if token not in stop_words]

# Tokenize and preprocess news titles (one list of tokens per title)
preprocessed_titles = [preprocess_text(title) for title in news_titles]

# Train Word2Vec model (CBOW, since sg=0).
# seed + workers=1 make training repeatable within one interpreter session:
# multi-threaded training introduces ordering jitter that changes the vectors
# from run to run. (Reproducibility across interpreter launches additionally
# depends on PYTHONHASHSEED.)
model = Word2Vec(
    sentences=preprocessed_titles,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=42,
    workers=1,
)

# Convert news titles to word embeddings: indexing `model.wv` with a list of
# tokens yields one 100-dimensional vector per token of the title.
# NOTE(review): a title whose tokens were all filtered out would produce an
# empty token list here — confirm how that case should be handled.
title_embeddings = [model.wv[title] for title in preprocessed_titles]

# Display word embeddings
for i, title in enumerate(news_titles):
    print(f"News Title: {title}")
    print(f"Word Embedding: {title_embeddings[i]}\n")


In [None]:
import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

# Define the URL of the news website
url = "https://www.example-news-website.com"

# Seconds to wait for a server before giving up; without a timeout a
# stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 10

# Send a GET request to the URL and fail loudly on an HTTP error status
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all the news article links on the page
article_links = soup.find_all("a", class_="article-link")

# Loop through the article links and extract relevant information
for link in article_links:
    # Anchors without an href cannot be followed
    href = link.get("href")
    if not href:
        continue

    # Resolve relative links (e.g. "/news/123") against the listing page URL
    article_url = urljoin(url, href)
    article_title = link.text

    # Visit the article URL and scrape the content; one unreachable article
    # should not abort the whole crawl.
    try:
        article_response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
        article_response.raise_for_status()
    except requests.RequestException as exc:
        print("Skipping", article_url, "-", exc)
        continue
    article_soup = BeautifulSoup(article_response.content, "html.parser")

    # Extract the article content; `find` returns None when the page has no
    # matching <div>, so check before calling get_text().
    content_node = article_soup.find("div", class_="article-content")
    if content_node is None:
        print("Skipping", article_url, "- no article content found")
        continue
    article_content = content_node.get_text()

    # Perform additional processing on the article content
    # Extract mentions of companies, numbers, etc.

    print("Title:", article_title)
    print("URL:", article_url)
    print("Content:", article_content)
    print("\n")


In [None]:
import re

text = "The stock market gained 100 points yesterday, while Company XYZ's shares increased by $5.50."

# A number with optional thousands separators and an optional decimal part
number_pattern = r'\d+(?:,\d{3})*(?:\.\d+)?'

# Match numbers with stock market indicators. Two forms are recognised:
#   * a dollar sign BEFORE the number  ("$5.50")
#   * a unit word AFTER the number     ("100 points", "20 shares")
# The previous pattern only allowed "$" after the number (and required a word
# character right after it), so dollar amounts such as "$5.50" were missed.
pattern = rf'\$\s*({number_pattern})|\b({number_pattern})\s*(?:points|shares)\b'

# Find all matches in the text; exactly one of the two groups is set per match
matches = [m.group(1) or m.group(2) for m in re.finditer(pattern, text, re.IGNORECASE)]

# Print the extracted numbers
for match in matches:
    print(match)


In [None]:
import re

def extract_numbers_with_indicators(text):
    """Return the numbers in ``text`` that carry a stock-market indicator.

    A number qualifies when it is either preceded by a dollar sign
    (``$5.50``) or followed by the word ``points`` or ``shares``
    (``100 points``). Unit words are matched case-insensitively. Numbers may
    contain thousands separators (``1,250``) and a decimal part.

    The earlier pattern only accepted ``$`` *after* the number and required a
    word character right behind it, so ordinary dollar amounts were never
    matched.

    Args:
        text: The string to search.

    Returns:
        A list of the matched numbers as strings, in order of appearance and
        without the indicator itself, e.g. ``['100', '5.50']``. The list is
        empty when nothing matches.
    """
    number = r'\d+(?:,\d{3})*(?:\.\d+)?'
    pattern = (
        rf'\$\s*({number})'                      # currency prefix: $5.50
        rf'|\b({number})\s*(?:points|shares)\b'  # unit suffix: 100 points
    )

    # Exactly one of the two capture groups is populated for each match
    return [
        m.group(1) or m.group(2)
        for m in re.finditer(pattern, text, re.IGNORECASE)
    ]

# Example sentence containing a points move and a dollar amount
text = "The stock market gained 100 points yesterday, while Company XYZ's shares increased by $5.50."
numbers_with_indicators = extract_numbers_with_indicators(text)

# Emit each extracted number on its own line (nothing is printed when empty)
if numbers_with_indicators:
    print("\n".join(numbers_with_indicators))


In [None]:
import re

def normalize_author_name(author_name):
    """Return a whitespace-free, lower-cased key for an author name.

    All whitespace is removed — not only the ASCII space but also tabs,
    newlines and non-breaking spaces, which are common in scraped bylines —
    so that visually identical names normalize to the same key.

    Args:
        author_name: The author name as a string.

    Returns:
        The name with every whitespace character removed, in lower case
        (``"John Doe"`` -> ``"johndoe"``).
    """
    # str.split() with no argument splits on any run of Unicode whitespace
    return "".join(author_name.split()).lower()

def normalize_website_url(website_url):
    """Return a lower-cased URL without its leading scheme and ``www.``.

    Only a prefix is removed: the pattern is anchored at the start of the
    string. The earlier, unanchored pattern also deleted every later
    occurrence of ``www.`` (for example ``http://awww.com`` became ``acom``).

    Args:
        website_url: The URL as a string; surrounding whitespace is ignored.

    Returns:
        The URL with a leading ``http://`` / ``https://`` and a leading
        ``www.`` removed (both optional, matched case-insensitively), in
        lower case.
    """
    stripped = re.sub(
        r'^(?:https?://)?(?:www\.)?',
        '',
        website_url.strip(),
        count=1,
        flags=re.IGNORECASE,
    )
    return stripped.lower()

# Example inputs for the two normalizers
author_name = "John Doe"
website_url = "https://www.Example.com"

# Normalize both values
normalized_author, normalized_url = (
    normalize_author_name(author_name),
    normalize_website_url(website_url),
)

# Report each result next to its caption
for caption, value in (
    ("Normalized Author:", normalized_author),
    ("Normalized URL:", normalized_url),
):
    print(caption, value)
