### Using NLP

In [6]:
from textblob import TextBlob

def get_sentiment_score(text):
    """
    Returns the sentiment score for a given text.
    """
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    return sentiment_score

def get_polarity_score(text):
    """
    Returns the polarity score for a given text.
    """
    blob = TextBlob(text)
    polarity_score = blob.sentiment.subjectivity
    return polarity_score

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv('crypto_news_dataset.csv')

# Data cleaning
corpus = []
ps = PorterStemmer()
for i in range(len(df)):
    text = re.sub('[^a-zA-Z]', ' ', str(df['text'][i]))
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

# Feature extraction
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()
y_text = df['text']
y_source = df['source']
y_url = df['url']
sentiment_score = get_sentiment_score(y_text)
polarity_score = get_polarity_score(y_text)
X = np.concatenate((X, sentiment_score.reshape(-1,1), polarity_score.reshape(-1,1)), axis=1)

# Model training
X_train, X_test, y_text_train, y_text_test, y_source_train, y_source_test, y_url_train, y_url_test = train_test_split(X, y_text, y_source, y_url, test_size=0.2, random_state=42)
clf = LogisticRegression(random_state=42)
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_text_train)
clf = grid_search.best_estimator_
clf.fit(X_train, y_text_train)

# Model prediction
def predict_relevance(text):
    text = re.sub('[^a-zA-Z]', ' ', str(text))
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    X_pred = tfidf.transform([text]).toarray()
    sentiment_score = get_sentiment_score(text)
    polarity_score = get_polarity_score(text)
    X_pred = np.concatenate((X_pred, sentiment_score.reshape(-1,1), polarity_score.reshape(-1,1)), axis=1)
    y_text_pred = clf.predict(X_pred)
    y_source_pred = y_source[y_text == y_text_pred][0]
    y_url_pred = y_url[y_text == y_text_pred][0]
    return y_text_pred, y_source_pred, y_url_pred

# Algorithm analysis
y_text_pred_train = clf.predict(X_train)
y_text_pred_test = clf.predict(X_test)
print('Train Accuracy:', accuracy_score(y_text_train, y_text_pred_train))
print('Test Accuracy:', accuracy_score(y_text_test, y_text_pred_test))
print('Train Precision:', precision_score(y_text_train, y_text_pred_train, average='weighted'))
print('Test Precision:', precision_score(y_text_test, y_text_pred_test, average='weighted'))
print('Train Recall:', recall_score(y_text_train, y_text_pred_train, average='weighted'))
print('Test Recall:', recall_score(y_text_test, y_text_pred_test, average='weighted'))
print('Train F1-score:', f1_score(y_text_train, y_text_pred_train, average='weighted'))
print('Test F1-score:', f1_score(y_text_test, y_text_pred_test, average='weighted'))

text = "Walmart and Litecoin Payment News Debunked by Walmart Spokesperson, LTC Prices Shudder from Fake News"
y_text_pred, y_source_pred, y_url_pred = predict_relevance(text)
print('Input Text:', text)
print('Relevant Text:', y_text_pred)
print('Source:', y_source_pred)
print('URL:', y_url_pred)

KeyboardInterrupt: 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import re
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer

def preprocess_text(text):
    """
    Preprocesses text by removing non-alphabetic characters, converting to lowercase, 
    tokenizing, and stemming.
    """
    ps = PorterStemmer()
    text = re.sub('[^a-zA-Z]', ' ', str(text))
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text]
    text = ' '.join(text)
    return text

def get_similar_articles(query, df, n=10):
    """
    Returns a list of n tuples, where each tuple contains the [text, source, url] and its relevancy score.
    """
    # preprocess query
    query = preprocess_text(query)
    
    # preprocess text in df
    df['text'] = df['text'].apply(preprocess_text)
    
    # compute TF-IDF matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['text'])
    
    # compute cosine similarity between query and all documents
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # get indices of top n similar documents
    similar_indices = cosine_similarities.argsort()[:-n-1:-1]
    
    # create list of tuples containing [text, source, url] and relevancy score
    similar_articles = []
    for index in similar_indices:
        text = df.loc[index, 'text']
        source = df.loc[index, 'source']
        url = df.loc[index, 'url']
        score = cosine_similarities[index]
        similar_articles.append((text, source, url, score))
    
    # calculate sentiment and polarity scores
    sentiment_scores = [get_sentiment_score(text) for text, _, _, _ in similar_articles]
    polarity_scores = [get_polarity_score(text) for text, _, _, _ in similar_articles]
    
    # calculate mean and standard deviation of sentiment and polarity scores
    mean_sentiment_score = np.mean(sentiment_scores)
    std_sentiment_score = np.std(sentiment_scores)
    mean_polarity_score = np.mean(polarity_scores)
    std_polarity_score = np.std(polarity_scores)
    
    # return dictionary with analysis parameters
    analysis_params = {
        "num_similar_articles": n,
        "mean_sentiment_score": mean_sentiment_score,
        "std_sentiment_score": std_sentiment_score,
        "mean_polarity_score": mean_polarity_score,
        "std_polarity_score": std_polarity_score
    }
    
    return similar_articles, analysis_params


query = "Walmart and Litecoin Payment News Debunked by Walmart Spokesperson, LTC Prices Shudder from Fake News"
similar_articles, analysis_params = get_similar_articles(query, df)
# print list of similar articles
for article in similar_articles:
    text, source, url, score = article
    print(f"Text: {text}\nSource: {source}\nURL: {url}\nRelevancy Score: {score}\n")
    
print(f"Number of similar articles: {analysis_params['num_similar_articles']}")
print(f"Mean sentiment score: {analysis_params['mean_sentiment_score']}")
print(f"Standard deviation of sentiment score: {analysis_params['std_sentiment_score']}")
print(f"Mean polarity score: {analysis_params['mean_polarity_score']}")
# print analysis parameters
print(f"Standard deviation of polarity score: {analysis_params['polarity_std']}")
print(f"Mean of sentiment score: {analysis_params['sentiment_mean']}")
print(f"Standard deviation of sentiment score: {analysis_params['sentiment_std']}")


Text: week ago i had the pleasur of write an articl i d dream of write for month and month on end gyft add walmart gift card ala all good thing are not meant to last for the last week bitcoin around the countri have had the pleasur of buy walmart gift card with bitcoin receiv back in the form of gyft point and in essenc spend bitcoin at walmart on ga and groceri make bitcoin a cheaper option to buy ga ha been a long stand dream of the bitcoin commun thi wa the first step toward realiz that dream and said step ha now been revers despit the loss of walmart gyft is gear to provid more to smaller busi with the launch of gyft cloud gyft inform custom via email that due to reason outsid of gyft s control they are unabl to stock walmart gift card ani longer the bitcoin commun know full well that vinni at gyft would not unlist walmart gift card from gyft s impress registri unless he wa forc to there is no doubt in my mind that the initi decis that ha culmin in today s end of gyft s support of 

KeyError: 'polarity_std'