In [None]:
import nltk
from nltk.util import ngrams

# Separate analysis for informative and misinformative transcripts.
# Every transcript is tokenized on its own so that n-grams can be built per
# transcript. Building them from one concatenated token stream would create
# spurious n-grams that join the last words of one transcript to the first
# words of the next one.
informative_token_lists = [
  nltk.word_tokenize(transcript)
  for transcript in infotranscripts_df['clean_transcript']
]
misinformative_token_lists = [
  nltk.word_tokenize(transcript)
  for transcript in mistranscripts_df['clean_transcript']
]

# Flat token lists (all tokens of a group, in transcript order)
informative_tokens = [token for tokens in informative_token_lists for token in tokens]
misinformative_tokens = [token for tokens in misinformative_token_lists for token in tokens]


def _joined_ngrams(token_lists, n):
  """Return space-joined n-grams of size ``n`` built within each token list.

  Args:
    token_lists: iterable of token lists, one list per transcript.
    n: number of tokens per n-gram.

  Returns:
    A list of strings; no n-gram spans two different token lists.
  """
  return [
    ' '.join(gram)
    for tokens in token_lists
    for gram in ngrams(tokens, n)
  ]


# Generate n-grams for each group
informative_bigrams = _joined_ngrams(informative_token_lists, 2)
informative_trigrams = _joined_ngrams(informative_token_lists, 3)
misinformative_bigrams = _joined_ngrams(misinformative_token_lists, 2)
misinformative_trigrams = _joined_ngrams(misinformative_token_lists, 3)

# Combine n-grams within each group (bigrams first, then trigrams)
informative_text_ngrams = informative_bigrams + informative_trigrams
misinformative_text_ngrams = misinformative_bigrams + misinformative_trigrams

In [None]:
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt

# Pool the n-grams of both groups and tally how often each one occurs
all_ngrams = [*informative_text_ngrams, *misinformative_text_ngrams]
ngram_freq = Counter()
ngram_freq.update(all_ngrams)

# Build the word cloud, sizing each n-gram by its count
wordcloud = WordCloud(background_color='white', height=400, width=800)
wordcloud.generate_from_frequencies(ngram_freq)

# Render the word cloud image with the axes hidden
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from textblob import TextBlob 
def calculate_sentiment_score(text):
  """Return the TextBlob sentiment polarity of ``text``."""
  return TextBlob(text).sentiment.polarity

# One polarity score per informative n-gram, in the same order as the n-grams
informative_sentiment_scores = [
  calculate_sentiment_score(ngram) for ngram in informative_text_ngrams
]

informative_sentiment_scores


In [None]:
# One polarity score per misinformative n-gram, in the same order as the n-grams
misinformative_sentiment_scores = [
  calculate_sentiment_score(ngram) for ngram in misinformative_text_ngrams
]
misinformative_sentiment_scores

In [None]:
# Pair every misinformative n-gram with the sentiment score computed for it
mis_ngrams_scores = [
    (ngram, score)
    for ngram, score in zip(misinformative_text_ngrams, misinformative_sentiment_scores)
]
# Two-column frame: 'ngram' and 'sentiment_score'
mis_ngram_df = pd.DataFrame(data=mis_ngrams_scores, columns=['ngram', 'sentiment_score'])
mis_ngram_df

In [None]:
# Pair every informative n-gram with the sentiment score computed for it
info_ngrams_scores = [
    (ngram, score)
    for ngram, score in zip(informative_text_ngrams, informative_sentiment_scores)
]
# Two-column frame: 'ngram' and 'sentiment_score'
info_ngram_df = pd.DataFrame(data=info_ngrams_scores, columns=['ngram', 'sentiment_score'])
info_ngram_df

In [None]:
import seaborn as sns

# Stack the two n-gram frames (info_ngram_df, mis_ngram_df), tagging every row
# with the group it came from so seaborn can split the violins by 'category'.
sentiment_by_category = pd.concat(
    [
        info_ngram_df.assign(category='Informative'),
        mis_ngram_df.assign(category='Misinformative'),
    ],
    sort=False,
)

# NOTE: `showmeans` was removed from this call. It is an option of matplotlib's
# own violinplot/boxplot, not of seaborn.violinplot; depending on the seaborn
# version it is either silently ignored or rejected when forwarded to the
# underlying artist. Means would have to be overlaid separately if needed.
sns.violinplot(x="category", y="sentiment_score", data=sentiment_by_category)
plt.xlabel("N-gram Category")
plt.ylabel("Sentiment Score")
plt.title("Distribution of Sentiment Scores by N-gram Category")
plt.show()

In [None]:
# Minimum sentiment score for an n-gram to count as "high sentiment" (adjust as needed)
high_sentiment_threshold = 0.7

# Informative n-grams whose score is at or above the threshold
informative_is_high = info_ngram_df['sentiment_score'].ge(high_sentiment_threshold)
informative_high_sentiment = info_ngram_df.loc[informative_is_high]

# Misinformative n-grams whose score is at or above the threshold
misinformative_is_high = mis_ngram_df['sentiment_score'].ge(high_sentiment_threshold)
misinformative_high_sentiment = mis_ngram_df.loc[misinformative_is_high]

In [None]:
# Rank the high-sentiment n-grams of each group from highest to lowest score
informative_high_sentiment_sorted = informative_high_sentiment.sort_values(by='sentiment_score', ascending=False)
misinformative_high_sentiment_sorted = misinformative_high_sentiment.sort_values(by='sentiment_score', ascending=False)

# A cell only renders its LAST bare expression, so a bare
# `informative_high_sentiment_sorted` line in the middle of the cell is never
# shown. Display both frames explicitly (display is built into IPython/Jupyter).
display(informative_high_sentiment_sorted, misinformative_high_sentiment_sorted)

In [None]:
#pip install wordcloud
from wordcloud import WordCloud

def generate_wordcloud(ngram_df, color):
    """Plot a word cloud of n-grams sized by their sentiment score.

    Args:
        ngram_df: DataFrame with an 'ngram' column (text) and a
            'sentiment_score' column (numeric weight for each n-gram).
        color: label of the group being plotted; it is only used in the plot
            title (despite its name it does not set any colour).

    Returns:
        None. The figure is shown with ``plt.show()``. If ``ngram_df`` has no
        rows, a message is printed and nothing is plotted.
    """
    if ngram_df.empty:
        # WordCloud refuses to build a cloud from zero words, so an empty
        # filter result (e.g. threshold too strict) would otherwise crash the cell.
        print(f"No n-grams to plot for {color}; skipping word cloud.")
        return

    # Map each n-gram to its sentiment score; for a repeated n-gram the last
    # row wins, exactly as with a row-by-row dictionary build.
    sentimented_words = dict(zip(ngram_df['ngram'], ngram_df['sentiment_score']))

    # Create a WordCloud object with sentiment score as word weight
    wordcloud = WordCloud(background_color='white').generate_from_frequencies(sentimented_words)

    # Display the WordCloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"High Sentiment N-grams - {color}")
    plt.show()

# Draw one word cloud per group from the sorted high-sentiment frames
for group_label, group_frame in (
    ('Informative', informative_high_sentiment_sorted),
    ('Misinformative', misinformative_high_sentiment_sorted),
):
    generate_wordcloud(group_frame.copy(), group_label)

In [None]:
# Maximum sentiment score for an n-gram to count as "low sentiment" (adjust as needed)
negative_sentiment_threshold = -0.3

# Informative n-grams whose score is at or below the threshold
informative_is_low = info_ngram_df['sentiment_score'].le(negative_sentiment_threshold)
informative_low_sentiment = info_ngram_df.loc[informative_is_low]

# Misinformative n-grams whose score is at or below the threshold
misinformative_is_low = mis_ngram_df['sentiment_score'].le(negative_sentiment_threshold)
misinformative_low_sentiment = mis_ngram_df.loc[misinformative_is_low]

In [None]:
# Order the low-sentiment n-grams of each group from the lowest score upward
informative_low_sentiment_sorted = informative_low_sentiment.sort_values('sentiment_score')
misinformative_low_sentiment_sorted = misinformative_low_sentiment.sort_values('sentiment_score')

In [None]:
def generate_low_wordcloud(ngram_df, color):
    """Plot a word cloud of low-sentiment n-grams sized by how negative they are.

    Args:
        ngram_df: DataFrame with an 'ngram' column (text) and a
            'sentiment_score' column (numeric; negative for the n-grams this
            function is called with in this notebook).
        color: label of the group being plotted; it is only used in the plot
            title (despite its name it does not set any colour).

    Returns:
        None. The figure is shown with ``plt.show()``. If ``ngram_df`` has no
        rows, a message is printed and nothing is plotted.
    """
    if ngram_df.empty:
        # WordCloud refuses to build a cloud from zero words, so an empty
        # filter result (e.g. threshold too strict) would otherwise crash the cell.
        print(f"No n-grams to plot for {color}; skipping word cloud.")
        return

    # Word-cloud weights are frequencies and must be positive: feeding the raw
    # negative scores makes the normalisation ill-defined. Use the magnitude of
    # each score so the most negative n-grams are drawn largest.
    sentimented_words = {
        ngram: abs(score)
        for ngram, score in zip(ngram_df['ngram'], ngram_df['sentiment_score'])
    }

    # Create a WordCloud object with sentiment magnitude as word weight
    wordcloud = WordCloud(background_color='white').generate_from_frequencies(sentimented_words)

    # Display the WordCloud
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Low Sentiment N-grams - {color}")
    plt.show()

# Draw one word cloud per group from the sorted low-sentiment frames
for group_label, group_frame in (
    ('Informative', informative_low_sentiment_sorted),
    ('Misinformative', misinformative_low_sentiment_sorted),
):
    generate_low_wordcloud(group_frame.copy(), group_label)

In [None]:
# First ten rows of each sorted high-sentiment frame
top_informative_rows = informative_high_sentiment_sorted.head(10)
top_misinformative_rows = misinformative_high_sentiment_sorted.head(10)

# Split those rows into parallel lists of n-grams and sentiment scores
top_informative_ngrams = top_informative_rows['ngram'].tolist()
top_informative_scores = top_informative_rows['sentiment_score'].tolist()
top_misinformative_ngrams = top_misinformative_rows['ngram'].tolist()
top_misinformative_scores = top_misinformative_rows['sentiment_score'].tolist()

(top_informative_ngrams, top_misinformative_ngrams)

In [None]:
# First ten rows of each sorted low-sentiment frame
worst_informative_rows = informative_low_sentiment_sorted.head(10)
worst_misinformative_rows = misinformative_low_sentiment_sorted.head(10)

# Split those rows into parallel lists of n-grams and sentiment scores
worst_informative_ngrams = worst_informative_rows['ngram'].tolist()
worst_informative_scores = worst_informative_rows['sentiment_score'].tolist()
worst_misinformative_ngrams = worst_misinformative_rows['ngram'].tolist()
worst_misinformative_scores = worst_misinformative_rows['sentiment_score'].tolist()

(worst_informative_ngrams, worst_misinformative_ngrams)