Using Gemini / created a dictionary .. if-ands & ML

In [2]:
%pip install textstat

Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.10-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.10


In [1]:
pip install nltk



In [2]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [9]:
import re
import pandas as pd
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

try:
    SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')

# ----------------- 1. Feature Engineering Functions -----------------

# Define domain scores on a 0-100 scale
domain_credibility_scores = {
    ".gov": 100.0,
    ".edu": 95.0,
    "nejm.org": 95.0,
    "jamanetwork.com": 90.0,
    "who.int": 100.0,
    ".org": 60.0,
    ".com": 30.0,
    "webmd.com": 50.0,
    "wikipedia.org": 40.0,
    "blogspot.com": 10.0,
    "youtube.com": 10.0,
}

def get_link_score(text_with_link: str) -> float:
    url_pattern = re.compile(r'https?://[^\s]+')
    match = url_pattern.search(text_with_link)
    link = match.group(0) if match else None

    source_score = 0.0
    if link:
        parsed_url = urlparse(link)
        domain = parsed_url.netloc.lower().replace('www.', '')

        if domain in domain_credibility_scores:
            source_score = domain_credibility_scores[domain]
        else:
            tld = '.' + domain.split('.')[-1]
            source_score = domain_credibility_scores.get(tld, 10.0)

    return round(source_score, 2)

def get_string_score(text_with_link: str) -> float:
    sid = SentimentIntensityAnalyzer()

    clean_sentence = re.sub(r'https?://[^\s]+', '', text_with_link)

    sentiment_score = sid.polarity_scores(clean_sentence)['compound']

    string_credibility_score = 100.0 - (abs(sentiment_score) * 90.0)

    return round(string_credibility_score, 2)

# ----------------- 2. Define Training and Testing Data -----------------

# Training data (with human-assigned scores)
training_data = {
    'sentence': [
        "A study from NEJM suggests a link between this drug and reduced heart disease. https://www.nejm.org/some-article-id",
        "My doctor says this supplement will boost my immune system. http://healthylifehacks.com/immune-booster",
        "A new research paper on coffee is available at Harvard. https://harvard.edu/research/coffee-health",
        "This new weight loss method is a miracle, as seen in this video. https://www.youtube.com/watch?v=12345",
        "Vaccines for children are safe and effective, per the CDC. https://www.cdc.gov/vaccinesafety/",
        "Detox tea benefits are explored in this blog. https://detox-guru.blogspot.com/2025/08/tea.html"
    ],
    'human_score': [90.0, 30.0, 95.0, 20.0, 100.0, 40.0]
}
training_df = pd.DataFrame(training_data)

# Testing data (new, unseen data without scores)
testing_data = {
    'sentence': [
        "A breakthrough treatment for cancer is now available, according to this new study. http://www.breakthroughs-today.com/cancer-cure",
        "The World Health Organization published a report on influenza. https://www.who.int/influenza/report",
        "New research suggests probiotics are great for gut health. https://www.science.org/probiotics-study",
        "My secret formula for staying young is available here: http://www.my-blog-for-money.net/secret",
        "Doctors are in agreement about the incredible benefits of this new diet, as reported in this journal. https://www.clinical-journal.com/new-diet"
    ]
}
testing_df = pd.DataFrame(testing_data)

# ----------------- 3. Prepare and Train the Model -----------------

# Prepare training features and labels
training_df['link_score'] = training_df['sentence'].apply(get_link_score)
training_df['string_score'] = training_df['sentence'].apply(get_string_score)
X_train = training_df[['link_score', 'string_score']]
y_train = training_df['human_score']

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# ----------------- 4. Make Predictions on Testing Data -----------------

# Prepare testing features
testing_df['link_score'] = testing_df['sentence'].apply(get_link_score)
testing_df['string_score'] = testing_df['sentence'].apply(get_string_score)
X_test = testing_df[['link_score', 'string_score']]

# Make predictions using the trained model
testing_df['predicted_score'] = model.predict(X_test)

# ----------------- 5. Display Final Results -----------------

print("Trained Model Coefficients:")
print(f"Link Score Weight: {model.coef_[0]:.2f}")
print(f"String Score Weight: {model.coef_[1]:.2f}")

print("\nCredibility Predictions for New Data:")
pd.set_option('display.max_colwidth', None)
print(testing_df[['sentence', 'link_score', 'string_score', 'predicted_score']])

Trained Model Coefficients:
Link Score Weight: 0.88
String Score Weight: -0.00

Credibility Predictions for New Data:
                                                                                                                                          sentence  \
0                A breakthrough treatment for cancer is now available, according to this new study. http://www.breakthroughs-today.com/cancer-cure   
1                                              The World Health Organization published a report on influenza. https://www.who.int/influenza/report   
2                                              New research suggests probiotics are great for gut health. https://www.science.org/probiotics-study   
3                                                   My secret formula for staying young is available here: http://www.my-blog-for-money.net/secret   
4  Doctors are in agreement about the incredible benefits of this new diet, as reported in this journal. https://www.clinical-journa

Using WYN360

In [6]:
import re
from urllib.parse import urlparse
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

try:
    SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')


class CredibilityScorer:
    def __init__(self):
        # Previous initializations remain the same
        self.credible_domains = {
            'nature.com': 1.0, 'science.org': 1.0, 'edu': 0.9,
            'gov': 0.9, 'org': 0.7, 'com': 0.5,
            'thelancet.com': 1.0, 'sciencedirect.com': 0.9,
            'springer.com': 0.9, 'ieee.org': 0.9, 'acm.org': 0.9,
            'nih.gov': 1.0, # Added NIH for testing data
            'clinical-journal.com': 0.8 # Added for testing data
        }

        # Add academic and scientific websites
        self.academic_sites = {
            'researchgate.net', 'academia.edu', 'scholar.google.com',
            'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'jstor.org'
        }

        # Initialize VADER sentiment analyzer
        self.sid = SentimentIntensityAnalyzer()

        # Previous initializations continue...

    def extract_urls(self, text):
        # URL regex pattern
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        return re.findall(url_pattern, text)

    def score_url(self, url):
        try:
            domain = urlparse(url).netloc.lower()
            base_domain = '.'.join(domain.split('.')[-2:])

            # Check if it's an academic site
            if domain in self.academic_sites:
                return 1.0

            # Check credible domains
            for credible_domain, score in self.credible_domains.items():
                if credible_domain in domain:
                    return score

            return 0.3  # Default score for unknown domains

        except:
            return 0.0

    def get_string_score(self, text_with_link: str) -> float:
        clean_sentence = re.sub(r'https?://[^\s]+', '', text_with_link)
        sentiment_score = self.sid.polarity_scores(clean_sentence)['compound']
        # Scale the sentiment score to a 0-100 range, where a neutral or objective
        # tone (sentiment_score close to 0) gets a higher credibility score.
        # Highly positive or negative sentiment reduces the string credibility.
        string_credibility_score = 100.0 - (abs(sentiment_score) * 90.0)
        return round(string_credibility_score, 2)

    def calculate_total_score(self, text):
        link_score = 0.0
        urls = self.extract_urls(text)
        if urls:
            # Get the maximum score from all found URLs
            link_score = max(self.score_url(url) for url in urls) * 100

        string_score = self.get_string_score(text)

        # Combine scores (you can adjust the weights here)
        # This is a simplified combination, you might use a trained model as in the previous cells
        # For this example, let's give more weight to the link score if available
        if urls:
             total_score = (link_score * 0.7) + (string_score * 0.3) # Example weighting
        else:
             total_score = string_score # If no link, rely only on string sentiment

        return round(total_score, 2)


# Example usage with URLs
def main():
    scorer = CredibilityScorer()

    test_strings = [
        "According to a 2024 study published in Nature (https://nature.com/articles/xxx), researchers at MIT found that 87% of renewable energy implementations reduced carbon emissions by an average of 45% (p<0.001).",

        "A comprehensive review on PubMed (https://pubmed.ncbi.nlm.nih.gov/12345) and The Lancet (https://www.thelancet.com/article/789) shows strong evidence for vaccine efficacy.",

        "New research from Stanford University (https://stanford.edu/research/123) and published on arXiv (https://arxiv.org/abs/1234.5678) demonstrates breakthrough in quantum computing.",

        "SHOCKING weight loss discovery! Read more at https://sketchy-diet-pills.com/amazing-results",

        "According to https://weather.com/climate-change, scientists suggest potential changes in weather patterns."
    ]

    for i, text in enumerate(test_strings, 1):
        score = scorer.calculate_total_score(text)
        print(f"\nExample {i}:")
        print(f"Text: {text}")
        print(f"Credibility Score: {score}/100")

        # Print URLs found in the text
        urls = scorer.extract_urls(text)
        if urls:
            print("URLs found:")
            for url in urls:
                print(f"- {url} (Domain score: {scorer.score_url(url)})")

if __name__ == "__main__":
    main()


Example 1:
Text: According to a 2024 study published in Nature (https://nature.com/articles/xxx), researchers at MIT found that 87% of renewable energy implementations reduced carbon emissions by an average of 45% (p<0.001).
Credibility Score: 92.62/100
URLs found:
- https://nature.com/articles/xxx), (Domain score: 1.0)

Example 2:
Text: A comprehensive review on PubMed (https://pubmed.ncbi.nlm.nih.gov/12345) and The Lancet (https://www.thelancet.com/article/789) shows strong evidence for vaccine efficacy.
Credibility Score: 82.49/100
URLs found:
- https://pubmed.ncbi.nlm.nih.gov/12345) (Domain score: 1.0)
- https://www.thelancet.com/article/789) (Domain score: 0.5)

Example 3:
Text: New research from Stanford University (https://stanford.edu/research/123) and published on arXiv (https://arxiv.org/abs/1234.5678) demonstrates breakthrough in quantum computing.
Credibility Score: 100.0/100
URLs found:
- https://stanford.edu/research/123) (Domain score: 0.9)
- https://arxiv.org/abs/1234.

The best model may be a hybrid of these two codes. The first code uses ML, while the second code uses a rule-based scoring system. Here is what a hybrid model would like like:

In [2]:
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer



# ----------------- 1. Hybrid Feature Engineering -----------------

class HybridCredibilityFeatures:
    def __init__(self):
        self.credible_domains = {
            'nature.com': 1.0, 'science.org': 1.0, 'edu': 0.9,
            'gov': 0.9, 'org': 0.7, 'com': 0.5,
            'thelancet.com': 1.0, 'sciencedirect.com': 0.9,
            'springer.com': 0.9, 'ieee.org': 0.9, 'acm.org': 0.9,
            'nih.gov': 1.0, 'clinical-journal.com': 0.8,
            'who.int': 1.0, 'nejm.org': 1.0, 'jamanetwork.com': 0.9,
            'webmd.com': 0.6, 'wikipedia.org': 0.4,
            'blogspot.com': 0.2, 'youtube.com': 0.2
        }
        self.academic_sites = {
            'researchgate.net', 'academia.edu', 'scholar.google.com',
            'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'jstor.org'
        }
        self.sid = SentimentIntensityAnalyzer()

    def extract_urls(self, text):
        url_pattern = r'https?://[^\s]+'
        return re.findall(url_pattern, text)

    def score_url(self, url):
        try:
            domain = urlparse(url).netloc.lower()
            base_domain = '.'.join(domain.split('.')[-2:])

            if domain in self.academic_sites:
                return 1.0
            for credible_domain, score in self.credible_domains.items():
                if credible_domain in domain or credible_domain in base_domain:
                    return score
            return 0.3  # Default for unknown
        except:
            return 0.0

    def get_link_score(self, text):
        urls = self.extract_urls(text)
        if urls:
            return max(self.score_url(url) for url in urls) * 100
        return 0.0

    def get_string_score(self, text):
        clean_sentence = re.sub(r'https?://[^\s]+', '', text)
        sentiment_score = self.sid.polarity_scores(clean_sentence)['compound']
        return round(100.0 - (abs(sentiment_score) * 90.0), 2)

# ----------------- 2. Training & Testing Data -----------------

training_data = {
    'sentence': [
        "A study from NEJM suggests a link between this drug and reduced heart disease. https://www.nejm.org/some-article-id",
        "My doctor says this supplement will boost my immune system. http://healthylifehacks.com/immune-booster",
        "A new research paper on coffee is available at Harvard. https://harvard.edu/research/coffee-health",
        "This new weight loss method is a miracle, as seen in this video. https://www.youtube.com/watch?v=12345",
        "Vaccines for children are safe and effective, per the CDC. https://www.cdc.gov/vaccinesafety/",
        "Detox tea benefits are explored in this blog. https://detox-guru.blogspot.com/2025/08/tea.html"
    ],
    'human_score': [90.0, 30.0, 95.0, 20.0, 100.0, 40.0]
}
train_df = pd.DataFrame(training_data)

test_data = {
    'sentence': [
        "A breakthrough treatment for cancer is now available, according to this new study. http://www.breakthroughs-today.com/cancer-cure",
        "The World Health Organization published a report on influenza. https://www.who.int/influenza/report",
        "New research suggests probiotics are great for gut health. https://www.science.org/probiotics-study",
        "My secret formula for staying young is available here: http://www.my-blog-for-money.net/secret",
        "Doctors are in agreement about the incredible benefits of this new diet, as reported in this journal. https://www.clinical-journal.com/new-diet"
    ]
}
test_df = pd.DataFrame(test_data)

# ----------------- 3. Feature Extraction -----------------

fe = HybridCredibilityFeatures()

train_df['link_score'] = train_df['sentence'].apply(fe.get_link_score)
train_df['string_score'] = train_df['sentence'].apply(fe.get_string_score)

test_df['link_score'] = test_df['sentence'].apply(fe.get_link_score)
test_df['string_score'] = test_df['sentence'].apply(fe.get_string_score)

# ----------------- 4. Normalization -----------------

# Features normalization
feature_scaler = MinMaxScaler()
X_train = feature_scaler.fit_transform(train_df[['link_score', 'string_score']])
X_test = feature_scaler.transform(test_df[['link_score', 'string_score']])

# Target normalization
target_scaler = MinMaxScaler(feature_range=(0,1))
y_train = target_scaler.fit_transform(train_df[['human_score']])

# ----------------- 5. Train Model -----------------

model = LinearRegression()
model.fit(X_train, y_train)

# ----------------- 6. Predictions -----------------

y_pred_scaled = model.predict(X_test)

# Convert predictions back to 0–100 scale
y_pred_rescaled = target_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))
test_df['predicted_score'] = np.clip(y_pred_rescaled, 0, 100)

# ----------------- 7. Results -----------------

print("🔹 Trained Model Coefficients (on normalized scale):")
print(f"   Link Score Weight: {model.coef_[0][0]:.2f}")
print(f"   String Score Weight: {model.coef_[0][1]:.2f}")

print("\n🔹 Credibility Predictions for New Data (0–100 scale):")
pd.set_option('display.max_colwidth', None)
print(test_df[['sentence', 'link_score', 'string_score', 'predicted_score']])


🔹 Trained Model Coefficients (on normalized scale):
   Link Score Weight: 0.84
   String Score Weight: 0.14

🔹 Credibility Predictions for New Data (0–100 scale):
                                                                                                                                          sentence  \
0                A breakthrough treatment for cancer is now available, according to this new study. http://www.breakthroughs-today.com/cancer-cure   
1                                              The World Health Organization published a report on influenza. https://www.who.int/influenza/report   
2                                              New research suggests probiotics are great for gut health. https://www.science.org/probiotics-study   
3                                                   My secret formula for staying young is available here: http://www.my-blog-for-money.net/secret   
4  Doctors are in agreement about the incredible benefits of this new diet, as reported