In [4]:
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import nltk
nltk.download('vader_lexicon')




[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [6]:

# ----------------- 1. Hybrid Feature Engineering -----------------

class HybridCredibilityFeatures:
    def __init__(self):
        self.credible_domains = {
            'nature.com': 1.0, 'science.org': 1.0, 'edu': 0.9,
            'gov': 0.9, 'org': 0.7, 'com': 0.5,
            'thelancet.com': 1.0, 'sciencedirect.com': 0.9,
            'springer.com': 0.9, 'ieee.org': 0.9, 'acm.org': 0.9,
            'nih.gov': 1.0, 'clinical-journal.com': 0.8,
            'who.int': 1.0, 'nejm.org': 1.0, 'jamanetwork.com': 0.9,
            'webmd.com': 0.6, 'wikipedia.org': 0.4,
            'blogspot.com': 0.2, 'youtube.com': 0.2
        }
        self.academic_sites = {
            'researchgate.net', 'academia.edu', 'scholar.google.com',
            'arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'jstor.org'
        }
        self.sid = SentimentIntensityAnalyzer()

    def extract_urls(self, text):
        url_pattern = r'https?://[^\s]+'
        return re.findall(url_pattern, text)

    def score_url(self, url):
        try:
            domain = urlparse(url).netloc.lower()
            base_domain = '.'.join(domain.split('.')[-2:])

            if domain in self.academic_sites:
                return 1.0
            for credible_domain, score in self.credible_domains.items():
                if credible_domain in domain or credible_domain in base_domain:
                    return score
            return 0.3  # Default for unknown
        except:
            return 0.0

    def get_link_score(self, text):
        urls = self.extract_urls(text)
        if urls:
            return max(self.score_url(url) for url in urls) * 100
        return 0.0

    def get_string_score(self, text):
        clean_sentence = re.sub(r'https?://[^\s]+', '', text)
        sentiment_score = self.sid.polarity_scores(clean_sentence)['compound']
        return round(100.0 - (abs(sentiment_score) * 90.0), 2)

# ----------------- 2. Training & Testing Data -----------------

training_data = {
    'sentence': [
        "A study from NEJM suggests a link between this drug and reduced heart disease. https://www.nejm.org/some-article-id",
        "My doctor says this supplement will boost my immune system. http://healthylifehacks.com/immune-booster",
        "A new research paper on coffee is available at Harvard. https://harvard.edu/research/coffee-health",
        "This new weight loss method is a miracle, as seen in this video. https://www.youtube.com/watch?v=12345",
        "Vaccines for children are safe and effective, per the CDC. https://www.cdc.gov/vaccinesafety/",
        "Detox tea benefits are explored in this blog. https://detox-guru.blogspot.com/2025/08/tea.html"
    ],
    'human_score': [90.0, 30.0, 95.0, 20.0, 100.0, 40.0]
}
train_df = pd.DataFrame(training_data)

test_data = {
    'sentence': [
        "A breakthrough treatment for cancer is now available, according to this new study. http://www.breakthroughs-today.com/cancer-cure",
        "The World Health Organization published a report on influenza. https://www.who.int/influenza/report",
        "New research suggests probiotics are great for gut health. https://www.science.org/probiotics-study",
        "My secret formula for staying young is available here: http://www.my-blog-for-money.net/secret",
        "Doctors are in agreement about the incredible benefits of this new diet, as reported in this journal. https://www.clinical-journal.com/new-diet"
    ]
}
test_df = pd.DataFrame(test_data)

# ----------------- 3. Feature Extraction -----------------

fe = HybridCredibilityFeatures()

train_df['link_score'] = train_df['sentence'].apply(fe.get_link_score)
train_df['string_score'] = train_df['sentence'].apply(fe.get_string_score)

test_df['link_score'] = test_df['sentence'].apply(fe.get_link_score)
test_df['string_score'] = test_df['sentence'].apply(fe.get_string_score)

# ----------------- 4. Normalization -----------------

# Features normalization
feature_scaler = MinMaxScaler()
X_train = feature_scaler.fit_transform(train_df[['link_score', 'string_score']])
X_test = feature_scaler.transform(test_df[['link_score', 'string_score']])

# Target normalization
target_scaler = MinMaxScaler(feature_range=(0,1))
y_train = target_scaler.fit_transform(train_df[['human_score']])

# ----------------- 5. Train Model -----------------

model = LinearRegression()
model.fit(X_train, y_train)

# ----------------- 6. Predictions -----------------

y_pred_scaled = model.predict(X_test)

# Convert predictions back to 0–100 scale
y_pred_rescaled = target_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))
test_df['predicted_score'] = np.clip(y_pred_rescaled, 0, 100)

# ----------------- 7. Results -----------------

print("🔹 Trained Model Coefficients (on normalized scale):")
print(f"   Link Score Weight: {model.coef_[0][0]:.2f}")
print(f"   String Score Weight: {model.coef_[0][1]:.2f}")

print("\n🔹 Credibility Predictions for New Data (0–100 scale):")
pd.set_option('display.max_colwidth', None)
print(test_df[['sentence', 'link_score', 'string_score', 'predicted_score']])

🔹 Trained Model Coefficients (on normalized scale):
   Link Score Weight: 0.84
   String Score Weight: 0.14

🔹 Credibility Predictions for New Data (0–100 scale):
                                                                                                                                          sentence  \
0                A breakthrough treatment for cancer is now available, according to this new study. http://www.breakthroughs-today.com/cancer-cure   
1                                              The World Health Organization published a report on influenza. https://www.who.int/influenza/report   
2                                              New research suggests probiotics are great for gut health. https://www.science.org/probiotics-study   
3                                                   My secret formula for staying young is available here: http://www.my-blog-for-money.net/secret   
4  Doctors are in agreement about the incredible benefits of this new diet, as reported