<a href="https://colab.research.google.com/github/sreebalajisree/Fake_News_Detection/blob/main/Domain_Stats_Extraction_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import requests
from tqdm import tqdm
from urllib.parse import urlparse


In [2]:
# Load the Constraint English splits (train/validation are Excel, test is CSV).
train = pd.read_excel("/content/sample_data/data/Constraint_English_Train.xlsx")
fake_valid = pd.read_excel("/content/sample_data/data/Constraint_English_Val.xlsx")
test = pd.read_csv("/content/sample_data/data/Constraint_English_Test.csv")
# Additional inputs: two external Excel sheets and a pseudo-submission CSV.
# NOTE(review): none of these three frames is used in the cells visible below — confirm they are still needed.
fake_external1 = pd.read_excel("/content/sample_data/data/external_1.xlsx")
fake_external2 = pd.read_excel("/content/sample_data/data/external_2.xlsx")
df_pseudo = pd.read_csv("/content/sample_data/data/pseudo_submission.csv")

In [3]:
# Preview the first two training rows.
train.head(2)

Unnamed: 0,id,tweet,label
0,1.0,The CDC currently reports 99031 deaths. In gen...,real
1,2.0,States reported 1121 deaths a small rise from ...,real


In [4]:
# Pull the tweet text and the label column out of the training frame as
# plain Python lists (row order preserved).
tweets, labels = (list(train[column]) for column in ("tweet", "label"))

In [5]:
def get_domain(row, timeout=180, verbose=True):
  """Return the domain (netloc) behind the first URL found in a tweet.

  The first ``http://`` / ``https://`` URL in ``row["tweet"]`` is fetched with
  ``requests.get`` so that redirects (e.g. shortened links) are followed, and
  the network location of the final URL is returned.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) with a "tweet" entry.
    timeout: Timeout in seconds handed to ``requests.get``.
    verbose: When true (the default), print every resolved domain.

  Returns:
    The ``netloc`` of the expanded URL, or '' when the tweet is not text,
    contains no URL, or the URL could not be resolved.
  """
  tweet = row["tweet"]
  # Non-text cells (e.g. NaN for a missing tweet) cannot contain a URL.
  if not isinstance(tweet, str):
    return ''

  # Raw string: "\s" inside a plain literal is an invalid escape sequence.
  match = re.search(r"(?P<url>https?://[^\s]+)", tweet)
  if match is None:
    return ''

  try:
    r = requests.get(match.group("url"), timeout=timeout)
    domain = urlparse(r.url).netloc
  except Exception:
    # Best effort: any failure to resolve the link yields "no domain".
    # Catching Exception (rather than a bare except) lets KeyboardInterrupt
    # and SystemExit propagate, so a long run can still be stopped.
    return ''

  if verbose:
    print(domain)
  return domain

# Resolve the linked domain for every tweet, row by row (axis 1), in both
# the training and the test frame. Each tweet containing a URL triggers an
# HTTP request inside get_domain, so this cell is slow.
for frame in (train, test):
  frame["domain"] = frame.apply(get_domain, axis=1)


twitter.com
www.thespoof.com
twitter.com
twitter.com
www.wandtv.com
t.co
www.thespoof.com
www.thelancet.com
t.co
www.politifact.com
investors.modernatx.com
t.co
twitter.com
investors.modernatx.com
www.abc.net.au
www.medscape.com
www.thespoof.com
www.medscape.com
www.politifact.com
t.co
www.thespoof.com
twitter.com
twitter.com
www.who.int
news.sky.com
twitter.com
twitter.com
twitter.com
twitter.com
www.cdc.gov
twitter.com
waterfordwhispersnews.com
www.thespoof.com
www.thespoof.com
news.sky.com
www.icmr.gov.in
news.sky.com
www.thespoof.com
twitter.com
gisgmda.maps.arcgis.com
twitter.com
news.sky.com
twitter.com
twitter.com
twitter.com
www.medscape.com
twitter.com
pib.gov.in
twitter.com
www.thespoof.com
arogya.maharashtra.gov.in
news.sky.com
www.cdc.gov
twitter.com
www.medscape.com
news.sky.com
twitter.com
inbministry.blogspot.com
t.co
www.medscape.com
twitter.com
covidtracking.com
twitter.com
www.medrxiv.org
covidactnow.org
twitter.com
twitter.com
www.factchecker.in
twitter.com
t.co
wate

# Getting Unique URL Domains

In [6]:
# Per-row domain columns as plain lists ('' means no domain was resolved).
train_domains = list(train["domain"])
test_domains = list(test["domain"])

# Distinct, non-empty domains seen in each split.
unique_train_domains = [dom for dom in set(train_domains) if dom != '']
unique_test_domains = [dom for dom in set(test_domains) if dom != '']


In [8]:
# Preview the training frame, which now carries the "domain" column.
train.head()

Unnamed: 0,id,tweet,label,domain
0,1.0,The CDC currently reports 99031 deaths. In gen...,real,
1,2.0,States reported 1121 deaths a small rise from ...,real,twitter.com
2,3.0,Politically Correct Woman (Almost) Uses Pandem...,fake,www.thespoof.com
3,4.0,#IndiaFightsCorona: We have 1524 #COVID testin...,real,twitter.com
4,5.0,Populous states can generate large case counts...,real,twitter.com


# Computing "Fake" and "Real" probability values for URL domains

In [9]:
# One {"real": n, "fake": m} counter per distinct non-empty training domain.
label_freq_dic = {dom: {"real": 0, "fake": 0} for dom in unique_train_domains}

for dom, lab in zip(train_domains, labels):
  counts = label_freq_dic.get(dom)
  # Skip rows whose domain is not tracked (e.g. '' when no URL was resolved)
  # or whose label is neither "real" nor "fake". These are the rows the
  # former blanket try/except silently dropped; anything else now surfaces.
  if counts is not None and lab in counts:
    counts[lab] += 1

In [10]:
def calculate_prob(row, key):
  """Return the fraction of a domain's counted mentions that carry `key`.

  Args:
    row: Mapping-like object with numeric "real" and "fake" counts.
    key: Which count to take the share of ("real" or "fake").

  Returns:
    row[key] / (row["real"] + row["fake"]), or 0.0 when both counts are zero.
  """
  total = row["real"] + row["fake"]
  # With no counted mentions the share is undefined; return 0.0 rather than
  # dividing by zero, the same fallback the prob_dic cell of this notebook uses.
  if total == 0:
    return 0.0
  return row[key] / total

def calculate_total(row):
  """Return the combined number of "real" and "fake" mentions in `row`."""
  real_count, fake_count = row["real"], row["fake"]
  return real_count + fake_count
  

In [11]:
# One row per domain: transpose the {domain: {"real", "fake"}} counters and
# expose the former index as a "domain" column.
label_freq_df = (
    pd.DataFrame(label_freq_dic).T
    .reset_index()
    .rename(columns={"index": "domain"})
)

# Per-domain label shares and overall mention count, computed row by row.
label_freq_df["real_probability"] = label_freq_df.apply(lambda x: calculate_prob(x, "real"), axis=1)
label_freq_df["fake_probability"] = label_freq_df.apply(lambda x: calculate_prob(x, "fake"), axis=1)
label_freq_df["total_mentions"] = label_freq_df.apply(lambda x: calculate_total(x), axis=1)

# Most-mentioned domains first.
label_freq_df = label_freq_df.sort_values("total_mentions", ascending=False)

# Drop the raw counters. The keyword form is required: passing the axis to
# DataFrame.drop positionally was deprecated in pandas 1.3 and removed in 2.0.
label_freq_df = label_freq_df.drop(columns=["real", "fake"])
label_freq_df.head()

  import sys


Unnamed: 0,domain,real_probability,fake_probability,total_mentions
183,twitter.com,0.838903,0.161097,1167
134,news.sky.com,1.0,0.0,274
17,www.thespoof.com,0.0,1.0,253
126,www.medscape.com,1.0,0.0,231
62,t.co,0.994845,0.005155,194


In [12]:
# Index the stats by domain, then transpose so that to_dict produces
# {domain: {column name: value}}.
domain_indexed = label_freq_df.set_index('domain')
dic = domain_indexed.T.to_dict(orient='dict')

In [14]:
import json

# Write the {domain: stats} mapping to disk as JSON.
with open('/content/sample_data/data/train_prob_vectors_domain.json', 'w') as out_file:
    json.dump(dic, out_file)

In [17]:
# Display the full {domain: stats} mapping that was written to JSON above.
dic

{'twitter.com': {'real_probability': 0.8389031705227078,
  'fake_probability': 0.1610968294772922,
  'total_mentions': 1167.0},
 'news.sky.com': {'real_probability': 1.0,
  'fake_probability': 0.0,
  'total_mentions': 274.0},
 'www.thespoof.com': {'real_probability': 0.0,
  'fake_probability': 1.0,
  'total_mentions': 253.0},
 'www.medscape.com': {'real_probability': 1.0,
  'fake_probability': 0.0,
  'total_mentions': 231.0},
 't.co': {'real_probability': 0.9948453608247423,
  'fake_probability': 0.005154639175257732,
  'total_mentions': 194.0},
 'www.politifact.com': {'real_probability': 0.0,
  'fake_probability': 1.0,
  'total_mentions': 123.0},
 'www.cdc.gov': {'real_probability': 1.0,
  'fake_probability': 0.0,
  'total_mentions': 102.0},
 'covidtracking.com': {'real_probability': 1.0,
  'fake_probability': 0.0,
  'total_mentions': 93.0},
 'newsthump.com': {'real_probability': 0.0,
  'fake_probability': 1.0,
  'total_mentions': 68.0},
 'pib.gov.in': {'real_probability': 1.0,
  'fak

In [18]:
# Same per-domain statistics as label_freq_df, built directly from the raw
# counters as a plain dict.
prob_dic = {}

for domain, counts in label_freq_dic.items():
  total = counts["real"] + counts["fake"]
  if total:
    fake_prob = counts["fake"] / total
    real_prob = counts["real"] / total
  else:
    # No counted mentions: both shares fall back to 0. This explicit check
    # replaces the bare except that previously caught the division by zero.
    fake_prob = real_prob = 0
  prob_dic[domain] = {"fake_probability": fake_prob, "real_probability": real_prob, "total_mentions": total}

# prob_dic = {k: v for k, v in sorted(x.items(), key=lambda item: item[1])}

In [19]:
import collections


def _mention_count(item):
  """Sort key: the total_mentions value of a (domain, stats) pair."""
  return item[1]["total_mentions"]


# Domains ordered from most to fewest mentions; ties keep prob_dic order.
ranked_items = sorted(prob_dic.items(), key=_mention_count, reverse=True)
final_dic = collections.OrderedDict(ranked_items)


In [20]:
# Display the domains ordered by total mentions, highest first.
final_dic

OrderedDict([('twitter.com',
              {'fake_probability': 0.1610968294772922,
               'real_probability': 0.8389031705227078,
               'total_mentions': 1167}),
             ('news.sky.com',
              {'fake_probability': 0.0,
               'real_probability': 1.0,
               'total_mentions': 274}),
             ('www.thespoof.com',
              {'fake_probability': 1.0,
               'real_probability': 0.0,
               'total_mentions': 253}),
             ('www.medscape.com',
              {'fake_probability': 0.0,
               'real_probability': 1.0,
               'total_mentions': 231}),
             ('t.co',
              {'fake_probability': 0.005154639175257732,
               'real_probability': 0.9948453608247423,
               'total_mentions': 194}),
             ('www.politifact.com',
              {'fake_probability': 1.0,
               'real_probability': 0.0,
               'total_mentions': 123}),
             ('www.cdc.gov',
   