In [1]:
import mwclient
import time

# * Get Wikipedia pages
site = mwclient.Site("en.wikipedia.org")
page = site.pages["Tesla, Inc."]

In [2]:
# * Get Wikipedia entry revisions & sort in ascending order
revisions = list(page.revisions())

In [4]:
revisions = sorted(revisions, key=lambda rev: rev["timestamp"])

In [6]:
from transformers import pipeline
model_id = "KernAI/stock-news-distilbert"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_id)

def find_sentiment(text):
    """
    Finds sentiment of text
    """
    sentiment = sentiment_pipeline([text[:250]])[0]
    score = sentiment["score"]
    if sentiment["label"].lower() == "negative":
        score *= -1
    if sentiment["label"].lower() == "neutral":
        score *= 0
    return score

  logger.warn(
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [7]:
find_sentiment("I hate you")

-0.979550302028656

In [8]:
find_sentiment("I love you")

0.9959249496459961

In [9]:
find_sentiment("I'm neutral")

0.0

In [10]:
# * Get Wikipedia entry edits with sentiment scores
edits = {}

for rev in revisions:
    try:
        date = time.strftime("%Y-%m-%d", rev["timestamp"])
    
        if date not in edits:
            edits[date] = dict(sentiments=list(), edit_count=0)
    
        edits[date]["edit_count"] += 1
    
        comment = rev.get("comment", rev.get("commenthidden", ""))
        edits[date]["sentiments"].append(find_sentiment(comment))
    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print(inst)          # __str__ allows args to be printed directly,
                             # but may be overridden in exception subclasses
        print(rev)

In [11]:
# * Calculate individual sentiment scores on datetime index period
# yFinance data is limited to a minimum period of 24 hours
# TODO: Get dataset with higher frequency. Ideally, 15min or 30min period.
from statistics import mean

for key in edits:
    try:
        if len(edits[key]["sentiments"]) > 0:
            edits[key]["sentiment"] = mean(edits[key]["sentiments"])
            edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
        else:
            edits[key]["sentiment"] = 0
            edits[key]["neg_sentiment"] = 0
    
        del edits[key]["sentiments"]
    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     
        print(inst)          
        print(edits[key])

In [12]:
edits

{'2006-06-12': {'edit_count': 2,
  'sentiment': 0.7899736166000366,
  'neg_sentiment': 0.0},
 '2006-06-16': {'edit_count': 1,
  'sentiment': 0.7899736166000366,
  'neg_sentiment': 0.0},
 '2006-06-20': {'edit_count': 2,
  'sentiment': 0.7899736166000366,
  'neg_sentiment': 0.0},
 '2006-06-29': {'edit_count': 1,
  'sentiment': 0.7899736166000366,
  'neg_sentiment': 0.0},
 '2006-07-03': {'edit_count': 3,
  'sentiment': 0.5266490777333578,
  'neg_sentiment': 0.0},
 '2006-07-15': {'edit_count': 1, 'sentiment': 0.0, 'neg_sentiment': 0.0},
 '2006-07-16': {'edit_count': 1, 'sentiment': 0.0, 'neg_sentiment': 0.0},
 '2006-07-20': {'edit_count': 6,
  'sentiment': 0.5266490777333578,
  'neg_sentiment': 0.0},
 '2006-07-21': {'edit_count': 10,
  'sentiment': 0.7109762549400329,
  'neg_sentiment': 0.0},
 '2006-07-22': {'edit_count': 3,
  'sentiment': 0.5266490777333578,
  'neg_sentiment': 0.0},
 '2006-07-23': {'edit_count': 1, 'sentiment': 0.0, 'neg_sentiment': 0.0},
 '2006-07-24': {'edit_count': 2, 

In [13]:
# * Create dataframe with complete daily entries within the date range
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [14]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-06-12,2,0.789974,0.0
2006-06-16,1,0.789974,0.0
2006-06-20,2,0.789974,0.0
2006-06-29,1,0.789974,0.0
2006-07-03,3,0.526649,0.0
...,...,...,...
2023-07-28,4,0.000000,0.0
2023-07-29,1,0.789974,0.0
2023-07-30,3,0.000000,0.0
2023-07-31,1,0.000000,0.0


In [15]:
edits_df.index = pd.to_datetime(edits_df.index) # convert index to datetime

In [38]:
from datetime import datetime

default_start_date = datetime.fromtimestamp(datetime.timestamp(edits_df.index[0]))
dates = pd.date_range(start=default_start_date, end=datetime.today()) # create range from start of data to today

In [39]:
dates

DatetimeIndex(['2006-06-12', '2006-06-13', '2006-06-14', '2006-06-15',
               '2006-06-16', '2006-06-17', '2006-06-18', '2006-06-19',
               '2006-06-20', '2006-06-21',
               ...
               '2023-07-23', '2023-07-24', '2023-07-25', '2023-07-26',
               '2023-07-27', '2023-07-28', '2023-07-29', '2023-07-30',
               '2023-07-31', '2023-08-01'],
              dtype='datetime64[ns]', length=6260, freq='D')

In [40]:
edits_df = edits_df.reindex(dates, fill_value=0) # reindex with complete date range

In [41]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-06-12,2,0.789974,0.0
2006-06-13,0,0.000000,0.0
2006-06-14,0,0.000000,0.0
2006-06-15,0,0.000000,0.0
2006-06-16,1,0.789974,0.0
...,...,...,...
2023-07-28,4,0.000000,0.0
2023-07-29,1,0.789974,0.0
2023-07-30,3,0.000000,0.0
2023-07-31,1,0.000000,0.0


In [42]:
# * Add rolling data & clean dataframe
# 30-day mean
rolling_edits = edits_df.rolling(30).mean()

In [43]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-06-12,,,
2006-06-13,,,
2006-06-14,,,
2006-06-15,,,
2006-06-16,,,
...,...,...,...
2023-07-28,1.166667,0.004056,0.05
2023-07-29,1.200000,0.030389,0.05
2023-07-30,1.300000,0.030389,0.05
2023-07-31,1.300000,0.030389,0.05


In [44]:
rolling_edits = rolling_edits.dropna()

In [45]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2006-07-11,0.300000,0.122885,0.00
2006-07-12,0.233333,0.096552,0.00
2006-07-13,0.233333,0.096552,0.00
2006-07-14,0.233333,0.096552,0.00
2006-07-15,0.266667,0.096552,0.00
...,...,...,...
2023-07-28,1.166667,0.004056,0.05
2023-07-29,1.200000,0.030389,0.05
2023-07-30,1.300000,0.030389,0.05
2023-07-31,1.300000,0.030389,0.05


In [46]:
# * Save locally as CSV
rolling_edits.to_csv("wikipedia_edits.csv")