In [3]:
import mwclient    # Media Wiki Client  - library for reading Wikipedia
import time

# Name the wiki host and the article up front, then open the site and
# look the article up in its page collection.
wiki_host = 'en.wikipedia.org'
article_title = 'Bitcoin'

site = mwclient.Site(wiki_host)
page = site.pages[article_title]

In [4]:
# Materialise the page's full revision history into a list so it can be
# indexed and sorted in the following cells.
revs = [*page.revisions()]

In [5]:
# Peek at a single revision: each one is an OrderedDict holding revid, parentid,
# user, timestamp (a time.struct_time) and comment.
revs[0]

OrderedDict([('revid', 1168365247),
             ('parentid', 1168365127),
             ('user', 'Ecangola'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=8, tm_mday=2, tm_hour=10, tm_min=25, tm_sec=58, tm_wday=2, tm_yday=214, tm_isdst=-1)),
             ('comment', '/* Bitcoin in mainstream politics */ fmt')])

In [6]:
# Put the revisions in chronological order (oldest first).
def _revision_time(rev):
    """Return the revision's timestamp, used as the sort key."""
    return rev["timestamp"]


revs = sorted(revs, key=_revision_time)

In [7]:
# After sorting by timestamp, index 0 holds the oldest revision
# (here: the one that created the page, parentid 0):
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [8]:
# Next, we score the sentiment of each revision's edit comment
# and aggregate it per day, together with the number of edits made that day.

In [9]:
# Here we're using sentiment-analysis deep learning model from transformers/pipeline:
# The model can analyze the sentiment, depending on the given text (e.g., "love" - POSITIVE
# sentiment, "hate" - NEGATIVE sentiment, etc.)

from transformers import pipeline
# Pin the checkpoint and revision explicitly instead of relying on the library's
# implicit default: these are the exact values the pipeline reported falling back
# to, so the scores stay reproducible even if the default model changes later.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

def find_sentiment(text, max_chars=250):
    """Return a signed sentiment score for ``text``.

    The text is truncated to its first ``max_chars`` characters and passed to
    ``sentiment_pipeline``. The pipeline's ``score`` is returned as-is for a
    non-negative label and negated when the label is ``"NEGATIVE"``.

    Args:
        text: The string to score (e.g. a revision's edit comment). May be
            empty or ``None``.
        max_chars: Maximum number of leading characters sent to the model.
            Defaults to 250.

    Returns:
        The signed score as a float; ``0.0`` (neutral) when ``text`` is
        ``None``, empty or whitespace-only.
    """
    # A missing/blank comment carries no sentiment. Scoring it anyway would make
    # the model emit an arbitrary non-neutral value that skews the daily averages
    # (and ``None`` would crash on slicing), so treat it as neutral instead.
    if not text or not text.strip():
        return 0.0

    sent = sentiment_pipeline([text[:max_chars]])[0]
    score = sent["score"]
    return -score if sent["label"] == "NEGATIVE" else score

  from .autonotebook import tqdm as notebook_tqdm
2023-08-09 23:02:53.031841: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-09 23:02:53.069231: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-09 23:02:53.070394: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification

In [10]:
# find_sentiment("bitcoin")

In [11]:
# Build a per-day summary of the revision history. For every calendar day
# (keyed as "YYYY-MM-DD") we keep:
#   1) edit_count - how many revisions were made that day
#   2) sentiments - the sentiment score of each of that day's edit comments

edits = {}

for rev in revs:
    # Only the day matters here, so collapse the timestamp to a date string.
    day = time.strftime("%Y-%m-%d", rev["timestamp"])

    # First revision seen for this day -> start an empty record for it.
    day_stats = edits.setdefault(day, dict(sentiments=list(), edit_count=0))

    day_stats["edit_count"] += 1

    # Revisions without a comment are scored as an empty string.
    day_stats["sentiments"].append(find_sentiment(rev.get("comment", "")))

KeyboardInterrupt: 

In [10]:
from statistics import mean

# Collapse each day's list of sentiment scores into two summary numbers:
#   sentiment     - mean score of the day's edit comments
#   neg_sentiment - fraction of the day's comments with a negative score
# The raw list is removed afterwards so every record only holds scalars.
for day_stats in edits.values():
    # pop() instead of del: if this cell is run a second time the list is
    # already gone, and we must skip the day rather than raise KeyError or
    # overwrite the summary computed on the first run.
    day_sentiments = day_stats.pop("sentiments", None)
    if day_sentiments is None:
        continue

    if day_sentiments:
        day_stats["sentiment"] = mean(day_sentiments)
        day_stats["neg_sentiment"] = sum(s < 0 for s in day_sentiments) / len(day_sentiments)
    else:
        # A day recorded without any scored comment is treated as neutral.
        day_stats["sentiment"] = 0
        day_stats["neg_sentiment"] = 0

In [11]:
import pandas as pd

# orient="index": each date key of `edits` becomes a row label and the keys of
# the inner dicts (edit_count, sentiment, neg_sentiment) become the columns.
edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [12]:
# Inspect the result: one row per day on which at least one edit was made.
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930022,0.000000
2009-10-13,2,-0.227499,0.500000
...,...,...,...
2022-08-29,2,-0.125797,0.500000
2022-09-01,1,-0.995530,1.000000
2022-09-02,3,-0.353947,0.666667
2022-09-06,1,0.809337,0.000000


In [13]:
# Parse the "YYYY-MM-DD" string labels into datetimes so the frame can be
# aligned against a daily date range below.
edits_df.index = pd.to_datetime(edits_df.index)

In [14]:
from datetime import datetime

# Daily calendar from the first day that has an edit up to today. The start is
# taken from the data instead of being hard-coded, so it stays correct if the
# notebook is pointed at a different page (for this page it is 2009-03-08).
dates = pd.date_range(start=edits_df.index.min(), end=datetime.today())

In [15]:
# Align the frame to the full daily calendar. Days without any edit get 0 in
# every column, i.e. they count as zero edits and neutral sentiment in the
# averages computed afterwards.
edits_df = edits_df.reindex(dates, fill_value=0)

In [16]:
# Inspect the result: now one row per calendar day, zero-filled where no edit was made.
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2022-09-04,0,0.000000,0.00
2022-09-05,0,0.000000,0.00
2022-09-06,1,0.809337,0.00
2022-09-07,0,0.000000,0.00


In [17]:
# Rolling mean over 30 rows of every column; with one row per day this is a
# 30-day window. min_periods=30 yields NaN until a full window is available.
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [18]:
# Drop the leading rows that are NaN because no full 30-day window exists yet.
rolling_edits = rolling_edits.dropna()

In [19]:
# Inspect the smoothed series (30-day rolling means of each column).
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2022-09-04,0.400000,-0.108480,0.150000
2022-09-05,0.400000,-0.108480,0.150000
2022-09-06,0.433333,-0.081502,0.150000
2022-09-07,0.433333,-0.081502,0.150000


In [21]:
# Persist the smoothed daily features to a CSV file in the current working
# directory; the date index is written along with the three columns.
rolling_edits.to_csv("wikipedia_edits.csv")