In [1]:
! pip install pandas
! pip install scikit-learn
! pip install xgboost
! pip install mwclient
! pip install yfinance
! pip install matplotlib
! pip install transformers



In [27]:
import mwclient
import time

site = mwclient.Site('en.wikipedia.org')
page = site.pages['Hermès']

In [28]:
revs = list(page.revisions())

In [29]:
revs[0]

OrderedDict([('revid', 1159933895),
             ('parentid', 1158710120),
             ('user', 'Keisigei'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=6, tm_mday=13, tm_hour=13, tm_min=4, tm_sec=57, tm_wday=1, tm_yday=164, tm_isdst=-1)),
             ('comment', '/* Hermès Frères era */')])

In [30]:
revs = sorted(revs, key=lambda rev: rev["timestamp"]) 

In [31]:
revs[0]

OrderedDict([('revid', 14874635),
             ('parentid', 0),
             ('user', 'Ted Wilkes'),
             ('timestamp',
              time.struct_time(tm_year=2005, tm_mon=6, tm_mday=8, tm_hour=13, tm_min=16, tm_sec=17, tm_wday=2, tm_yday=159, tm_isdst=-1)),
             ('comment', 'made new stub article')])

In [32]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [91]:
edits = {}

for rev in revs:        
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    edits[date]["edit_count"] += 1
    
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

In [117]:
from statistics import mean
for key in edits:
    if "sentiments" in edits[key]:
        if len(edits[key]["sentiments"]) > 0:
            edits[key]["sentiment"] = mean(edits[key]["sentiments"])
            edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
        else:
            edits[key]["sentiment"] = 0
            edits[key]["neg_sentiment"] = 0
        del edits[key]["sentiments"]

In [118]:
edits

{'2005-06-08': {'edit_count': 3,
  'sentiment': 0.17164989312489828,
  'neg_sentiment': 0.3333333333333333},
 '2005-06-19': {'edit_count': 1,
  'sentiment': 0.748120903968811,
  'neg_sentiment': 0.0},
 '2005-06-24': {'edit_count': 1,
  'sentiment': 0.748120903968811,
  'neg_sentiment': 0.0},
 '2005-06-25': {'edit_count': 2,
  'sentiment': 0.8721365034580231,
  'neg_sentiment': 0.0},
 '2005-06-28': {'edit_count': 1,
  'sentiment': -0.9986476302146912,
  'neg_sentiment': 1.0},
 '2005-06-30': {'edit_count': 1,
  'sentiment': -0.9989820122718811,
  'neg_sentiment': 1.0},
 '2005-07-05': {'edit_count': 1,
  'sentiment': 0.9987383484840393,
  'neg_sentiment': 0.0},
 '2005-07-19': {'edit_count': 1,
  'sentiment': 0.9797213077545166,
  'neg_sentiment': 0.0},
 '2005-07-22': {'edit_count': 1,
  'sentiment': -0.9973267316818237,
  'neg_sentiment': 1.0},
 '2005-07-31': {'edit_count': 1,
  'sentiment': 0.9947939515113831,
  'neg_sentiment': 0.0},
 '2005-08-26': {'edit_count': 1,
  'sentiment': 0.748

In [119]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [120]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2005-06-08,3,0.171650,0.333333
2005-06-19,1,0.748121,0.000000
2005-06-24,1,0.748121,0.000000
2005-06-25,2,0.872137,0.000000
2005-06-28,1,-0.998648,1.000000
...,...,...,...
2023-05-13,1,-0.999136,1.000000
2023-05-20,1,-0.962584,1.000000
2023-05-26,3,0.748121,0.000000
2023-06-05,1,-0.991793,1.000000


In [121]:
edits_df.index = pd.to_datetime(edits_df.index)

In [125]:
from datetime import datetime

dates = pd.date_range(start="2005-06-08",end=datetime.today())

In [126]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [127]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2005-06-08,0,0.0,0.0
2005-06-09,0,0.0,0.0
2005-06-10,0,0.0,0.0
2005-06-11,0,0.0,0.0
2005-06-12,0,0.0,0.0
...,...,...,...
2023-07-16,0,0.0,0.0
2023-07-17,0,0.0,0.0
2023-07-18,0,0.0,0.0
2023-07-19,0,0.0,0.0


In [128]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [129]:
rolling_edits = rolling_edits.dropna()

In [130]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2005-07-07,0.0,0.0,0.0
2005-07-08,0.0,0.0,0.0
2005-07-09,0.0,0.0,0.0
2005-07-10,0.0,0.0,0.0
2005-07-11,0.0,0.0,0.0
...,...,...,...
2023-07-16,0.0,0.0,0.0
2023-07-17,0.0,0.0,0.0
2023-07-18,0.0,0.0,0.0
2023-07-19,0.0,0.0,0.0


In [131]:
rolling_edits.to_csv("wikipedia_edits.csv")