In [164]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import requests
import datetime
import dateutil
import numpy as np
from dateutil.relativedelta import relativedelta
from datetime import datetime
import ast 
from collections import Counter

In [165]:
language = "en"

In [173]:
#df = pd.read_csv("df.csv", converters={'redirects': pd.eval, 'list_views_7_days':pd.eval, 'list_views_before':pd.eval, 'list_views_7_days_redirects':pd.eval, 'categories': pd.eval,})

In [167]:
def queryEventData():
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery("""
    SELECT ?event ?eventLabel ?date ?countryLabel ?start ?end
    WHERE
    {
      ?event  p:P585/ps:P585 ?date.
      OPTIONAL { ?event wdt:P580 ?start. }
      OPTIONAL { ?event wdt:P582 ?end. }
      FILTER(?date < "+2020-12-31T00:00:00Z"^^xsd:dateTime).
      FILTER(?date > "+2020-01-01T00:00:00Z"^^xsd:dateTime).
      ?event wdt:P17 ?country.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    results_df = pd.io.json.json_normalize(results['results']['bindings'])
    df = results_df[['event.value', 'eventLabel.value', 'date.value', 'countryLabel.value', 'start.value', 'end.value']]
    df = df.rename(columns={'event.value': 'event_url', 
                            'eventLabel.value': 'event_label',
                            'date.value': 'date',
                            'countryLabel.value': 'country',
                            'start.value': 'start_date',
                            'end.value': 'end_date'})
    return df

def getDate(date, start_date):
    if pd.isnull(start_date):
        return date
    else:
        return start_date
    
def getEventID(event):
    return event.split("entity/")[-1]

In [62]:
# query event data from dewiki from 2020
df = queryEventData()
# select event_date -- if start_date is set -> start_date else: point_in_time (date)
df['event_date'] = df.apply(lambda row : getDate(row['date'],row['start_date']), axis = 1)
# extract wikidata item id
df['event_id'] = df.apply(lambda row : getEventID(row['event_url']), axis = 1)
df.head()

  results_df = pd.io.json.json_normalize(results['results']['bindings'])


Unnamed: 0,event_url,event_label,date,country,start_date,end_date,event_date,event_id
0,http://www.wikidata.org/entity/Q65077171,62nd Annual Grammy Awards,2020-01-26T00:00:00Z,United States of America,,,2020-01-26T00:00:00Z,Q65077171
1,http://www.wikidata.org/entity/Q66717760,2020 Emilia-Romagna regional election,2020-01-26T00:00:00Z,Italy,,,2020-01-26T00:00:00Z,Q66717760
2,http://www.wikidata.org/entity/Q66717761,2020 Calabrian regional election,2020-01-26T00:00:00Z,Italy,,,2020-01-26T00:00:00Z,Q66717761
3,http://www.wikidata.org/entity/Q67463160,Q67463160,2020-01-26T00:00:00Z,Austria,,,2020-01-26T00:00:00Z,Q67463160
4,http://www.wikidata.org/entity/Q69381902,2020 Peruvian parliamentary election,2020-01-26T00:00:00Z,Peru,,,2020-01-26T00:00:00Z,Q69381902


In [49]:
############################################''######################################################################

In [63]:
def getPagetitle(event_id, language):
    wiki = language+'wiki'
    URL = "https://www.wikidata.org/w/api.php"
    PARAMS = {
        "action": "wbgetentities",
        "format": "json",
        "props":"sitelinks",
        "sitefilter": wiki,
        "ids": event_id
    }
    data = requests.get(url=URL, params=PARAMS).json()
    try:
        title = data['entities'][event_id]['sitelinks'][wiki]['title']
    except:
        title = ""
    return title

In [64]:
df['pagetitle'] = df.apply(lambda row : getPagetitle(row['event_id'], language), axis = 1)
# keep wikidata items with pagetitle 
df = df[df['pagetitle'] != ""]
# drop duplicates
df = df[~df.duplicated(['pagetitle'], keep=False)]

In [None]:
##################################################################################################################

In [17]:
def getRedirects(pagetitle, language):
    URL = "https://"+language+".wikipedia.org/w/api.php"
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": pagetitle,
        "prop": "redirects",
        "rdlimit":"100"
    }
    data = requests.get(url=URL, params=PARAMS).json()
    page = data["query"]["pages"]

    redirects = []
    try:
        for key, value in page.items():
            for redirect in value["redirects"]:
                redirects.append(redirect["title"])
    except:
        redirects = []
    return redirects

In [18]:
df['redirects'] = df.apply(lambda row : getRedirects(row['pagetitle'],language), axis = 1)
df.head()

Unnamed: 0,event_url,event_label,date,country,start_date,end_date,event_date,event_id,pagetitle,redirects
0,http://www.wikidata.org/entity/Q17020570,2021 South Sudanese general election,2015-07-01T00:00:00Z,South Sudan,,,2015-07-01,Q17020570,2023 South Sudanese general election,"[Next South Sudanese presidential election, So..."
1,http://www.wikidata.org/entity/Q20514322,2015 Swedish Open,2015-07-01T00:00:00Z,Sweden,,,2015-07-01,Q20514322,2015 Swedish Open,[]
2,http://www.wikidata.org/entity/Q20639899,2015 Ji'an bus accident,2015-07-01T00:00:00Z,People's Republic of China,,,2015-07-01,Q20639899,2015 Ji'an bus accident,[]
3,http://www.wikidata.org/entity/Q20635386,2015 Chama Cha Mapinduzi presidential primaries,2015-07-01T00:00:00Z,Tanzania,,,2015-07-01,Q20635386,2015 Chama Cha Mapinduzi presidential primaries,"[CCM presidential candidates, 2015, CCM presid..."
4,http://www.wikidata.org/entity/Q20646862,Aleppo offensive (July 2015),2015-07-07T00:00:00Z,Syria,2015-07-02T00:00:00Z,2015-07-07T00:00:00Z,2015-07-02,Q20646862,Aleppo offensive (July 2015),[]


In [None]:
##################################################################################################################

In [14]:
def getPageCreationDate(pagetitle, language):
    URL = "https://"+language+".wikipedia.org/w/api.php"
    PARAMS = {
        "action": "query",
        "prop": "revisions",
        "rvlimit":"1",
        "titles": pagetitle,
        "rvprop": "timestamp",
        "rvdir": "newer",
        'format': 'json',
    }

    data = requests.get(url=URL, params=PARAMS).json()
    try:
        page = next(iter(data["query"]["pages"].values()))
        pagecreation = page["revisions"][0]["timestamp"]
    except:
        pagecreation = ""
    return pagecreation

def calcDiffDays(event_date, page_creation):
    page_creation = datetime.strptime(page_creation,'%Y-%m-%dT%H:%M:%SZ')
    event_date = datetime.strptime(event_date,'%Y-%m-%dT%H:%M:%SZ')
    diff = page_creation - event_date
    return diff.days

In [13]:
df = df[df.page_creation.isnull() == False]

In [15]:
df['page_creation'] = df.apply(lambda row : getPageCreationDate(row['pagetitle'],language), axis = 1)
# calc page creation date - event date 
# positiv if page was created after event happend
# negative if page was created before event happend
df.loc['diff_days'] = df.apply(lambda row : calcDiffDays(row['event_date'],row['page_creation']), axis = 1)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['diff_days'] = df.apply(lambda row : calcDiffDays(row['event_date'],row['page_creation']), axis = 1)


Unnamed: 0,event_url,event_label,date,country,start_date,end_date,event_date,event_id,pagetitle,redirects,page_creation,list_views_7_days,list_views_7_days_redirects,diff_days
0,http://www.wikidata.org/entity/Q17020570,2021 South Sudanese general election,2015-07-01T00:00:00Z,South Sudan,,,2015-07-01T00:00:00Z,Q17020570,2023 South Sudanese general election,"[Next South Sudanese presidential election, So...",2013-07-28T20:05:12Z,[],"[1, 1, 2, 1, 3, 2, 7, 6, 2, 3, 1, 1, 1, 5, 5, ...",-703
1,http://www.wikidata.org/entity/Q20514322,2015 Swedish Open,2015-07-01T00:00:00Z,Sweden,,,2015-07-01T00:00:00Z,Q20514322,2015 Swedish Open,[],2015-06-20T21:48:47Z,"[304, 296, 356, 356, 414, 449, 539]",[],-11
2,http://www.wikidata.org/entity/Q20639899,2015 Ji'an bus accident,2015-07-01T00:00:00Z,People's Republic of China,,,2015-07-01T00:00:00Z,Q20639899,2015 Ji'an bus accident,[],2015-07-02T04:59:36Z,"[60, 507, 103, 148, 143, 200]",[],1
3,http://www.wikidata.org/entity/Q20635386,2015 Chama Cha Mapinduzi presidential primaries,2015-07-01T00:00:00Z,Tanzania,,,2015-07-01T00:00:00Z,Q20635386,2015 Chama Cha Mapinduzi presidential primaries,"[CCM presidential candidates, 2015, CCM presid...",2015-06-30T04:00:05Z,[],"[5, 2, 1, 1, 1, 313, 130, 57, 36, 104, 310, 109]",-1
4,http://www.wikidata.org/entity/Q20646862,Aleppo offensive (July 2015),2015-07-07T00:00:00Z,Syria,2015-07-02T00:00:00Z,2015-07-07T00:00:00Z,2015-07-02T00:00:00Z,Q20646862,Aleppo offensive (July 2015),[],2015-07-07T16:48:02Z,"[777, 1595]",[],5


In [None]:
##################################################################################################################

In [16]:
headers = {"User-Agent": "steinkasserer@student.tugraz.at"}

def extractViews(json_data):
    list_views = []
    if "items" in json_data:
        for i in json_data["items"]:
            list_views.append(i["views"])
    return list_views
            
def getViews(pagetitle, event_date, days, language):   
    from_date = datetime.strptime(event_date,'%Y-%m-%dT%H:%M:%SZ')      
    time_range = dateutil.relativedelta.relativedelta(days=days)
    to_date = from_date + time_range
    
    from_date = from_date.strftime('%Y%m%d')
    to_date = to_date.strftime('%Y%m%d')
    
    url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"+language+".wikipedia.org/all-access/all-agents/"+str(pagetitle)+"/daily/"+from_date+"/"+to_date
    data = requests.get(url=url, headers=headers).json()
    list_views = extractViews(data)
    return list_views

def getViewsRedirects(list_redirects, event_date, days, language):
    from_date = datetime.strptime(event_date,'%Y-%m-%dT%H:%M:%SZ')
    time_range = dateutil.relativedelta.relativedelta(days=days)
    to_date = from_date + time_range
    
    from_date = from_date.strftime('%Y%m%d')
    to_date = to_date.strftime('%Y%m%d')
    
    list_views = []
    for title in list_redirects:
        url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"+language+".wikipedia.org/all-access/all-agents/"+str(title)+"/daily/"+from_date+"/"+to_date
        data = requests.get(url=url, headers=headers).json()
        views = extractViews(data)
        list_views = list_views + views
    return list_views

def getViewsBefore(pagetitle, page_creation, event_date, days, language):
    to_date = datetime.strptime(event_date,'%Y-%m-%dT%H:%M:%SZ')
    from_date = datetime.strptime(page_creation,'%Y-%m-%dT%H:%M:%SZ')
    
    if(from_date == to_date):
        return []
    
    to_date = to_date - dateutil.relativedelta.relativedelta(days=1)
    time_range = dateutil.relativedelta.relativedelta(days=days)
    from_date = to_date - time_range
    
    from_date = from_date.strftime('%Y%m%d')
    to_date = to_date.strftime('%Y%m%d')
    
    url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"+language+".wikipedia.org/all-access/all-agents/"+str(pagetitle)+"/daily/"+from_date+"/"+to_date
    data = requests.get(url=url, headers=headers).json()
    list_views = extractViews(data)
    return list_views

def aggregateViews(list_views,func):
    if len(list_views) > 0:
        return func(list_views)
    else:
        return 0

def increase(list_views):
    first = list_views[0]
    last = list_views[-1]
    return (last-first)

In [19]:
days = 6
# get pageviews store them in list
df['list_views_7_days'] = df.apply(lambda row : getViews(row['pagetitle'],row['event_date'],days,language), axis = 1)
df['list_views_7_days_redirects'] = df.apply(lambda row : getViewsRedirects(row['redirects'],row['event_date'],days,language), axis = 1)
df['list_views_before'] = df.apply(lambda row : getViewsBefore(row['pagetitle'],row['page_creation'],row['event_date'],days,language), axis = 1)

In [24]:
#aggregate pageview lists
df['views_before_mean'] = df.apply(lambda row : aggregateViews(row['list_views_before'],np.mean), axis = 1)
df['views_before_sum'] = df.apply(lambda row : aggregateViews(row['list_views_before'],sum), axis = 1)
df['views_before_max'] = df.apply(lambda row : aggregateViews(row['list_views_before'],max), axis = 1)
df['views_before_min'] = df.apply(lambda row : aggregateViews(row['list_views_before'],min), axis = 1)
df['views_before_median'] = df.apply(lambda row : aggregateViews(row['list_views_before'],np.median), axis = 1)
df['views_before_increase'] = df.apply(lambda row : aggregateViews(row['list_views_before'],increase), axis = 1)

df['views_7_days_sum'] = df.apply(lambda row : aggregateViews(row['list_views_7_days'],sum), axis = 1)
df['views_7_days_mean'] = df.apply(lambda row : aggregateViews(row['list_views_7_days'],np.mean), axis = 1)
df['views_7_redirects_sum'] = df.apply(lambda row : aggregateViews(row['list_views_7_days_redirects'],sum), axis = 1)

df["views_7_sum"] = df['views_7_days_sum'] + df['views_7_redirects_sum']

In [None]:
##################################################################################################################

In [168]:
countries = pd.read_csv("countries.csv", delimiter = ',')
def getRegion(country):
    countries = pd.read_csv("countries.csv", delimiter = ',')
    if countries["name"].isin([country]).any():
        row = countries[countries["name"]==country]
        return row.iloc[0]["economic_region"]
    
def getContinent(country):
    countries = pd.read_csv("countries.csv", delimiter = ',')
    if countries["name"].isin([country]).any():
        row = countries[countries["name"]==country]
        return row.iloc[0]["maxmind_continent"]

In [None]:
df["country"].replace({"United States of America": "United States", "People's Republic of China": "China"}, inplace=True)
df["country"].replace({"Hamburg": "Germany"}, inplace=True)
df["country"].replace({"Republic of Artsakh": "Azerbaijan"}, inplace=True)
df["country"].replace({"Turkish Republic of Northern Cyprus": "Cyprus"}, inplace=True)
df["country"].replace({"Chechen Republic": "Russia"}, inplace=True)
df["country"].replace({"The Bahamas": "Bahamas"}, inplace=True)
df["country"].replace({"The Gambia": "Gambia"}, inplace=True)
df["country"].replace({"Danish Realm": "Faroe Islands"}, inplace=True)
df["country"].replace({"North Macedonia": "Macedonia"}, inplace=True)

In [85]:
df['economic_region'] = df.apply(lambda row : getRegion(row['country']), axis = 1)
df['continent'] = df.apply(lambda row : getContinent(row['country']), axis = 1)

In [170]:
print("south: ", df[df['economic_region'] == "Global South"]["cat"].count())
print("north: ", df[df['economic_region'] == "Global North"]["cat"].count())
print("not assigned: ", df[df['economic_region'].isnull() == True]["cat"].count())
df[df['economic_region'].isnull() == True]

south:  1620
north:  4435
not assigned:  16


Unnamed: 0,pagetitle,event_date,economic_region,continent,cat,categories,views_7_sum,views_before_mean,views_before_sum,views_before_max,views_before_min,views_before_median,views_before_increase,list_views_7_days,diff_days,year,planed,surprising,factor,views_7_days_mean
224,2016 Central Tibetan Administration general el...,2015-10-18T00:00:00Z,,,politics,"[Category:2016 elections in Asia, Category:All...",0,0.0,0,0,0,0.0,0,[],149,2015,not_planed,not_surprising,after,0.0
370,2015 Transnistrian parliamentary election,2015-11-29T00:00:00Z,,,politics,"[Category:2015 elections in Europe, Category:2...",2141,0.0,0,0,0,0.0,0,[],-62,2015,planed,not_surprising,before,0.0
631,International Open Data Day,2016-03-05T00:00:00Z,,,,[Category:All articles lacking reliable refere...,651,0.0,0,0,0,0.0,0,"[324, 287, 16, 10, 5, 5, 4]",0,2016,not_planed,not_surprising,,93.0
1758,2016 Transnistrian presidential election,2016-12-11T00:00:00Z,,,politics,"[Category:2016 elections in Europe, Category:2...",957,0.0,0,0,0,0.0,0,[],-161,2016,planed,not_surprising,before,0.0
1975,International Open Data Day,2017-03-04T00:00:00Z,,,,[Category:All articles lacking reliable refere...,485,65.714286,460,118,20,69.0,71,"[250, 85, 53, 39, 19, 22, 17]",-364,2017,planed,not_surprising,before,69.285714
2101,2017 South Ossetian presidential election,2017-04-09T00:00:00Z,,,politics,"[Category:2017 elections in Europe, Category:2...",5184,0.0,0,0,0,0.0,0,[],-25,2017,planed,not_surprising,before,0.0
2592,Tropical Storm Philippe (2017),2017-10-01T00:00:00Z,,,,"[Category:2017 Atlantic hurricane season, Cate...",0,0.0,0,0,0,0.0,0,[],1230,2017,not_planed,not_surprising,after,0.0
3039,International Open Data Day,2018-03-03T00:00:00Z,,,,[Category:All articles lacking reliable refere...,390,55.571429,389,100,25,57.0,75,"[170, 65, 51, 34, 26, 18, 12]",-728,2018,planed,not_surprising,before,53.714286
3080,2018 Ronde van Drenthe (women's race),2018-03-11T00:00:00Z,,,sports,"[Category:2018 UCI Women's World Tour, Categor...",0,0.0,0,0,0,0.0,0,[],35,2018,not_planed,not_surprising,after,0.0
3678,2018 Barkan Industrial Park shooting,2018-10-07T00:00:00Z,,,disaster,"[Category:2018 murders in Asia, Category:All s...",4117,0.0,0,0,0,0.0,0,"[754, 1745, 682, 333, 261, 207, 135]",0,2018,not_planed,not_surprising,,588.142857


In [None]:
##################################################################################################################

In [None]:
#DE:
category_dict = {'preisverleihung': 'culture',
    'award': 'culture',
    'musik': 'culture', 
    'kriminalfall': 'disaster', 
    'anschlag': 'disaster', 
    'unfall': 'disaster',
    'katastrophe': 'disaster', 
    'konflikt': 'disaster', 
    'polit': 'politics',
    'wahl': 'politics', 
    'parlament': 'politics', 
    'fußball': 'sports', 
    'sport': 'sports',
    'wett': 'sports',
    'rennen': 'sports', 
    'marathon': 'sports',
    'turnier': 'sports', 
    'olymp': 'sports', 
    'rallye': 'sports',
    'biathlon': 'sports',
    'championship': 'sports',
    'darts': 'sports'}

In [144]:
def getCategories(pagetitle, language):
    URL = "https://"+language+".wikipedia.org/w/api.php"
    PARAMS = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        "titles": pagetitle,
        "cllimit":"100"
    }
    data = requests.get(url=URL, params=PARAMS).json()
    page = data["query"]["pages"]
    categories = []
    try:
        for key, value in page.items():
            for cat in value["categories"]:
                categories.append(cat["title"])
    except:
        categories = []
    return categories

def filterCategory(categories,pagetitle):
    category = []
    if categories:
        for cat in categories:
            if ("sport" in cat.lower()) | ("sport" in pagetitle.lower()):
                category.append("sports")
            if ("football" in cat.lower()) | ("football" in pagetitle.lower()):
                category.append("sports")
            if ("music" in cat.lower()) | ("music" in pagetitle.lower()):
                category.append("culture")
            if ("film" in cat.lower()) | ("film" in pagetitle.lower()):
                category.append("culture")
            if ("pageants" in cat.lower()) | ("pageants" in pagetitle.lower()):
                category.append("culture")
            if ("award" in cat.lower()) | ("award" in pagetitle.lower()):
                category.append("culture")
            if ("kill" in cat.lower()) | ("kill" in pagetitle.lower()):
                category.append("disaster")
            if ("crime" in cat.lower()) | ("crime" in pagetitle.lower()):
                category.append("disaster")
            if ("controvers" in cat.lower()) | ("controvers" in pagetitle.lower()):
                category.append("disaster")
            if ("murder" in cat.lower()) | ("murder" in pagetitle.lower()):
                category.append("disaster")
            if ("attack" in cat.lower()) | ("attack" in pagetitle.lower()):
                category.append("disaster")
            if ("disaster" in cat.lower()) | ("disaster" in pagetitle.lower()):
                category.append("disaster")
            if ("accidents" in cat.lower()) | ("accidents" in pagetitle.lower()):
                category.append("disaster")
            if ("incident" in cat.lower()) | ("incident" in pagetitle.lower()):
                category.append("disaster")
            if ("conflict" in cat.lower()) | ("conflict" in pagetitle.lower()):
                category.append("disaster")
            if ("war" in cat.lower()) | ("war" in pagetitle.lower()):
                category.append("disaster")
            if ("fire" in cat.lower()) | ("fire" in pagetitle.lower()):
                category.append("disaster")
            if ("earthquakes" in cat.lower()) | ("earthquakes" in pagetitle.lower()):
                category.append("disaster")
            if ("polit" in cat.lower()) | ("polit" in pagetitle.lower()):
                category.append("politics")
            if ("election" in cat.lower()) | ("election" in pagetitle.lower()):
                category.append("politics")
            if ("parliament" in cat.lower()) | ("parliament" in pagetitle.lower()):
                category.append("politics")
            if ("referendum" in cat.lower()) | ("referendum" in pagetitle.lower()):
                category.append("politics")
            if ("missing person" in cat.lower()) | ("missing person" in pagetitle.lower()):
                category.append("disaster")
            if ("shoot" in cat.lower()) | ("shoot" in pagetitle.lower()):
                category.append("disaster")
            if ("nascar" in cat.lower()) | ("nascar" in pagetitle.lower()):
                category.append("sports")
            if ("wrestling" in cat.lower()) | ("wrestling" in pagetitle.lower()):
                category.append("sports")
            if ("olymp" in cat.lower()) | ("olymp" in pagetitle.lower()):
                category.append("sports")
            if ("competition" in cat.lower()) | ("competition" in pagetitle.lower()):
                category.append("sports")
            if ("basketball" in cat.lower()) | ("basketball" in pagetitle.lower()):
                category.append("sports")
            if ("winter games" in cat.lower()) | ("winter games" in pagetitle.lower()):
                category.append("sports")
            if ("cycl" in cat.lower()) | ("cycle" in pagetitle.lower()):
                category.append("sports")
            if ("championship" in cat.lower()) | ("championship" in pagetitle.lower()):
                category.append("sports")
            if ("cup" in cat.lower()) | ("cup" in pagetitle.lower()):
                category.append("sports")
    most_common = Counter(category).most_common(1)
    if len(most_common) > 0:
        return most_common[0][0]
    else:
        return ""

In [171]:
df['categories'] = df.apply(lambda row : getCategories(row['pagetitle'],language), axis = 1)
df['cat'] = df.apply(lambda row : filterCategory(row['categories'],row['pagetitle']), axis = 1)

In [175]:
print("politics: ", df[df['cat'] == "politics"]["cat"].count())
print("sports: ", df[df['cat'] == "sports"]["cat"].count())
print("culture: ", df[df['cat'] == "culture"]["cat"].count())
print("disaster: ", df[df['cat'] == "disaster"]["cat"].count())
print("not assigned: ", df[df['cat'] == ""]["cat"].count())

politics:  2041
sports:  2284
culture:  627
disaster:  1119
not assigned:  276


In [None]:
##################################################################################################################

In [154]:
def getPlanedValue(diff_days):
    if(diff_days < 0):
        return "planed"
    else:
        return "not_planed"
    
def getSurprisingValue(diff_days):
    if(diff_days == 0 | diff_days == 1):
        return "surprising"
    else:
        return "not_surprising"
    
def getThreeFactorValue(diff_days):
    if(diff_days < 0):
        return "before"
    if(diff_days == 0 | diff_days == 1):
        return "surprising"
    if(diff_days > 1):
        return "after"

In [155]:
df['year'] = pd.DatetimeIndex(df['event_date']).year
df['surprising'] = df.apply(lambda row : getSurprisingValue(row['diff_days']), axis = 1)
df['planed'] = df.apply(lambda row : getPlanedValue(row['diff_days']), axis = 1)
df['factor'] = df.apply(lambda row: getThreeFactorValue(row['diff_days']), axis = 1)

In [None]:
##################################################################################################################

In [156]:
# get values for analysis
df = df[['pagetitle','event_date','economic_region','continent','cat','categories','views_7_sum','views_before_mean','views_before_sum','views_before_max','views_before_min','views_before_median','views_before_increase','list_views_7_days','diff_days','year','planed','surprising','factor','views_7_days_mean']]
df.head()

In [157]:
df.to_csv (r'events_dataframe_en.csv', index = False, header=True)