# List of Riots Data Extraction from Wikipedia.org 

In [1]:
import pandas as pd
import wikipedia as wp

#### Getting the data and splitting it into lines

In [2]:
# Fetch the plain-text content of the "List of riots" article and break it
# into individual lines (str.splitlines already yields a list).
text_data = wp.page("List of riots").content.splitlines()

#### Cleaning the Data

In [3]:
# Year tokens (as strings) that may open an event line: 2006 through 2025.
years = [str(year_number) for year_number in range(2006, 2026)]

# Month tokens that may open an event line, each in its plain form followed
# by the variant carrying a trailing colon.
months = [
    month_name + suffix
    for month_name in (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )
    for suffix in ('', ':')
]

# Accumulator for the event lines selected further down.
el = []

In [4]:
# Keep only the non-empty lines.
clean_text_data = [line for line in text_data if line]

In [5]:
# Collect the event lines into `el`.
# The list is rebuilt here so that re-running this cell does not append the
# same events a second time.
el = []
# Year taken from the most recent "==== ... ====" heading; None until one is seen.
year = None
for item in clean_text_data:
    tl = item.split(' ')
    if tl[0] in years:
        # The line already starts with its year: keep it unchanged.
        el.append(item)
    elif tl[0] == "====":
        # Heading line: the token after the marker becomes the current year.
        # A bare "====" with nothing after it is ignored instead of raising.
        if len(tl) > 1:
            year = tl[1]
    elif tl[0] in months and year is not None:
        # The line starts with a month: prefix it with the current year so it
        # has the same "<year> - <text>" layout as the other entries.
        # Month lines seen before any heading have no year and are skipped.
        el.append(year + " - " + item)

In [6]:
# Split every entry on single spaces: the first token is the year, the second
# token is dropped, and the remaining tokens form the description.
tly = [entry.split(' ')[0] for entry in el]
tlt = [entry.split(' ')[2:] for entry in el]

In [7]:
# Join each token list back into a single sentence, in one pass over `tlt`.
# The previous loop rebuilt the complete list once per entry (n copies of n
# sentences) and never used its loop variable.
# The result is one outer list holding one-element lists, so that flattening
# it twice (as the next cell does) yields exactly one string per entry.
tlt2 = [[[' '.join(tokens)] for tokens in tlt]]

In [8]:
import itertools
# Remove two levels of nesting: tlt2 -> tlt3 -> tlt4 (flat list of strings).
tlt3 = list(itertools.chain(*tlt2))
tlt4 = list(itertools.chain(*tlt3))

Finally, we have the list of years `tly` and the list of texts `tlt4`. Next, we build a DataFrame from the two.

In [9]:
# Pair every year with its text; zip stops at the end of the shorter list.
year_text_pairs = list(zip(tly, tlt4))
df = pd.DataFrame(year_text_pairs, columns=['Year', 'Text'])

In [10]:
# Show the last five rows of the frame.
df.tail(5)

Unnamed: 0,Year,Text
382,2020,"August 29: Riots in Malmö, Sweden: about 300 p..."
383,2020,"September 25: DR Congo jail riots, mass rape o..."
384,2020,"October 25: Riots in Nigeria, at least 12 peop..."
385,2020,"October 29: Prison riots in Herat, Afghanistan..."
386,2020,December 7-9: Riots in Indonesia by followers ...


#### Using NLP to extract the countries and other important information

In [11]:
import en_core_web_sm
# Load the pipeline shipped in the en_core_web_sm package. `nlp` is called on
# raw strings in the following cells to read their named entities (`.ents`).
nlp = en_core_web_sm.load()



In [12]:
# Try the pipeline on the first record: print the text, then every entity
# whose label is GPE.
text = nlp(df['Text'][0])
print(text)
for ent in text.ents:
    if ent.label_ != "GPE":
        continue
    print(f"{ent.text} - {ent.label_}")

Stanley Cup Western Conference Finals (Edmonton Oilers victory), May 2006, Edmonton, Alberta, Canada
Edmonton - GPE
Alberta - GPE
Canada - GPE


In [13]:
# One list of extracted entities per row of the Text column.
df['Entities'] = [list(nlp(row_text).ents) for row_text in df['Text']]

In [14]:
from geotext import GeoText
# Parse the first record with GeoText.
# NOTE(review): `places` is not read again in the visible cells — confirm
# whether this exploratory line is still needed.
places = GeoText(df.Text[0])

In [15]:
# One list of the countries GeoText reports per row of the Text column.
df['Countries'] = [list(GeoText(row_text).countries) for row_text in df['Text']]

In [16]:
# One list of the cities GeoText reports per row of the Text column.
df['Cities'] = [list(GeoText(row_text).cities) for row_text in df['Text']]

In [17]:
# Columns that get converted to plain strings (the list-valued columns become
# their string representation) so that the text cleaning can run on them.
l = ['Year', 'Text', 'Entities', 'Countries', 'Cities']
df = df.astype(dict.fromkeys(l, 'str'))

#### Cleaning the text data after extraction

In [18]:
# Importing regular expressions
import re

In [19]:
def cleanData(series):
    """Normalise a Series of strings for later matching.

    Each value is lower-cased, every character that is not a lowercase
    letter, a digit or whitespace is removed, and leading/trailing
    whitespace is stripped.

    Parameters
    ----------
    series : pandas.Series
        Series whose values are reachable through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned strings; the input is not modified.
    """
    # regex=True is passed explicitly: since pandas 2.0 Series.str.replace
    # treats the pattern as a literal string by default, in which case the
    # character class below would never match and nothing would be removed.
    return (
        series.str.lower()
        .str.replace(r"[^a-z0-9\s]", "", regex=True)
        .str.strip()
    )

In [20]:
# Run cleanData over each of the listed columns (column by column).
df[l] = df[l].apply(cleanData)

In [21]:
# Show the first five rows after cleaning.
df.head(5)

Unnamed: 0,Year,Text,Entities,Countries,Cities
0,2006,stanley cup western conference finals edmonton...,stanley cup western conference finals edmonton...,canada,edmonton
1,2006,cartoon riots,,,
2,2006,2006 nukualofa riots november 16 nukualofa tonga,2006 november 16 tonga,tonga,
3,2006,2006 dublin riots february 25 dublin ireland,2006 dublin february 25 dublin ireland,ireland,dublin dublin
4,2006,san bernardino punk riot march 4 san bernardin...,san bernardino march 4 san bernardino california,,san bernardino march san bernardino


In [23]:
# Write the frame to an Excel workbook using the xlsxwriter engine.
df.to_excel(excel_writer='riotsListWiki.xlsx', engine='xlsxwriter')

Next, the rows whose country was not detected have to be checked and filled in manually in Excel.