In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install calplot

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels as sm
from statsmodels.graphics.mosaicplot import mosaic
import nltk 
import spacy
from spacy.matcher import PhraseMatcher
import calplot as cplt
import plotly.offline as py
import plotly.express as px
import cufflinks as cf

In [None]:
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [None]:
# Location of the conflict-events CSV inside the Kaggle input directory.
file='/kaggle/input/africa-conflict-19972020/Africa_1997-2020_Jan08.csv'

In [None]:
# The fields are separated by ';', so the separator is passed explicitly.
africa = pd.read_csv(file, sep=';')

# Cleaning and preparing the data

In [None]:
# Show the last rows of the loaded frame.
africa.tail()

In [None]:
# Column dtypes and non-null counts.
africa.info()

In [None]:
# Number of missing values per column, keeping only columns that have at least one.
africa.isnull().sum()[africa.isnull().sum()>0]

In [None]:
africa.drop(columns=['ASSOC_ACTOR_1', 'ASSOC_ACTOR_2', 'ADMIN3', 'TIMESTAMP',
                     'EVENT_ID_NO_CNTY', 'EVENT_ID_CNTY', 'ISO'], inplace=True)

In [None]:
africa.isnull().sum()[africa.isnull().sum()>0]

In [None]:
africa.ACTOR2.fillna(' ', inplace=True)

In [None]:
africa.dropna(inplace=True)

In [None]:
# Convert the coordinates to numbers. Values that cannot be parsed become NaN.
# NOTE(review): the file is ';'-delimited, so the coordinates are presumably written
# with a decimal comma ('12,5'); such text would be coerced to NaN as-is, hence the
# comma -> point replacement first. Confirm against the raw CSV. Values that already
# use a decimal point (or are already numeric) are unaffected.
for coord_col in ('LONGITUDE', 'LATITUDE'):
    coord_text = africa[coord_col].astype(str).str.replace(',', '.', regex=False)
    africa[coord_col] = pd.to_numeric(coord_text, errors='coerce')

### Clean event date

In [None]:
# French month names mapped to month numbers. Built once, not on every call, because
# proper_date is applied to the EVENT_DATE column row by row.
_FRENCH_MONTHS = {'janvier': 1, 'février': 2, 'mars': 3, 'avril': 4, 'mai': 5,
                  'juin': 6, 'juillet': 7, 'août': 8, 'septembre': 9, 'octobre': 10,
                  'novembre': 11, 'décembre': 12}


def proper_date(row):
    """Rewrite '<a>-<French month name>-<b>' as '<a>/<month number>/<b>'.

    Parameters
    ----------
    row : str
        Text with at least three '-'-separated fields whose second field is a
        French month name, e.g. '12-janvier-2020'. The month name is matched
        regardless of letter case and surrounding spaces.

    Returns
    -------
    str
        The first and third fields unchanged, joined with '/' around the month
        number, e.g. '12/1/2020'.

    Raises
    ------
    IndexError
        If `row` has fewer than three '-'-separated fields.
    KeyError
        If the second field is not a known French month name.
    """
    c = row.split('-')
    month_number = _FRENCH_MONTHS[c[1].strip().lower()]
    return f'{c[0]}/{month_number}/{c[2]}'

In [None]:
africa['EVENT_DATE'] = africa['EVENT_DATE'].apply(proper_date)

In [None]:
africa['EVENT_DATE'] = pd.to_datetime(africa['EVENT_DATE'])

In [None]:
africa.info()

## Explore

In [None]:
# Frequency table of one column per cell, starting with the event type.
africa['EVENT_TYPE'].value_counts()

In [None]:
africa['SUB_EVENT_TYPE'].value_counts()

In [None]:
africa['ACTOR1'].value_counts()

In [None]:
africa['ACTOR2'].value_counts()

In [None]:
africa['REGION'].value_counts()

In [None]:
africa['COUNTRY'].value_counts()

In [None]:
africa['ADMIN1'].value_counts()

In [None]:
africa['ADMIN2'].value_counts()

In [None]:
africa['LOCATION'].value_counts()

In [None]:
africa['SOURCE_SCALE'].value_counts()

In [None]:
africa['SOURCE'].value_counts()

In [None]:
africa['TIME_PRECISION'].value_counts()

In [None]:
africa['GEO_PRECISION'].value_counts()

In [None]:
obj_cols = [u  for u in africa.columns if (africa[u].dtype =='object') and (u != 'NOTES')]
int_cols = [u  for u in africa.columns if africa[u].dtype =='int']
float_cols = [u  for u in africa.columns if africa[u].dtype =='float']

In [None]:
print(f'Object columns: {obj_cols}\n\nInteger columns: {int_cols}\n\nFloat columns: {float_cols}')

# Exploratory data analysis

In [None]:
# Summary statistics of four numeric columns.
africa[['YEAR','LATITUDE','LONGITUDE','FATALITIES']].describe()

In [None]:
# Pairwise correlation between the same four columns.
africa[['YEAR','LATITUDE','LONGITUDE','FATALITIES']].corr()

In [None]:
# Most frequent value(s) of every column.
africa.mode()

In [None]:
africa.skew()

In [None]:
africa.kurtosis()

In [None]:
africa1 = africa[africa['TIME_PRECISION'] == 1]

## Visualization

In [None]:
# One histogram per numeric column, laid out on a 3-column grid.
hist_cols = int_cols + float_cols
hist_grid_cols = 3
# Ceiling division: enough rows for every column (a fixed 3x3 grid fails past nine).
hist_grid_rows = max(1, -(-len(hist_cols) // hist_grid_cols))
fig = plt.figure(figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.5)
for i, u in enumerate(hist_cols):
    ax = fig.add_subplot(hist_grid_rows, hist_grid_cols, i+1)
    # Draw on this subplot explicitly instead of relying on the current axes.
    sns.histplot(x=u, bins=20, data=africa, ax=ax)
    ax.set_title('Histogram: ' +u)
plt.show()

In [None]:
# One boxplot per numeric column, laid out on a 3-column grid.
box_cols = int_cols + float_cols
box_grid_cols = 3
# Ceiling division: enough rows for every column (a fixed 3x3 grid fails past nine).
box_grid_rows = max(1, -(-len(box_cols) // box_grid_cols))
fig = plt.figure(figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.5)
for i, u in enumerate(box_cols):
    ax = fig.add_subplot(box_grid_rows, box_grid_cols, i+1)
    # Draw on this subplot explicitly instead of relying on the current axes.
    sns.boxplot(x=u, data=africa, ax=ax)
    ax.set_title('Boxplot: ' +u)
plt.show()

In [None]:
# Share of each event type, drawn as a pie chart; the counts are computed once.
event_type_counts = africa['EVENT_TYPE'].value_counts()
plt.figure(figsize=(10,10))
plt.pie(
    event_type_counts,
    labels=event_type_counts.index,
    startangle=90,
    shadow=True,
)
plt.title('Pie plot.')
plt.legend(title="EVENT TYPE:", loc='upper left')
plt.show()

In [None]:
# Mosaic plot of how often each SOURCE_SCALE value occurs.
source_scale_counts = africa['SOURCE_SCALE'].value_counts()
fig, ax = plt.subplots(figsize=(5,20))
mosaic(
    source_scale_counts,
    ax=ax,
    title='1997-2020 Mosaic',
    horizontal=False,
    axes_label=False,
)
plt.show()

# Other visualization

## Relation between INTER1 (ACTOR1) and INTER2 (ACTOR2)

In [None]:
interaction = pd.pivot_table(africa, values='INTERACTION', index='INTER2', columns='INTER1', aggfunc=np.count_nonzero)

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(interaction, center=0, annot=True, fmt='.5g')
plt.title('Number of conflict INTER1 vs INTER2 between 1997 to 2020.')
plt.xlabel('INTER1 (ACTOR1)')
plt.ylabel('INTER2 (ACTOR2)')
plt.show()

In [None]:
# corr() correlates the columns of the pivot with each other, so both axes of this
# matrix are INTER1 categories (the rows are not INTER2). Compute it once.
interaction_corr = interaction.corr()
# Hide the upper triangle, diagonal included: the matrix is symmetric.
mask = np.triu(np.ones(interaction_corr.shape, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(interaction_corr, center=0, annot=True, fmt='0.3g', mask=mask)
plt.title('Similarity conflict ACTOR1 vs ACTOR2 between 1997 to 2020.')
plt.xlabel('INTER1 (ACTOR1)')
plt.ylabel('INTER1 (ACTOR1)')
plt.show()

In [None]:
fatalities_interaction =  pd.pivot_table(africa, values='FATALITIES', index='INTER2', columns='INTER1', aggfunc=np.sum)

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(fatalities_interaction, center=0, annot=True, fmt='.6g')
plt.title('Fatalities caused by conflict ACTOR1 vs ACTOR2 between 1997 to 2020.')
plt.xlabel('INTER1 (ACTOR1)')
plt.ylabel('INTER2 (ACTOR2)')
plt.show()

In [None]:
# corr() correlates the columns of the pivot with each other, so both axes of this
# matrix are INTER1 categories (the rows are not INTER2). Compute it once.
fatalities_interaction_corr = fatalities_interaction.corr()
# Hide the upper triangle, diagonal included: the matrix is symmetric.
mask1 = np.triu(np.ones(fatalities_interaction_corr.shape, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(fatalities_interaction_corr, center=0, annot=True, fmt='0.3g', mask=mask1)
plt.title('Similarity fatalities ACTOR1 vs ACTOR2 between 1997 to 2020.')
plt.xlabel('INTER1 (ACTOR1)')
plt.ylabel('INTER1 (ACTOR1)')
plt.show()

## In each region, how many fatalities did each actor type cause between 1997 and 2020?

In [None]:
region_actor = pd.pivot_table(africa, values='FATALITIES', index='INTER1', columns='REGION', aggfunc=np.sum)

In [None]:
plt.figure(figsize=(10, 10))
sns.heatmap(region_actor, center=0, annot=True, fmt='.6g')
plt.title('Fatalities in each region caused by ACTOR1 between 1997 to 2020.')
plt.ylabel('ACTOR1')
plt.show()

In [None]:
# Correlation between the region columns of the pivot, computed once and reused.
region_corr = region_actor.corr()
# Mask the upper triangle (diagonal included) of the symmetric matrix.
mask2 = np.triu(np.ones_like(region_corr))
plt.figure(figsize=(10, 8))
sns.heatmap(region_corr, mask=mask2, annot=True, fmt='0.3g', center=0)
plt.title('Similarity fatalities caused by actor1 between region of Africa.')
plt.show()

## For each type of conflict event, how many fatalities did each actor type cause between 1997 and 2020?

In [None]:
event_actor = pd.pivot_table(africa, values='FATALITIES', index='INTER1', columns='EVENT_TYPE', aggfunc=np.sum)

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(event_actor, center=0, annot=True, fmt='.6g')
plt.title('Fatalities in each event_type caused by ACTOR1 between 1997 to 2020.')
plt.ylabel('ACTOR1')
plt.show()

In [None]:
# Correlation between the event-type columns of the pivot, computed once and reused.
event_corr = event_actor.corr()
# Mask the upper triangle (diagonal included) of the symmetric matrix.
mask3 = np.triu(np.ones_like(event_corr))
plt.figure(figsize=(10, 8))
sns.heatmap(event_corr, mask=mask3, annot=True, fmt='0.3g', center=0)
plt.title('Similarity fatalities caused by actor1 between different event_type.')
plt.show()

## In each region, how many fatalities were there for each type of conflict event between 1997 and 2020?

In [None]:
event_region = pd.pivot_table(africa, values='FATALITIES', index='REGION', columns='EVENT_TYPE', aggfunc=np.sum)

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(event_region, center=0, annot=True, fmt='.6g')
plt.title('Fatalities in each region by event_type between 1997 to 2020.')
plt.show()

# Geospatial Analysis

In [None]:
# Keep only the events whose REGION is 'Middle Africa'.
middle_africa = africa[africa['REGION'] == 'Middle Africa']

In [None]:
middle_africa.head()

In [None]:
# Frequency tables for this subset, one column per cell.
middle_africa['EVENT_TYPE'].value_counts()

In [None]:
middle_africa['SUB_EVENT_TYPE'].value_counts()

In [None]:
middle_africa['COUNTRY'].value_counts()

In [None]:
middle_africa['INTER1'].value_counts()

In [None]:
middle_africa['INTERACTION'].value_counts()

In [None]:
# ACTOR1 values that occur more than 100 times in the subset.
middle_africa['ACTOR1'].value_counts()[middle_africa['ACTOR1'].value_counts()>100]

In [None]:
# Columns kept for the map below.
need_cols = ["ACTOR1", "ACTOR2", "REGION", "COUNTRY", "ADMIN1", "ADMIN2",
             "LOCATION", "EVENT_TYPE", "SUB_EVENT_TYPE", "FATALITIES", "YEAR",
             "LONGITUDE","LATITUDE", "INTERACTION"]

In [None]:
ext_data = africa[need_cols]

In [None]:
# Events of 2020 only.
# NOTE(review): year_data is not used by any later cell visible in this notebook.
year_data = ext_data[ext_data['YEAR'] == 2020]

In [None]:
# Animated density map of FATALITIES, with one animation frame per YEAR value.
# The initial view is centred on lon=3, lat=11.
center_point = dict(lon=3, lat=11)
# NOTE(review): range_color=[20, 20] gives the colour scale identical lower and upper
# bounds; confirm this is intended rather than e.g. [0, 20].
# NOTE(review): the frames presumably follow the order in which YEAR values appear in
# ext_data — verify they play chronologically.
figx = px.density_mapbox(ext_data, lat='LATITUDE', lon='LONGITUDE', z="FATALITIES",
                        center = center_point, hover_name='COUNTRY', zoom = 5,
                         range_color= [20, 20] , radius=20,
                        mapbox_style= 'open-street-map', title='Conflict in Africa between 1997 and 2020',
                        animation_frame='YEAR')
# Show the colour bar next to the map.
figx.update(layout_coloraxis_showscale=True)
figx.show()

# Text mining

In [None]:
# Loaded spaCy pipelines keyed by model name, so a model is read from disk once
# instead of on every tokenizer() call.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline `model_name`, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def tokenizer(text, nlp=None):
    """Split `text` into lower-cased lemmas, dropping stop words and punctuation.

    Parameters
    ----------
    text : str
        The text to tokenize.
    nlp : callable, optional
        Pipeline called as ``nlp(text)`` that yields tokens exposing ``lemma_``,
        ``is_stop``, ``is_punct`` and ``is_space``. Defaults to the cached
        "en_core_web_sm" spaCy pipeline.

    Returns
    -------
    list of str
        One stripped, lower-cased lemma per kept token, in document order.
        Whitespace tokens and empty lemmas are skipped, so the list never
        contains ''.
    """
    if nlp is None:
        nlp = _get_pipeline()

    filtered_token = []
    for token in nlp(text):
        # Whitespace tokens are neither stop words nor punctuation; without this
        # check they would come through as empty strings after strip().
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.strip().lower()
        if lemma:
            filtered_token.append(lemma)

    return filtered_token

In [None]:
# Events recorded for Cameroon in 2008.
year_country = africa[(africa['YEAR'] == 2008) & (africa['COUNTRY'] == 'Cameroon')]

In [None]:
# Join every NOTES entry into a single string, then tokenize it.
text1 = ' '.join(year_country['NOTES'])
text = tokenizer(text1)

In [None]:
# All SOURCE entries joined into one string.
source = ' '.join(year_country['SOURCE'])

In [None]:
# All EVENT_TYPE entries joined into one string.
event = ' '.join(year_country['EVENT_TYPE'])

In [None]:
# 'ACTOR1, ACTOR2' for every event, joined into one string.
actors = ' '.join(year_country['ACTOR1'] + ', ' + year_country['ACTOR2']) 

## Text statistics

In [None]:
print(f'Lenght of the text: {len(text)}.')

In [None]:
print(f'Lexical richness of the text: {len(text)/len(set(np.unique(text)))}.')

In [None]:
#wrd = "biya"
#print(f'Count word: {text.count(wrd)}.\nPercentage: {100 * text.count(wrd) / len(text)}')
#plt.figure(figsize=(15,5))

In [None]:
fdist = nltk.FreqDist(text)

In [None]:
fdist

In [None]:
popular_word = list(set(fdist.keys()) - set(fdist.hapaxes()))

In [None]:
pop_word = pd.Series({w:fdist[w] for w in sorted(popular_word)})

In [None]:
fig, ax = plt.subplots(figsize=(10,15))
mosaic(pop_word.sort_values(), ax=ax, horizontal=False, title='Popular words in the Note.')
plt.show()

## Collocation

In [None]:
bigrams = nltk.bigrams(text)

In [None]:
cfd_b = nltk.FreqDist(bigrams)

In [None]:
fig1, ax1 = plt.subplots(figsize=(5, 10), dpi=100)
mosaic({x[0][0]+' '+x[0][1]: x[1] for x in cfd_b.most_common(n=30)}, ax=ax1,  axes_label=False,
       horizontal=False, title='30 most common bigrams.')
plt.show()

In [None]:
trigrams = nltk.trigrams(text)

In [None]:
cfd_t = nltk.FreqDist(trigrams)

In [None]:
cfd_t

In [None]:
cdic={x[0][0] + ' ' + x[0][1]+' '+x[0][2]: x[1] for x in cfd_t.most_common(n=30)}

In [None]:
fig2, ax2 = plt.subplots(figsize=(8, 15), dpi=100)
mosaic(cdic, ax=ax2,  axes_label=False,
       horizontal=False, title='30 most common trigrams.')
plt.show()

In [None]:
# Load the small English spaCy pipeline for the cells below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the pipeline on the raw (untokenized) joined notes.
doc = nlp(text1)

In [None]:
# Display the text with displaCy's entity ('ent') style.
spacy.displacy.render(doc, style='ent')

In [None]:
# Find every occurrence of the listed phrases in `doc` and display each one with
# some surrounding context.
matcher = PhraseMatcher(nlp.vocab)
terms = ["high cost of living"]
patterns = [nlp.make_doc(t) for t in terms]
matcher.add("TerminologyList", None, *patterns)
matches = matcher(doc)
context_size = 15  # tokens of context shown on each side of a match
for match_id, start, end in matches:
    # Clamp the left edge at 0: a negative index would be counted from the end of the
    # doc and give an empty or wrong span for a match near the beginning.
    span = doc[max(start - context_size, 0):end + context_size]
    spacy.displacy.render(span, style='ent', jupyter=True)

# Africa: region

In [None]:
# Events of 2020 in the 'Middle Africa' region; the plots below use this subset only.
sub_region = africa[(africa['YEAR'] == 2020) & (africa['REGION'] == 'Middle Africa')]

In [None]:
sub_region.tail()

In [None]:
# Number of events per INTER1 value.
sns.countplot(x='INTER1', data=sub_region)
plt.show()

In [None]:
# The ten most frequent ACTOR1 values.
_, ax = plt.subplots(figsize=(10,5))
sub_region.ACTOR1.value_counts()[:10].plot(kind='bar', ax=ax, title='10 forces most present in the Middle Africa')
plt.show()

In [None]:
# Number of events per country.
_, ax = plt.subplots(figsize=(10,5))
sub_region.COUNTRY.value_counts().plot(kind='bar', ax=ax, title='Countries in the Middle Africa.')
plt.show()

In [None]:
# Number of events per event type. The title now names the plotted column; it had
# been copied unchanged from the per-country plot.
_, ax = plt.subplots(figsize=(10,5))
sub_region.EVENT_TYPE.value_counts().plot(kind='bar', ax=ax, title='Event types in the Middle Africa.')
plt.show()

In [None]:
fatalities_per_country = sub_region.groupby('COUNTRY')["FATALITIES"].agg('sum')

In [None]:
_, ax = plt.subplots(figsize=(10,5))
fatalities_per_country.plot(kind='bar', ax=ax, title='Total fatalities per country ')
ax.set_ylabel('FATALITIES')
plt.show()

## TIME SERIES

In [None]:
cal_fatalities = sub_region.groupby('EVENT_DATE')['FATALITIES'].agg('sum')

In [None]:
_, ax = plt.subplots(figsize=(15,5))
cal_fatalities.plot(ax=ax)
ax.set_title('Daily fatalities.')
plt.show()

In [None]:
# Weekly, monthly and quarterly totals of the daily series.
# `convention` is dropped: it only applies to a PeriodIndex, has no effect on a
# series indexed by timestamps, and is deprecated in recent pandas versions.
# NOTE(review): recent pandas versions also rename the 'M'/'Q' aliases to 'ME'/'QE';
# the old spellings are kept here for compatibility with older installations.
weekly_fatalities = cal_fatalities.resample('W').sum()
monthly_fatalities = cal_fatalities.resample('M').sum()
Q_fatalities = cal_fatalities.resample('Q').sum()

In [None]:
# Weekly totals as a line plot.
_, ax = plt.subplots(figsize=(15,5))
weekly_fatalities.plot(ax=ax)
ax.set_title('Weekly fatalities.')
plt.show()

In [None]:
# Monthly totals as a line plot.
_, ax = plt.subplots(figsize=(15,5))
monthly_fatalities.plot(ax=ax)
ax.set_title('Monthly fatalities.')
plt.show()

In [None]:
# Quarterly totals as a bar chart.
_, ax = plt.subplots(figsize=(15,5))
Q_fatalities.plot(kind='bar', ax=ax)
ax.set_title('Quarterly fatalities.')
plt.show()

In [None]:
# Difference between each daily total and the previous one in the series.
_, ax = plt.subplots(figsize=(15,5))
cal_fatalities.diff().plot(ax=ax)
ax.set_title('Change rate fatalities.')
plt.show()

In [None]:
# Calendar heatmap of the daily totals. calplot draws on its own new figure and returns
# (fig, axes); the title is passed through `suptitle` so it lands on that figure. The
# earlier `ax.set_title(...)` call targeted the stale `ax` left over from the previous
# cell, so the calendar itself was never titled.
cal_fig, cal_axes = cplt.calplot(cal_fatalities, how=None, figsize=(15,5),
                                 suptitle='Fatalities calendar for Middle Africa region.')
plt.show()