# Questions:
- Question 1:  "Which factors are most associated with Systemic Crises in Africa?"
- Question 2: "At which annual rate of inflation does an Inflation Crisis become a practical certainty?"

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Load Data Frame

In [None]:
# Read the African crises CSV into `df` and preview the first rows.
csv_path = '../input/africa-economic-banking-and-systemic-crisis-data/african_crises.csv'
df = pd.read_csv(csv_path)
df.head()

# Exploratory Data Analysis
## 1. Check Data Types

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

## 2. Check Missing Values

In [None]:
# Count null entries per column.
df.isnull().sum() # no missing values

## 3. Check the Categorical Features
- Cat features: cc3, country, banking_crisis
- Mode
- Unique Values

In [None]:
# Columns whose distinct values are inspected in the next cell.
cat_list = [
    'case',
    'cc3',
    'country',
    'year',
    'systemic_crisis',
    'domestic_debt_in_default',
    'sovereign_external_debt_default',
    'independence',
    'currency_crises',
    'inflation_crises',
    'banking_crisis',
]

In [None]:
# For each listed column, print a header line, its distinct values, then a blank line.
for column_name in cat_list:
    distinct_values = df[column_name].unique()
    print(f'col: {column_name} unique values:', distinct_values, '', sep='\n')

Note: `cc3` and `country` carry the same information: `cc3` is the country code of the country named in `country`.

In [None]:
# banking_crisis: (no_crisis==0 ; crisis==1)
df['banking_crisis'] = df['banking_crisis'].replace(to_replace='no_crisis', value=0)
df['banking_crisis'] = df['banking_crisis'].replace(to_replace='crisis', value=1)
df.head()

# Visualize the Correlation and Distribution

In [None]:
sns.set_theme(style="white")

# Compute the correlation matrix over the numeric columns only.
# The frame still holds string columns (cc3, country); on pandas >= 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so they
# are filtered out explicitly here.
corr = df.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle (hides the mirrored duplicates and the diagonal)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)

To answer question one: the heatmap shows a strong negative correlation between systemic_crisis and exch_usd. Other notable correlations include year, domestic_debt_in_default, and currency_crises. 

# Q1: Feature Importance in order to Classify Systemic Crisis
- Coefficients as Feature Importance
- Random Forest Feature Importance

## First, we need to normalize the data

In [None]:
# List the column labels before adding the normalized columns.
df.columns

In [None]:
# (df-df.min())/(df.max()-df.min())

In [None]:
# Normalize column: year, exch_usd, gdp_weighted_default, inflation_annual_cpi, using MinMax scaling
# Min-max scale year, exch_usd, gdp_weighted_default and inflation_annual_cpi
# into new "<column>_norm" columns: (value - min) / (max - min).
for source_col in ('year', 'exch_usd', 'gdp_weighted_default', 'inflation_annual_cpi'):
    values = df[source_col]
    lowest, highest = values.min(), values.max()
    df[f'{source_col}_norm'] = (values - lowest) / (highest - lowest)

df.head()

In [None]:
# List the column labels again, now including the *_norm columns added above.
df.columns

In [None]:
# Predictor columns fed to both models: the crisis/default flags, `case`,
# and the min-max scaled versions of the continuous columns.
features_list = [
    'case',
    'domestic_debt_in_default',
    'sovereign_external_debt_default',
    'independence',
    'currency_crises',
    'inflation_crises',
    'banking_crisis',
    'year_norm',
    'exch_usd_norm',
    'gdp_weighted_default_norm',
    'inflation_annual_cpi_norm',
]

# Feature matrix and target column.
X = df.loc[:, features_list]
y = df['systemic_crisis']

### Logistic Regression Feature Importance

In [None]:
from sklearn.linear_model import LogisticRegression

# Build a default logistic regression and fit it on the full data set
# (fit returns the estimator itself, so construction and fitting are chained).
lrmodel = LogisticRegression().fit(X, y)

# Use the first row of the fitted coefficient matrix as the importance scores.
coefficient_rows = lrmodel.coef_
importance = coefficient_rows[0]

In [None]:
# Pair each feature with its logistic-regression coefficient and order the
# table from the lowest to the highest coefficient.
feature_importance = pd.DataFrame(
    {'Feature': features_list, 'Coef_Score': importance}
).sort_values(by='Coef_Score', ascending=True)
feature_importance

Now we plot the importance

In [None]:
# Horizontal bars: one per feature, bar length = logistic-regression coefficient.
lr_labels = feature_importance['Feature']
lr_scores = feature_importance['Coef_Score']
plt.barh(lr_labels, lr_scores)
plt.title('Bar Chart of Logistic Regression Coefficients as Feature Importance Scores')
plt.show()

### Random Forest Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor  # kept in scope in case a later cell still uses it

# The section's goal is to *classify* systemic_crisis, so use the classifier
# rather than the regressor that was fitted on the label before.
# A fixed random_state makes the importance scores reproducible between runs.
RFmodel = RandomForestClassifier(random_state=0)

# Fit the model
RFmodel.fit(X, y)

# Get Importance (impurity-based importances exposed by the fitted forest)
RFimportance = RFmodel.feature_importances_

In [None]:
# Pair each feature with its random-forest importance and order the table
# from the least to the most important feature.
RF_feature_importance = pd.DataFrame(
    {'Feature': features_list, 'Imp_Score': RFimportance}
).sort_values(by='Imp_Score', ascending=True)
RF_feature_importance

In [None]:
# Horizontal bars: one per feature, bar length = random-forest importance score.
rf_labels = RF_feature_importance['Feature']
rf_scores = RF_feature_importance['Imp_Score']
plt.barh(rf_labels, rf_scores)
plt.title('Bar Chart of Random Forest Feature Importance')
plt.show()

## Putting those two methods into comparison 

In [None]:
# Draw both importance charts side by side: logistic regression on the left,
# random forest on the right.
plt.figure(figsize=(12, 6))

panels = (
    (feature_importance, 'Coef_Score',
     'Bar Chart of Logistic Regression Coefficients as Feature Importance Scores'),
    (RF_feature_importance, 'Imp_Score',
     'Bar Chart of Random Forest Feature Importance'),
)
for position, (table, score_col, caption) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.barh(y=table['Feature'], width=table[score_col])
    plt.title(caption)

plt.tight_layout()
plt.show()

Both methods agree that banking_crisis is the most important factor influencing Systemic Crisis. The second most important factor is the USD exchange rate (exch_usd). Year is another factor, as is inflation_annual_cpi. 

**To sum up, these are the features that correlated with Systemic Crisis:**
- banking_crisis
- exch_usd
- year
- inflation_annual_cpi


# Q2: "At which annual rate of inflation does an Inflation Crisis become a practical certainty?"

In [None]:
# Preview the first rows of the frame again before the inflation analysis.
df.head()

### Take a look at the Inflation Rate and Inflation Crisis in Algeria

In [None]:
# Scatter Algeria's annual inflation against year, colouring the points by
# whether the row is flagged as an inflation crisis.
algeria_rows = df.country == 'Algeria'
crisis_rows = algeria_rows & (df.inflation_crises == 1)
calm_rows = algeria_rows & (df.inflation_crises == 0)

plt.figure(figsize=(10, 8))
for row_mask, colour, series_label in (
    (crisis_rows, 'tomato', 'Inflation Crisis'),
    (calm_rows, 'mediumseagreen', 'No Inflation Crisis'),
):
    plt.scatter(x=df.year[row_mask],
                y=df.inflation_annual_cpi[row_mask],
                c=colour, label=series_label)

plt.grid()
plt.xticks()

plt.xlabel('Year')
plt.ylabel('Inflation Rate')

plt.title('Inflation Rate and Inflation Crisis of Algeria')
plt.legend()
plt.show()

### Check out all the other countries

In [None]:
# One scatter panel per country: annual inflation against year, coloured by
# whether the row is flagged as an inflation crisis.
country_list = df.country.unique()

# Keep the original 5x3 layout for up to 15 countries, but add rows (and
# figure height) when there are more, instead of failing in plt.subplot.
n_cols = 3
n_rows = max(5, int(np.ceil(len(country_list) / n_cols)))

plt.figure(figsize=(14, 4 * n_rows))
for panel, country in enumerate(country_list, start=1):
    plt.subplot(n_rows, n_cols, panel)

    # Select this country's rows once, then split them by crisis flag.
    country_rows = df[df.country == country]
    in_crisis = country_rows.inflation_crises == 1
    no_crisis = country_rows.inflation_crises == 0

    plt.scatter(x=country_rows.year[in_crisis],
                y=country_rows.inflation_annual_cpi[in_crisis],
                c='tomato', label='Inflation Crisis')
    plt.scatter(x=country_rows.year[no_crisis],
                y=country_rows.inflation_annual_cpi[no_crisis],
                c='mediumseagreen', label='No Inflation Crisis')

    plt.grid()

    plt.xlabel('Year')
    plt.ylabel('Inflation Rate')

    plt.title(f'Inflation Rate and Inflation Crisis of {country}')

# As before, the legend is attached to the last panel drawn; the colours mean
# the same thing in every panel.
plt.legend()
plt.tight_layout()
plt.show()
    

An annual inflation rate of around 20 looks like a good indicator of an inflation crisis in African countries. However, other aspects also influence the crisis, such as year. Zimbabwe's data looks inaccurate or in need of further feature engineering. 