In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import warnings

# Install a global "ignore" filter for every warning category so that
# warnings do not show up in the cell outputs below.
warnings.simplefilter(action="ignore", category=Warning)

Goal for this kernel: handle the preprocessing steps and learn the Naive Bayes classifier and its variants, nearest neighbour, support vector machines and random forest.

In [None]:
# Path of the African crises CSV relative to the notebook's working directory.
crises_csv_path = "../input/africa-economic-banking-and-systemic-crisis-data/african_crises.csv"
crises = pd.read_csv(filepath_or_buffer=crises_csv_path)

In [None]:
# Preview the first five rows of the loaded dataset.
crises.head(n=5)

Columns: 
**case** A number which denotes a specific country

**cc3** A three letter country code

**country** The name of the country

**year** The year of the observation

**systemic_crisis** "0" means that no systemic crisis occurred in the year and "1" means that a systemic crisis occurred in the year.

**exch_usd** The exchange rate of the country vis-a-vis the USD

**domestic_debt_in_default** "0" means that no sovereign domestic debt default occurred in the year and "1" means that a sovereign domestic debt default occurred in the year

**sovereign_external_debt_default** "0" means that no sovereign external debt default occurred in the year and "1" means that a sovereign external debt default occurred in the year

**gdp_weighted_default** The total debt in default vis-a-vis the GDP

**inflation_annual_cpi** The annual CPI Inflation rate

**independence** "0" means "no independence" and "1" means "independence"

**currency_crises** "0" means that no currency crisis occurred in the year and "1" means that a currency crisis occurred in the year

**inflation_crises** "0" means that no inflation crisis occurred in the year and "1" means that an inflation crisis occurred in the year

**banking_crisis** "no_crisis" means that no banking crisis occurred in the year and "crisis" means that a banking crisis occurred in the year

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
crises.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset's columns.
crises.describe()

Kumar,D.(2018,Dec 25).Introduction to Data Preprocessing in Machine Learning. Retrieved from: https://towardsdatascience.com/introduction-to-data-preprocessing-in-machine-learning-a9fa83a5dc9d

Data processing Steps:
* Handling Null Values
* Standardization
* Handling Categorical Variables
* One-Hot Encoding
* Multicollinearity

In [None]:
# Checking for null values: display only the rows that have at least one
# missing entry in any column.
has_missing = crises.isnull().any(axis="columns")
crises.loc[has_missing]

In [None]:
# Checking for null values: number of missing entries in each column.
crises.isnull().sum(axis=0)

In [None]:
#Checking correlation between factors to systemic_crisis
# Pairwise correlations between the numeric columns, with the 'case' identifier
# excluded. Non-numeric columns are filtered out explicitly because
# DataFrame.corr() in pandas >= 2.0 no longer drops them silently and instead
# fails with "could not convert string to float".
crises.drop(columns=['case']).select_dtypes(include='number').corr()


In [None]:
# Correlate a positional selection of columns (index 3 and indices 5-12)
# with the systemic_crisis target, one column at a time.
target_series = crises['systemic_crisis']
predictor_positions = [3] + list(range(5, 13))


def corr_with_target(column):
    """Return the correlation between one column and systemic_crisis."""
    return column.corr(target_series)


crises.iloc[:, predictor_positions].apply(corr_with_target)

The 3 highest correlation with systemic_crisis are sovereign_external_debt_default(0.249850),exch_usd(0.202687),year(0.197450).


In [None]:
#Separating the features and target
# Identifier columns and the target itself are not used as features.
non_feature_columns = ['case', 'cc3', 'country', 'systemic_crisis']
x = crises.drop(columns=non_feature_columns)
y = crises['systemic_crisis']

In [None]:
# Explanatory variables: every column of crises except case, cc3, country and systemic_crisis.
x

In [None]:
# Dependent variable (target): the systemic_crisis column of crises.
y

In [None]:
#Convert banking_crisis column into encoding using one hot encoding
#Two different ways of creating dummy variables
#x = pd.get_dummies(x)  #Create two column for banking_crisis(one for crisis, another for no_crisis)
# drop_first=True keeps k-1 dummy columns per categorical feature, so
# banking_crisis becomes a single indicator column (0 = crisis, 1 = no_crisis).
# dtype=int pins the indicator to 0/1 integers: pandas >= 2.0 would otherwise
# produce True/False booleans, which no longer matches the encoding described here.
x = pd.get_dummies(x, drop_first=True, dtype=int)
x

pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

drop_first = Whether to get k-1 dummies out of k categorical levels by removing the first level. 


In [None]:
#Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the full feature table, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# mean/std used for scaling. Consider fitting on the training rows only.
scaler.fit(x)
x_transformed = scaler.transform(x)    # plain numpy array
# Wrap the array back into a DataFrame, keeping the original row index and column names
x = pd.DataFrame(data=x_transformed, columns=x.columns, index=x.index)
x

Reference:https://datascience.stackexchange.com/questions/45900/when-to-use-standard-scaler-and-when-normalizer

StandardScaler: It transforms each feature so that it has a mean of 0 and a standard deviation of 1. In short, it standardizes the data. Standardization is useful for data which has negative values. It only centres and rescales the data; it does not change the shape of the distribution, so it does not make the data normally distributed. It is more useful in classification than regression.

Reference:https://stackoverflow.com/questions/23838056/what-is-the-difference-between-transform-and-fit-transform-in-sklearn

1.Fit(): Method calculates the parameters μ and σ and saves them as internal objects.

2.Transform(): Method using these calculated parameters apply the transformation to a particular dataset.

3.Fit_transform(): joins the fit() and transform() method for transformation of dataset.

**Logistic Regression**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Hold out 25% of the rows for testing (fixed seed for a repeatable split),
# then fit a logistic regression with default settings on the training part.
split_parts = train_test_split(x, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts
logReg = LogisticRegression()
logReg.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict on the held-out rows, then report the confusion matrix and accuracy.
y_pred = logReg.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy = {accuracy!s}")

In [None]:
# Raw coefficient array of the fitted logistic regression, followed by the
# feature names in the same order.
print(logReg.coef_)
print(x.columns)
# Pair every feature name with its coefficient (first row of coef_).
for feature_name, coefficient in zip(x.columns, logReg.coef_[0]):
    print("Coefficient of " + feature_name + " : " + str(coefficient))

In [None]:
import math
#Convert log-odds into odds
# Each logistic-regression coefficient is on the log-odds scale; exponentiating
# it gives the corresponding value on the odds scale.
odds = [math.exp(coefficient) for coefficient in logReg.coef_[0]]

#Convert odds into probability
# odds / (1 + odds) maps each value onto the 0-1 scale.
probs = [odd / (1 + odd) for odd in odds]

# The printed numbers are the probability-scale values, not the raw
# coefficients, so they are labelled accordingly.
for feature_name, prob in zip(x.columns, probs):
    print("Probability-scale coefficient of " + feature_name + " : " + str(prob))

Other methods: naive bayes, nearest neighbour, svm, decision trees, boosted trees, random forest, neural network

**Naive Bayes**
https://scikit-learn.org/stable/modules/naive_bayes.html

*GaussianNB* 

*MultinomialNB* 

*ComplementNB*

*BernoulliNB*

*CategoricalNB*

References: 

Implementing 3 Naive Bayes classifiers in scikit-learn. (2018, May 7). Retrieved from: https://hub.packtpub.com/implementing-3-naive-bayes-classifiers-in-scikit-learn/

Das,A.(2018,Sep 25). A comprehensive Naive Bayes Tutorial using scikit-learn. Retrieved from:https://medium.com/@awantikdas/a-comprehensive-naive-bayes-tutorial-using-scikit-learn-f6b71ae84431

Navlani,A. (2018,Dec 4). Naive Bayes Classification using Scikit-learn.Retrieved from:https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn

Naive Bayes classifier assumes that the effect of a particular feature in a class is independent of other features. For example, a loan applicant is desirable or not depending on his/her income, previous loan and transaction history, age, and location. Even if these features are interdependent, these features are still considered independently. This assumption simplifies computation, and that's why it is considered as naive. This assumption is called class conditional independence.

<img src = "https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1543836882/image_3_ijznzs.png">
* P(h): the probability of hypothesis h being true (regardless of the data). This is known as the prior probability of h. (Marginal prob)
* P(D): the probability of the data (regardless of the hypothesis). This is known as the evidence, or marginal likelihood. (Marginal prob of other cond)
* P(h|D): the probability of hypothesis h given the data D. This is known as the posterior probability. (Get using the formula above)
* P(D|h): the probability of data D given that the hypothesis h was true. This is known as the likelihood. (Conditional prob)

Multiple features
<img src = "https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1543836884/image_5_uhsgzr.png">



In [None]:
#GaussianNB
from sklearn.naive_bayes import GaussianNB
# Fit on the training split, then score the predictions for the test split.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of GaussianNB = {accuracy!s}")

In [None]:
#BernoulliNB
from sklearn.naive_bayes import BernoulliNB
# Fit on the training split, then score the predictions for the test split.
bnb = BernoulliNB().fit(X_train, y_train)
y_pred = bnb.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy BernoulliNB = {accuracy!s}")

In [None]:
#MultinomialNB
# from sklearn.naive_bayes import MultinomialNB
# mnb = MultinomialNB()
# mnb.fit(X_train,y_train)
# y_pred = mnb.predict(X_test)
# cm = confusion_matrix(y_test,y_pred)
# print(cm)
# print("Accuracy MultinomialNB = " + str(metrics.accuracy_score(y_test,y_pred)))    #sklearn.metrics.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)

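#ValueError: Input X must be non-negative
#(MultinomialNB only accepts non-negative feature values, but the standardized features contain negative values)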

In [None]:
#ComplementNB
# from sklearn.naive_bayes import ComplementNB
# cnb = ComplementNB()
# cnb.fit(X_train,y_train)
# y_pred = cnb.predict(X_test)
# cm = confusion_matrix(y_test,y_pred)
# print(cm)
# print("Accuracy ComplementNB = " + str(metrics.accuracy_score(y_test,y_pred)))    #sklearn.metrics.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)

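#ValueError: Input X must be non-negative
#(ComplementNB only accepts non-negative feature values, but the standardized features contain negative values)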

In [None]:
# CategoricalNB
# from sklearn.naive_bayes import CategoricalNB
# cnb = CategoricalNB()
# cnb.fit(X_train,y_train)
# y_pred = cnb.predict(X_test)
# cm = confusion_matrix(y_test,y_pred)
# print(cm)
# print("Accuracy CategoricalNB = " + str(metrics.accuracy_score(y_test,y_pred)))    #sklearn.metrics.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)

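#ImportError: cannot import name 'CategoricalNB'
#(CategoricalNB is only available in scikit-learn 0.22 and later, so the installed version is older than that)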

Nearest neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with k = 5: fit on the training split and score on the test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy knn 5 neighbor : {accuracy!s}")


In [None]:
# Test accuracy of k-nearest-neighbours for k = 5, 10, 15, 20 and 25.
accuracy_result = []
for neighbor_count in (5, 10, 15, 20, 25):
    knn = KNeighborsClassifier(n_neighbors=neighbor_count).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy_result += [metrics.accuracy_score(y_test, y_pred)]
print(accuracy_result)


In [None]:
import matplotlib.pyplot as plt
# The x positions are generated with the same range(5, 30, 5) used for the
# n_neighbors sweep, instead of a hand-copied list that could drift out of sync.
neighbor_counts = list(range(5, 30, 5))
plt.plot(neighbor_counts, accuracy_result)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("n_neighbors")
plt.ylabel("Test accuracy")
plt.title("KNN test accuracy by number of neighbours")
plt.show()

Support vector machine

Reference: https://scikit-learn.org/stable/modules/svm.html
The advantages of support vector machines are:
* Effective in high dimensional spaces.
* Still effective in cases where number of dimensions is greater than the number of samples.
* Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
* Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels. 

The disadvantages of support vector machines include:
* If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
* SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data

Setting C: C is 1 by default and it’s a reasonable default choice. If you have a lot of noisy observations you should decrease it. It corresponds to regularize more the estimation.

LinearSVC and LinearSVR are less sensitive to C when it becomes large, and prediction results stop improving after a certain threshold. Meanwhile, larger C values will take more time to train, sometimes up to 10 times longer, as shown by Fan et al. (2008)

Reference: 

Navlani,A.(2019,Dec 28).Support Vector Machines with Scikit-learn.Retrieved from:https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python


Generally, Support Vector Machines is considered to be a classification approach, but it can be employed in both classification and regression problems. It can easily handle multiple continuous and categorical variables. SVM constructs a hyperplane in multidimensional space to separate different classes. SVM generates the optimal hyperplane in an iterative manner, which is used to minimize an error. The core idea of SVM is to find a maximum marginal hyperplane (MMH) that best divides the dataset into classes.

<img src = "https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1526288453/index3_souoaz.png">
**Support Vectors**
Support vectors are the data points, which are closest to the hyperplane. These points will define the separating line better by calculating margins. These points are more relevant to the construction of the classifier.

**Hyperplane**
A hyperplane is a decision plane which separates between a set of objects having different class memberships.

**Margin**
A margin is a gap between the two lines on the closest class points. This is calculated as the perpendicular distance from the line to support vectors or closest points. If the margin is larger in between the classes, then it is considered a good margin, a smaller margin is a bad margin.

The main objective is to segregate the given dataset in the best possible way. The distance between the nearest points of either class is known as the margin. The objective is to select a hyperplane with the maximum possible margin between support vectors in the given dataset. SVM searches for the maximum marginal hyperplane.

SVM uses a kernel trick to transform the input space to a higher dimensional if some problems cannot be solved in a linear hyperplane. Now can separate the points using linear separation. The SVM algorithm is implemented in practice using a kernel. A kernel transforms an input data space into the required form. SVM uses a technique called the kernel trick. Here, the kernel takes a low-dimensional input space and transforms it into a higher dimensional space. In other words, you can say that it converts nonseparable problem to separable problems by adding more dimension to it. It is most useful in non-linear separation problem. Kernel trick helps you to build a more accurate classifier.

**Parameters:**
* *Kernel:* The main function of the kernel is to transform the given dataset input data into the required form. There are various types of functions such as linear, polynomial, and radial basis function (RBF). Polynomial and RBF are useful for non-linear hyperplane. Polynomial and RBF kernels compute the separation line in the higher dimension. In some of the applications, it is suggested to use a more complex kernel to separate the classes that are curved or nonlinear. This transformation can lead to more accurate classifiers.
* *Regularization:* In Python's Scikit-learn, the C parameter is used to control regularization. Here C is the penalty parameter, which represents the misclassification or error term. The misclassification or error term tells the SVM optimization how much error is bearable. This is how you can control the trade-off between decision boundary and misclassification term. A smaller value of C creates a larger-margin hyperplane (more misclassification is tolerated) and a larger value of C creates a smaller-margin hyperplane. (The strength of the regularization is inversely proportional to C)
* *Gamma:* A lower value of Gamma will loosely fit the training dataset, whereas a higher value of gamma will exactly fit the training dataset, which causes over-fitting. In other words, with a low value of gamma the influence of each training point reaches far, so distant points are also considered in calculating the separation line, while with a high value of gamma only nearby points are considered.

In [None]:
from sklearn import svm
#linear kernel
# Fit on the training split, then report the confusion matrix and accuracy on the test split.
linsvm = svm.SVC(kernel="linear").fit(X_train, y_train)
y_pred = linsvm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy linear kernel svm : {accuracy!s}")

In [None]:
#Different kernels: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
#rbf kernel
# Fit on the training split, then report the confusion matrix and accuracy on the test split.
rbfsvm = svm.SVC(kernel="rbf").fit(X_train, y_train)
y_pred = rbfsvm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy rbf kernel svm : {accuracy!s}")

In [None]:
#Different kernels: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
#poly kernel
# Fit on the training split, then report the confusion matrix and accuracy on the test split.
polysvm = svm.SVC(kernel="poly").fit(X_train, y_train)
y_pred = polysvm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy poly kernel svm : {accuracy!s}")

In [None]:
#sigmoid kernel
# Fit on the training split, then report the confusion matrix and accuracy on the test split.
sigsvm = svm.SVC(kernel="sigmoid").fit(X_train, y_train)
y_pred = sigsvm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy sigmoid kernel svm : {accuracy!s}")

In [None]:
# #precomputed kernel
# presvm = svm.SVC(kernel = "precomputed")
# presvm.fit(X_train,y_train)
# y_pred = presvm.predict(X_test)
# cm = confusion_matrix(y_test,y_pred)
# print(cm)
# print("Accuracy precomputed kernel svm : " + str(metrics.accuracy_score(y_test,y_pred)))

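#ValueError: X.shape[0] should be equal to X.shape[1]
#(kernel = "precomputed" expects a square kernel (Gram) matrix of shape (n_samples, n_samples) in fit, not the raw feature matrix)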

In [None]:
#Regularization parameter: C.Must be positive float.
# Sweep C for the polynomial-kernel SVM and record the test accuracy of each fit.
cValues = [0.01,0.2,0.4,0.6,0.8,1.0,1.2,1.4,1.6,1.8,2.0,2.5,3.0,4.0,5.0,6.0]
results = []
for c_value in cValues:
    #poly kernel
    polysvm = svm.SVC(kernel = "poly", C = c_value)
    polysvm.fit(X_train,y_train)
    y_pred = polysvm.predict(X_test)
    # Compute the accuracy once and reuse it for both the printout and the plot data.
    accuracy = metrics.accuracy_score(y_test,y_pred)
    print("Accuracy poly kernel svm with C value = " + str(c_value) + " : "+ str(accuracy))
    results.append(accuracy)
# Label the figure so it can be read without the surrounding code.
plt.plot(cValues,results)
plt.xlabel("C")
plt.ylabel("Test accuracy")
plt.title("Poly-kernel SVM test accuracy by C")
plt.show()

Random Forest

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Navlani,A.(2018,May 16). Understanding Random Forests Classifiers in Python. Retrieved from:https://www.datacamp.com/community/tutorials/random-forests-classifier-python

A forest is comprised of trees. It is said that the more trees it has, the more robust a forest is. Random forests creates decision trees on randomly selected data samples, gets prediction from each tree and selects the best solution by means of voting. It also provides a pretty good indicator of the feature importance.


In [None]:
from sklearn.ensemble import RandomForestClassifier
# A forest of 100 trees. random_state is fixed (0, the same seed used for the
# train/test split) so the fitted forest, its accuracy and its feature
# importances are the same on every run; without it each run differs.
clf=RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
print("Accuracy Random Forest : " + str(metrics.accuracy_score(y_test,y_pred)))

In [None]:
# Print each feature name next to its importance in the fitted random forest.
# The values are feature importances, not coefficients, so they are labelled as such.
for feature_name, importance in zip(x.columns, clf.feature_importances_):
    print("Feature importance of " + feature_name + " : " + str(importance))
