In [1]:
import pickle

In [5]:
with open('variables.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
    X, y = pickle.load(f)

In [7]:
X.shape

(3340486, 184)

In [8]:
y.shape

(3340486,)

In [10]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import sklearn.model_selection

c0=sns.color_palette()[0]
c1=sns.color_palette()[1]
c2=sns.color_palette()[2]

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light, 
                cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
    h = .02
    X=np.concatenate((Xtr, Xte))
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    #plt.figure(figsize=(10,6))
    if zfunc:
        p0 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0]
        p1 = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z=zfunc(p0, p1)
    else:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    ZZ = Z.reshape(xx.shape)
    if mesh:
        plt.pcolormesh(xx, yy, ZZ, cmap=cmap_light, alpha=alpha, axes=ax)
    if predicted:
        showtr = clf.predict(Xtr)
        showte = clf.predict(Xte)
    else:
        showtr = ytr
        showte = yte
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cmap_bold, 
               s=psize, alpha=alpha,edgecolor="k")
    # and testing points
    ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cmap_bold, 
               alpha=alpha, marker="s", s=psize+10)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax,xx,yy

def points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light, 
                     cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
    ax,xx,yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False, 
                           colorscale=colorscale, cdiscrete=cdiscrete, 
                           psize=psize, alpha=alpha, predicted=True) 
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=ccolor, alpha=.2, axes=ax)
    cs2 = plt.contour(xx, yy, Z, cmap=ccolor, alpha=.6, axes=ax)
    plt.clabel(cs2, fmt = '%2.1f', colors = 'k', fontsize=14, axes=ax)
    return ax 

print("Loaded")

Loaded


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
Xlr, Xtestlr, ylr, ytestlr = train_test_split(X, y, train_size=0.8, random_state=0)

In [13]:
print("\n")
print("Xlr:", Xlr, type(Xlr), Xlr.shape, len(Xlr)) #TrainX

print("\n")
print("Xtestlr", Xtestlr, type(Xtestlr), Xtestlr.shape, len(Xtestlr)) #TestX 


print("\n")
print("ylr", ylr, type(ylr), ylr.shape, len(ylr)) #Trainy


print("\n")
print("ytestlr", ytestlr, type(ytestlr), ytestlr.shape, len(ytestlr)) #Testy



Xlr: [[-1.88446705e-01  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 7.83166225e-17  1.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.16690797e-01  0.00000000e+00  1.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [-2.27476079e-01  1.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.23525781e-01  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.26754907e-01  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]] <class 'numpy.ndarray'> (2672388, 184) 2672388


Xtestlr [[-0.22749761  1.          0.         ...  0.          0.
   0.        ]
 [-0.11889132  1.          0.         ...  0.          0.
   0.        ]
 [-0.2269056   0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.01278223  1.          0.         ...  0.          0.
   0.        ]
 [-

In [14]:
# construct the LogisticRegression model
clf = LogisticRegression(multi_class='multinomial', solver ='sag', max_iter = 25, verbose = 1, n_jobs = -1)

In [15]:
# Fit the model on the training data.
clf.fit(Xlr, ylr.ravel()) 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 1205 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 20.2min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=25,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=1,
                   warm_start=False)

In [16]:
# predict on training set
y_predict_training = clf.predict(Xlr)
    
# predict on test set
y_predict_test = clf.predict(Xtestlr)

In [17]:
# use sklearn.metrics.classification_report for a more comprehensive
# performance analysis


from sklearn.metrics import classification_report
# ref: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

print("[Training Classification Report]")
print(classification_report(ylr, y_predict_training))

print("[Test Classification Report]")
print(classification_report(ytestlr, y_predict_test))

[Training Classification Report]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

          AU       0.00      0.00      0.00      5966
          CA       0.47      0.03      0.06     15104
          DE       1.00      1.00      1.00     11539
          ES       1.00      1.00      1.00     33134
          FR       1.00      1.00      1.00     72358
          GB       0.00      0.00      0.00     31655
          IT       1.00      1.00      1.00     47985
         NDF       0.90      1.00      0.95   1467019
          NL       1.00      1.00      1.00     10267
          PT       1.00      1.00      1.00      2804
          US       0.94      1.00      0.97    810212
       other       0.59      0.01      0.02    164345

    accuracy                           0.92   2672388
   macro avg       0.74      0.67      0.67   2672388
weighted avg       0.88      0.92      0.88   2672388

[Test Classification Report]
              precision    recall  f1-score   support

          AU       0.00      0.00      0.00      

__CONCLUSIONS__

__1. As seen in the classification report above, in terms of the precision, which shows what percent of the predictions were correct, the model did a good job for DE, ES, FR, IT, NDF, NL, PT, US, and other classes while it didn't do a good job for AU, CA, and GB.__

__2. Similarly, the model correctly predicted the positive observations among all observations for DE, ES, FR, IT, NDF, NL, PT, and US while did a very bad job predicting positive observations for other countries.__ 

__3. In terms of F1 score, again, the model did a good job predicting the positive observations correctly for DE, ES, FR, IT,NDF, NL, PT, and US.__