# Trade agreement classifier using Support Vector Machine and XGBoost
### Follow [Example 1](https://github.com/shreyans29/thesemicolon/blob/master/Text%20Analytics%20tfidf.ipynb), [Example 2](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)  ([Youtube](https://www.youtube.com/watch?v=bPYJi1E9xeM))
### Also see [countvectorizer example](http://adataanalyst.com/scikit-learn/countvectorizer-sklearn-example/) 

In [1]:
# from sklearn.datasets import fetch_20newsgroups
# twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [2]:
import os
import scipy
import eli5
from eli5.lime import TextExplainer
from eli5.sklearn import PermutationImportance

from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline

from fastai.text import *

import pandas as pd
from pandas import Series
from pandas import crosstab as tab

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from xgboost import XGBClassifier

path = "/Users/renjiege/Dropbox/DEEP PTA - SEC2/quantification/sec2_parse"
file = 'test.dta'
full_file = os.path.join(path, file)
df = pd.read_stata(full_file)

In [3]:
import numpy as np
df = df.filter(items=['TypeofRestriction', 'MeasureText'])
df['type1'] = Series(df.TypeofRestriction.str.contains('MFN', regex=True))
df['type1_num'] = np.where(df['type1']==True,1,0)

In [4]:
tab(df.type1,'')

type1
False    1519
True      655
Name: __dummy__, dtype: int64

In [5]:
tf = TfidfVectorizer(min_df=1,stop_words='english')
tf_transformer = TfidfTransformer()
cv = CountVectorizer(stop_words='english')

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df.MeasureText, df.type1_num, test_size=0.2, random_state=4)

In [7]:
CLAS_PATH = Path('/Users/renjiege/Documents/data/fastai/imdb_clas/')

In [8]:
col_names = ['labels','text']
df_trn = pd.DataFrame({'text':x_train, 'labels':y_train}, columns=col_names)
df_val = pd.DataFrame({'text':x_test, 'labels':y_test}, columns=col_names)

df_trn.to_csv(CLAS_PATH/'train.csv', header=False, index=False)
df_val.to_csv(CLAS_PATH/'test.csv', header=False, index=False)

In [9]:
df.MeasureText.size

2174

In [10]:
x_train.size

1739

In [11]:
x_train.values

array(['"Chile reserves the right to adopt or maintain any measure with respect to the provision of public law enforcement and correctional services, and the following services to the extent that they are social services established or maintained for a public purpose: income security or insurance, social security or insurance, social welfare, public education, public training, health, and child care."\n',
       '"Investment\n\nThe operator of Mount Isa Mines shall, so far as is reasonably and economically practicable:\n  (a) use the services of professional consultants resident and available within Queensland;\n  (b) use labour available within Queensland;\n  (c) when preparing specifications, calling for tenders and letting contracts for works, materials, plant, equipment and supplies ensure that Queensland suppliers, manufacturers, and contractors are given reasonable opportunity to tender or quote; and\n  (d) give proper consideration and where possible preference to Queensland sup

In [12]:
df.MeasureText[1]

'"Only qualified lawyers who are citizens of Bahrain or the Gulf Cooperation Council (GCC) Member States and licensed by the Bahraini authorities may supply legal services in the territory of Bahrain, including representing clients before law courts, arbitration tribunals, police departments, and administrative commissions of a judicial nature in the territory of Bahrain.  \n\nNotwithstanding paragraph 1, non-Bahraini and non-GCC lawyers, resident in Bahrain, may supply legal services in the territory of Bahrain, other than representing clients before law courts, arbitration tribunals, police departments, and administrative commissions of a judicial nature, if employed by a lawyer licensed to practice law in Bahrain.\n\nFor greater certainty, legal consultants of the other Party, either as individuals or firms, may supply legal services in Bahrain, with respect to the laws of countries other than Bahrain, upon obtaining a license from the Bahraini authorities, if the individual providi

In [13]:
# x_train_tfidf.toarray()

In [14]:
tf.fit(df.MeasureText)
x_train_tf = tf.transform(x_train)
x_test_tf = tf.transform(x_test)

In [15]:
SVD = TruncatedSVD(algorithm='randomized', n_components=300)

In [12]:
x_train_SVD = SVD.fit_transform(x_train_tf)
x_test_SVD = SVD.fit_transform(x_test_tf)

#### Multinomial Naive Bayes and Random Forest Classifier

In [44]:
NB = MultinomialNB()
RF = RandomForestClassifier(n_estimators=100)

#### Support vector machine Classifier

In [34]:
SVM = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1e-3, random_state=42,
                         max_iter=5, tol=None)

#### XGBoost Classifier

In [27]:
xgboost = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)

### Find which feature matters. First, build a pipeline

In [17]:
pipe = make_pipeline(tf, xgboost)

In [82]:
pipe.fit(x_train,y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [83]:
eli5.show_weights(pipe)

Weight,Feature
0.0461,medida
0.0461,reciprocity
0.0444,accords
0.0258,reserves
0.0243,derecho
0.0240,real
0.0239,actual
0.0196,bilateral
0.0191,case
0.0188,forma


In [67]:
te = TextExplainer(random_state=42)
doc= df.MeasureText[88]
te.fit(doc, pipe.predict_proba)
te.show_prediction()

Contribution?,Feature
1.666,Highlighted in text (sum)
0.805,<BIAS>


In [68]:
df.TypeofRestriction[88]

'NT/Nationality reqts.+ MA/No. Nat. Pers.'

### Classification

In [28]:
text_clf = xgboost.fit(x_train_tf,y_train)

In [29]:
x_train_tf.shape

(1739, 6194)

In [30]:
x_test_tf.shape

(435, 6194)

In [31]:
predicted = text_clf.predict(x_test_tf)
np.mean(predicted == y_test)

0.9402298850574713

In [32]:
tab(df.type1_num,'', normalize=1)

type1_num
0    0.698712
1    0.301288
Name: __dummy__, dtype: float64

## Stemming Examples

In [240]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

analyzer = tf.build_analyzer()
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))
stem_vectorizer = TfidfVectorizer(analyzer=stemmed_words)

stem_vectorizer.fit_transform(["Hi How are you How are you doing","Hi what's up","Wow that's awesome"]).toarray()
# stem_vectorizer.get_feature_names()

array([[0.        , 0.79596054, 0.60534851, 0.        ],
       [0.        , 0.        , 1.        , 0.        ],
       [0.70710678, 0.        , 0.        , 0.70710678]])

In [241]:
stem_vectorizer.fit(df.MeasureText)
x_train_tf = stem_vectorizer.transform(x_train)
x_test_tf = stem_vectorizer.transform(x_test)
text_clf = MultinomialNB().fit(x_train_tf, y_train)

In [242]:
predicted = text_clf.predict(x_test_tf)
np.mean(predicted == y_test)

0.8735632183908046