# Test featurize

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sklearn classification
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

# sklearn general
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import (accuracy_score,
                             confusion_matrix, 
                             classification_report, 
                             f1_score, 
                             precision_score,
                             recall_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


from stop_words import get_stop_words
import ujson as json

import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from app.utils import (clean_boilerplate, 
                       clean_string,
                       clean_website, 
                       detect_XML, 
                       extract_meta_informations,
                       reduce_whitespace,
                       remove_special_characters,
                       remove_tags, 
                       tokenizing_html, 
                       trim_html)

In [393]:
%%time
train = pd.read_csv("../data/ctrain.csv", nrows=1000).fillna("")
train = train.fillna("")
test = pd.read_csv("../data/ctest.csv", nrows=200).fillna("")
test = test.fillna("")
train.head(1)

CPU times: user 2.47 s, sys: 234 ms, total: 2.7 s
Wall time: 2.7 s


Unnamed: 0,url,industry,industry_label,group,group_representative,html,text,source,country,group_representative_label,meta,chtml
0,http://12-18.com,31,Hospitality,"rec, serv, tran",30,"<!DOCTYPE html>\n<html lang=""de"">\n<head>\n\t<...",12.18. Investment Management - ANDERS. AUS PRI...,xing,DE,"Leisure, Travel & Tourism",Ein glückliches Investment ist das Resultat ha...,<html>\n<head>\n\t<title>12.18. Investment Man...


In [306]:
def get_feature_mtx(data, vectorizer=None, vectorizer2=None):
    """ """
    mtx = []
    plain_text = data.text
    meta_text = data.meta
    
    # TF-IDF counts as features
    if vectorizer:
        plain_vector = vectorizer.transform(plain_text)#.toarray().tolist()
        meta_vector = vectorizer2.transform(meta_text)#.toarray().tolist()
        plain_vectorizer, meta_vectorizer = _, _
        
    else:
        plain_vectorizer = TfidfVectorizer()
        meta_vectorizer = TfidfVectorizer()
        
        plain_vector = plain_vectorizer.fit_transform(plain_text)#.toarray().tolist()
        meta_vector = meta_vectorizer.fit_transform(meta_text)#.toarray().tolist()
        
    #mtx.extend(plain_vector)
    #return np.array(mtx), plain_vectorizer, meta_vectorizer
    #print(plain_vector)
    #mtx.extend(meta_vector)
    #mtx.extend([0.66666])
    #mtx.append(plain_vector)
    #mtx = list(map(list, zip(*mtx)))
    #return np.array(mtx), plain_vectorizer, meta_vectorizer
    
    print(plain_vector.toarray())
    print(plain_vector.shape)
    
    
    mtx = sp.hstack([plain_vector, meta_vector])
    return mtx, plain_vectorizer, meta_vectorizer

In [303]:
l = [1, 2, 3, 4]
np.reshape(l, (4, 1))

array([[1],
       [2],
       [3],
       [4]])

In [296]:
X_train.shape

(100, 16631)

In [287]:
len(X_train[0]), len(X_train[10])

(15565, 15565)

In [258]:
from scipy import sparse as sp

counts = CountVectorizer().fit_transform(train['text'].values)
ones = np.ones(shape=(len(train), 1))
X = sp.hstack([counts, ones])

In [261]:
from scipy import sparse as sp

counts = CountVectorizer().fit_transform(train['text'].values)
count2 = CountVectorizer().fit_transform(train['meta'].values)
X = sp.hstack([counts, count2])

In [262]:
X

<10x3448 sparse matrix of type '<class 'numpy.int64'>'
	with 4768 stored elements in COOrdinate format>

In [257]:
len(train)

10

In [255]:
X.toarray()

array([[0., 0., 3., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

### Training

In [307]:
X_train, vectorizer, vectorizer2 = get_feature_mtx(train)
y_train = train.group_representative_label
X_test, _, _ = get_feature_mtx(test, vectorizer=vectorizer, vectorizer2=vectorizer2)
y_test = test.group_representative_label

[[0.02203047 0.01468698 0.00734349 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.04021404 0.02010702 0.02010702]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(3, 1489)
[[0. 0. 0. ... 0. 0. 0.]]
(1, 1489)


In [340]:
train.iloc[0].country

'DE'

http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

In [346]:
from sklearn.base import TransformerMixin
import pandas as pd

class CountryTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        countries = pd.DataFrame(X['country'].apply(lambda x: len(x)))
        return countries

    def fit(self, X, y=None, **fit_params):
        return self

In [347]:
CountryTransformer().transform(train)

Unnamed: 0,country
0,2
1,2
2,2


In [435]:
from sklearn.pipeline import FeatureUnion, Pipeline

def get_text_length(x):
    return np.array([len(t) for t in x]).reshape(-1, 1)

class DataFrameColumnExtracter(TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

    
plain_text = Pipeline([
            ("extract_plain_text", DataFrameColumnExtracter("text")),
            ("plain_text_vect", TfidfVectorizer()),
        ])
meta_text = Pipeline([
            ("extract_meta_text", DataFrameColumnExtracter("meta")),
            ("meta_text_vect", TfidfVectorizer()),
        ])
    
pipe = Pipeline([
    ("features", FeatureUnion([
        ("plain_text", plain_text),
        ("meta_text", meta_text),
    ])),
    ("xgb_linear", XGBClassifier(booster="gblinear")),
    #("svm", LinearSVC())
])

In [437]:
X_train, y_train = train, train.group_representative_label
X_test, y_test = test, test.group_representative_label

In [438]:
%%time
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
score = accuracy_score(pred, y_test)
score



CPU times: user 9.93 s, sys: 43.2 ms, total: 9.97 s
Wall time: 3.53 s


0.255

In [352]:
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import FunctionTransformer

X_train = np.array(["new york is a hell of a town",
                    "new york was originally dutch",
                    "new york is also called the big apple",
                    "nyc is nice",
                    "the capital of great britain is london. london is a huge metropolis which has a great many number of people living in it. london is also a very old town with a rich and vibrant cultural history.",
                    "london is in the uk. they speak english there. london is a sprawling big city where it's super easy to get lost and i've got lost many times.",
                    "london is in england, which is a part of great britain. some cool things to check out in london are the museum and buckingham palace.",
                    "london is in great britain. it rains a lot in britain and london's fogs are a constant theme in books based in london, such as sherlock holmes. the weather is really bad there.",])
y_train = np.array([[0],[0],[0],[0],[1],[1],[1],[1]])

X_test = np.array(["it's a nice day in nyc",
                   'i loved the time i spent in london, the weather was great, though there was a nip in the air and i had to wear a jacket.'
                   ])   
target_names = ['Class 1', 'Class 2']


def get_text_length(x):
    return np.array([len(t) for t in x]).reshape(-1, 1)

classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vectorizer', CountVectorizer(min_df=1,max_df=2)),
            ('tfidf', TfidfTransformer()),
        ])),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ])),
    ('clf', LinearSVC())])

classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
predicted

  return f(*args, **kwargs)


array([0, 1])

In [359]:
dataset = pd.DataFrame({'TextColumn':['Sample Text1','Sample Text2'], 
                        'NumericColumn': [2,1]})
dataset.head()

Unnamed: 0,TextColumn,NumericColumn
0,Sample Text1,2
1,Sample Text2,1


In [360]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from scipy import sparse

tv = TfidfVectorizer(min_df = 0.05, max_df = 0.5)
X = tv.fit_transform(dataset['TextColumn'])
vocab = tv.get_feature_names()

X1 = pd.DataFrame(X.toarray(), columns = vocab)
X1['NumericColumn'] = dataset['NumericColumn']


X_sparse = sparse.csr_matrix(X1.values)

In [361]:
print(X_sparse.shape)
print(X.shape)

(2, 3)
(2, 2)


In [None]:
LinearSVC().fit(X_sparse, y_train)