# Test featurization: TF-IDF text features with a LinearSVC baseline

In [101]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sklearn classification
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

# sklearn general
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import (accuracy_score,
                             confusion_matrix, 
                             classification_report, 
                             f1_score, 
                             precision_score,
                             recall_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


from stop_words import get_stop_words
import ujson as json

import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from app.utils import (clean_boilerplate, 
                       clean_string,
                       clean_website, 
                       detect_XML, 
                       extract_meta_informations,
                       reduce_whitespace,
                       remove_special_characters,
                       remove_tags, 
                       tokenizing_html, 
                       trim_html)

In [133]:
%%time
train = pd.read_csv("../data/ctrain.csv", nrows=1000).fillna("")
train = train.fillna("")
test = pd.read_csv("../data/ctest.csv", nrows=200).fillna("")
test = test.fillna("")
train.head(1)

CPU times: user 1.57 s, sys: 552 ms, total: 2.12 s
Wall time: 2.41 s


Unnamed: 0,url,industry,industry_label,group,group_representative,html,text,source,country,group_representative_label,meta,chtml
0,http://12-18.com,31,Hospitality,"rec, serv, tran",30,"<!DOCTYPE html>\n<html lang=""de"">\n<head>\n\t<...",12.18. Investment Management - ANDERS. AUS PRI...,xing,DE,"Leisure, Travel & Tourism",Ein glückliches Investment ist das Resultat ha...,<html>\n<head>\n\t<title>12.18. Investment Man...


In [134]:
def get_count_vectors(text, vectorizer=None):
    """Vectorize documents into dense rows, fitting a vectorizer if none is given.

    Despite the name, a newly created vectorizer is a ``TfidfVectorizer``
    (TF-IDF weights), not a plain count vectorizer.

    Parameters
    ----------
    text
        The documents to vectorize; passed straight to the vectorizer.
    vectorizer : optional
        An existing vectorizer exposing ``transform``. When provided it is
        reused without refitting; when ``None`` a new ``TfidfVectorizer`` is
        created and fitted on ``text``.

    Returns
    -------
    tuple
        ``(vector, vectorizer)`` where ``vector`` is the dense result converted
        to nested Python lists and ``vectorizer`` is the vectorizer that
        produced it (the one passed in, or the newly fitted one).
    """
    # Explicit None check: an estimator object should never be judged by
    # truthiness (objects defining __len__/__bool__ could evaluate to False).
    if vectorizer is not None:
        vector = vectorizer.transform(text).toarray().tolist()
    else:
        vectorizer = TfidfVectorizer()
        vector = vectorizer.fit_transform(text).toarray().tolist()
    # Always hand back the vectorizer actually used, in both branches.
    return vector, vectorizer

def get_feature_mtx(data, vectorizer=None):
    """Build a feature matrix from the ``text`` attribute of ``data``.

    Parameters
    ----------
    data
        Object exposing a ``text`` attribute holding the documents
        (the notebook passes pandas DataFrames with a ``text`` column).
    vectorizer : optional
        Forwarded to ``get_count_vectors``; pass the vectorizer returned by an
        earlier call to reuse it instead of fitting a new one.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)`` where ``matrix`` is a ``numpy.ndarray`` built
        from the vectorized rows and ``vectorizer`` is the one returned by
        ``get_count_vectors``.
    """
    # Use the helper that is actually defined in this notebook.
    vector, vectorizer = get_count_vectors(data.text, vectorizer=vectorizer)
    return np.array(vector), vectorizer

In [135]:
# Keep the vectorizer returned for the training split and pass it back in for
# the test split, so both splits are vectorized by the same object.
X_train, vectorizer = get_feature_mtx(train)
y_train = train.group_representative_label
# Index with [0] to keep only the matrix: assigning the unused value to `_`
# would shadow IPython's last-output variable in the user namespace.
X_test = get_feature_mtx(test, vectorizer=vectorizer)[0]
y_test = test.group_representative_label

In [136]:
%%time
clf = LinearSVC()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
score = accuracy_score(pred, y_test)
score

CPU times: user 785 ms, sys: 0 ns, total: 785 ms
Wall time: 714 ms


0.455