Article is posted [here](https://medium.com/@swarupsahoo/preparing-text-for-machine-learning-60396e4e393)

In [13]:
import pandas as pd
import tensorflow as tf
import string
import nltk
import sys
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

## Vectorization of text

In [14]:
# Tab-separated Wikipedia-detox sample hosted in the dotnet/machinelearning repository.
# The URL is split with implicit string concatenation so no line break can leak into it.
DATA_URL = (
    'https://raw.githubusercontent.com/dotnet/machinelearning'
    '/master/test/data/wikipedia-detox-250-line-data.tsv'
)
df = pd.read_csv(DATA_URL, sep='\t')

In [15]:
# Preview the first 20 rows to check that the label and text columns loaded.
df.head(20)

Unnamed: 0,Sentiment,SentimentText
0,1,"==RUDE== Dude, you are rude upload that carl..."
1,1,== OK! == IM GOING TO VANDALIZE WILD ONES W...
2,1,"Stop trolling, zapatancas, calling me a lia..."
3,1,==You're cool== You seem like a really cool...
4,1,::::: Why are you threatening me? I'm not bei...
5,1,== hey waz up? == hey ummm... the fif four ...
6,0,::::::::::I'm not sure either. I think it has...
7,0,*::Your POV and propaganda pushing is dully n...
8,0,== File:Hildebrandt-Greg and Tim.jpg listed ...
9,0,::::::::This is a gross exaggeration. Nobody...


In [16]:
# Comment texts from the 'SentimentText' column; this is the corpus passed to tfidf.fit_transform.
X = np.array(df['SentimentText'])

In [17]:
# Fetch the 'punkt' NLTK resource quietly; raise_on_error turns a failed download into an exception.
nltk.download('punkt', quiet=True, raise_on_error=True)

# English Snowball stemmer shared by the tokenizer helpers.
stemmer = SnowballStemmer('english')

# Translation table for str.translate: each character of string.punctuation maps to None,
# which deletes it from the text.
p_map = dict.fromkeys(map(ord, string.punctuation))

In [18]:
# Fetch the NLTK stop-word corpus quietly; raise_on_error turns a failed download into an exception.
nltk.download('stopwords', quiet=True, raise_on_error=True)

# Read the English list once and reuse it for both collections below.
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_words)

# Stop words passed through nltk.word_tokenize, the same tokenizer normalize() applies to
# documents. Kept as a set because stem_tokens tests membership for every token of every
# document; a list would be scanned linearly on each test.
tokenized_stop_words = set(nltk.word_tokenize(' '.join(english_stop_words)))

In [19]:
def stem_tokens(tokens):
    """Stem every token that is not listed in ``tokenized_stop_words``.

    The stop-word test is applied to the token as given (before stemming).
    Returns a new list of stems in the original token order.
    """
    stems = []
    for token in tokens:
        if token in tokenized_stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems
def normalize(text):
    """Turn raw text into the token list used by the TF-IDF vectorizer.

    Steps: lowercase the text, delete punctuation through ``p_map``,
    split it with ``nltk.word_tokenize`` and pass the tokens to
    ``stem_tokens``.
    """
    cleaned = text.lower().translate(p_map)
    tokens = nltk.word_tokenize(cleaned)
    return stem_tokens(tokens)

# Word-level TF-IDF over 1- to 3-grams, tokenized by normalize() and capped at 10,000 features.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=normalize,
    strip_accents='unicode',
    ngram_range=(1, 3),
    use_idf=True,
    max_features=10000,
)

# Fit the vectorizer on the whole corpus X and keep the resulting document-term matrix.
tfidf_score = tfidf.fit_transform(X)

In [20]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); call whichever this installation provides. list() keeps the
# displayed value a plain list of strings in both cases.
list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

['0',
 '0 googl',
 '0 googl hit',
 '0 hit',
 '0 hit holocaust',
 '0 imag',
 '0 imag copyright',
 '0 newslett',
 '0 newslett thank',
 '0 thanx',
 '0 thanx efe',
 '0 woah',
 '0 woah someon',
 '0518',
 '0518 utc',
 '0548',
 '0548 3',
 '0548 3 decemb',
 '10',
 '10 year',
 '10 year franc',
 '100',
 '100 im',
 '100 im go',
 '100 new',
 '100 new page',
 '100 page',
 '100 page address',
 '100 page still',
 '100 wp',
 '100 wp page',
 '11year',
 '11year old',
 '11year old mayb',
 '130',
 '130 year',
 '130 year old',
 '130000',
 '130000 year',
 '130000 year old',
 '13100',
 '13100 hit',
 '13100 hit big',
 '1423',
 '15',
 '15 year',
 '15 year old',
 '16',
 '16 august',
 '16 august 2008',
 '1682099734',
 '1682099734 basi',
 '1682099734 basi acus',
 '1717',
 '1717 utc',
 '17419166126',
 '17419166126 aka',
 '17419166126 aka 1741916992',
 '1741916992',
 '1741916992 appar',
 '1741916992 appar edit',
 '1987',
 '1987 sold',
 '1987 sold year',
 '1988',
 '1988 show',
 '1988 show case',
 '1989',
 '1989 jack

In [21]:
def matrix_to_list(matrix):
    """Return ``matrix`` as nested Python lists with values rounded to 3 decimals.

    ``matrix`` must expose ``toarray()``; the dense array it returns is
    rounded and converted to a list of rows.
    """
    dense = matrix.toarray()
    return dense.round(decimals=3).tolist()
# Dense TF-IDF matrix rounded to 3 decimals. Rounding the ndarray directly yields the same
# array as np.array(matrix_to_list(tfidf_score)) without first building a nested Python
# list that holds every cell of the matrix.
score_list = tfidf_score.toarray().round(decimals=3)

In [22]:
# Labels from the 'Sentiment' column (0/1 in the preview), taken from the same frame as X.
Y = np.array(df['Sentiment'])

In [24]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the TF-IDF vectorizer was fit on the full corpus before this split, so the
# held-out documents contributed to the vocabulary and IDF weights — consider fitting on
# the training rows only.
X_train, X_test, y_train, y_test = train_test_split(score_list, Y, test_size=0.2, random_state=42)

In [25]:
# Shape of the training split as (rows, columns).
X_train.shape

(196, 10000)

In [26]:
# Shape of the held-out split as (rows, columns).
X_test.shape

(50, 10000)

<h2 style="color:red">Using tensorflow on a sparse dataset is kinda useless but whatever </h2>

### Code copied shamelessly from [Josh Meyer's Website](http://jrmeyer.github.io/machinelearning/2019/05/29/tensorflow-dataset-estimator-api.html)

## Creating a TF compatible dataset

In [27]:
def tfrecord(name, Xs, Ys):
    """Write feature rows and labels to ``<name>.tfrecords``.

    Every row becomes one ``tf.train.Example`` with two features:
    ``"feats"`` holds the row's values as raw float64 bytes and
    ``"label"`` holds the matching label as an int64.

    Args:
        name: Output file stem; the file is written to '<name>.tfrecords'.
        Xs: Sequence of numeric feature rows.
        Ys: Sequence of labels, indexed in step with ``Xs``.
    """
    with tf.compat.v1.python_io.TFRecordWriter('%s.tfrecords' % name) as writer:
        for i, row in enumerate(Xs):
            label = int(Ys[i])
            # Serialize as float64 (parser() decodes the bytes as tf.float64). tobytes()
            # replaces ndarray.tostring(), a deprecated alias that NumPy 2.0 removed.
            feats = np.asarray(row, dtype=np.float64).tobytes()
            example = tf.train.Example()
            example.features.feature["feats"].bytes_list.value.append(feats)
            example.features.feature["label"].int64_list.value.append(label)
            writer.write(example.SerializeToString())

In [28]:
# Serialize both splits; tfrecord() names the files train.tfrecords and eval.tfrecords.
tfrecord('train', X_train, y_train)
tfrecord('eval', X_test, y_test)

## Define function to parse data types for the inputs

In [29]:
def parser(record):
    """Decode one serialized example into a ``(features, label)`` pair.

    The record is expected to carry a ``'feats'`` string feature holding
    raw float64 bytes and a ``'label'`` int64 feature, which is the layout
    written by ``tfrecord``.

    Returns:
        A tuple ``({'feats': feats_tensor}, label_tensor)``. The features
        are wrapped in a dict keyed by name, while the label is returned
        as a single int64 tensor.
    """
    # Template describing the two fields stored in every example.
    feature_spec = {
        'feats': tf.compat.v1.FixedLenFeature([], tf.string),
        'label': tf.compat.v1.FixedLenFeature([], tf.int64),
    }
    example = tf.compat.v1.parse_single_example(record, feature_spec)

    # Reinterpret the stored byte string as float64 values.
    raw_feats = tf.compat.v1.decode_raw(example['feats'], tf.float64)
    feats = tf.convert_to_tensor(raw_feats)
    label = tf.cast(example['label'], tf.int64)

    return {'feats': feats}, label

## Define function for iterative evaluation

In [30]:
def my_input_fn(tfrecords_path, batch_size=16, shuffle_buffer=None, repeat=False):
    """Build the (features, labels) batch tensors for a TFRecord file.

    Records are decoded with ``parser``, optionally shuffled and repeated,
    then grouped into batches and exposed through a one-shot iterator.
    With the default arguments the behaviour is a single, ordered pass over
    the file in batches of 16.

    Args:
        tfrecords_path: Path of the ``.tfrecords`` file to read.
        batch_size: Number of examples per batch.
        shuffle_buffer: Buffer size passed to ``Dataset.shuffle``; ``None``
            (or 0) leaves the records in file order.
        repeat: When true, the dataset repeats indefinitely so a step limit
            on the caller's side decides when to stop, rather than the end
            of the file.

    Returns:
        A tuple ``(batch_feats, batch_labels)`` taken from the iterator's
        ``get_next()``.
    """
    dataset = tf.data.TFRecordDataset(tfrecords_path).map(parser)
    if shuffle_buffer:
        dataset = dataset.shuffle(shuffle_buffer)
    if repeat:
        dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)

    iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
    batch_feats, batch_labels = iterator.get_next()

    return batch_feats, batch_labels

In [31]:
# Single numeric column: the 10,000 float64 values stored under the 'feats' key.
feats_column = tf.feature_column.numeric_column(key='feats', dtype=tf.float64, shape=(10000,))

# Two-class DNN with four hidden layers of 128 units each.
# NOTE(review): model_dir is a fixed path, so checkpoints left there by an earlier run are
# presumably picked up again — confirm this is intended or clear the directory first.
DNNClassifier = tf.estimator.DNNClassifier(
    feature_columns=[feats_column],
    hidden_units=[128, 128, 128, 128],
    n_classes=2,
    model_dir='/tmp/tf',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x130855b50>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [32]:
def train_input_fn():
    """Batches read from train.tfrecords."""
    return my_input_fn('train.tfrecords')


def eval_input_fn():
    """Batches read from eval.tfrecords."""
    return my_input_fn('eval.tfrecords')


# NOTE(review): my_input_fn('train.tfrecords') makes a single pass over the file, so
# training presumably ends when the data runs out rather than at max_steps — the recorded
# run reports global_step 39. Verify whether the training input should repeat.
train_spec_dnn = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=10000)
eval_spec_dnn = tf.estimator.EvalSpec(input_fn=eval_input_fn)

tf.estimator.train_and_evaluate(DNNClassifier, train_spec_dnn, eval_spec_dnn)

INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Call initial

({'accuracy': 0.6,
  'accuracy_baseline': 0.6,
  'auc': 0.49166667,
  'auc_precision_recall': 0.59588945,
  'average_loss': 0.6917891,
  'label/mean': 0.6,
  'loss': 0.6920581,
  'precision': 0.6,
  'prediction/mean': 0.5035713,
  'recall': 1.0,
  'global_step': 39},
 [])

## Trying out sklearn instead

In [35]:
%%time
# Multi-layer perceptron trained with the L-BFGS solver on the dense TF-IDF features.
# NOTE(review): the third hidden layer has only 2 units — confirm this bottleneck is intended.
clf_mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-2,
    hidden_layer_sizes=(256, 256, 2, 256),
    max_iter=10000,
    random_state=123,
)
clf_mlp.fit(X=X_train, y=y_train)

# Share of held-out rows predicted correctly, as a percentage.
y_pred = clf_mlp.predict(X_test)
mlp_accuracy = np.mean(y_pred == y_test) * 100
print("Neural Model(LBFGS) Accuracy = %.2f%%" % mlp_accuracy)

Neural Model(LBFGS) Accuracy = 60.00%
CPU times: user 13.8 s, sys: 2.18 s, total: 16 s
Wall time: 7.08 s


In [36]:
%%time
# Random forest of 100 trees, each limited to depth 10, on the same features.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=12,
)
clf_rf.fit(X=X_train, y=y_train)

# Share of held-out rows predicted correctly, as a percentage.
y_pred = clf_rf.predict(X_test)
rf_accuracy = np.mean(y_pred == y_test) * 100
print("Random Forest Accuracy = %.2f%%" % rf_accuracy)

Random Forest Accuracy = 62.00%
CPU times: user 342 ms, sys: 16.3 ms, total: 358 ms
Wall time: 364 ms


In [11]:
import string
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


# NOTE(review): this cell rebinds stemmer and p_map, which an earlier cell already defines.
nltk.download('punkt')

# English Snowball stemmer used by stem_tokens below.
stemmer = SnowballStemmer('english')

# Translation table for str.translate that deletes every character of string.punctuation.
p_map = dict.fromkeys(map(ord, string.punctuation))

def stem_tokens(tokens):
    """Return the stem of every token, in order.

    Unlike the earlier definition of the same name, this version applies
    no stop-word filtering.
    """
    return list(map(stemmer.stem, tokens))
def normalize(text):
    """Lowercase ``text``, strip punctuation via ``p_map``, tokenize and stem it.

    Returns the list produced by ``stem_tokens`` for the tokens that
    ``nltk.word_tokenize`` finds in the cleaned text.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(p_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)


# Two short example documents used to illustrate the TF-IDF scores.
a = "oh wow dell sucks so much, dell is the worst oh wow"
b = "dell laptops are so good, much better than those dirt cheap macbooks"

# TF-IDF over 1- to 4-grams with the built-in English stop-word list.
# NOTE(review): the recorded output of this cell ends with a fragment of a stop_words
# warning; presumably the 'english' list does not match the stemmed tokens that
# normalize() produces — verify.
tfidf = TfidfVectorizer(
    tokenizer=normalize,
    stop_words='english',
    strip_accents='unicode',
    ngram_range=(1, 4),
    use_idf=True,
)

# Fit on the two documents and keep their document-term matrix.
tfidf_score = tfidf.fit_transform([a, b])

def matrix_to_list(matrix):
    """Densify ``matrix`` and return it as rows of floats rounded to 3 decimals.

    ``matrix`` must expose ``toarray()``; the result is a list with one
    inner list per row.
    """
    rounded = matrix.toarray().round(decimals=3)
    rows = rounded.tolist()
    return rows
# Nested-list form of the score matrix for the two example documents (one inner list each).
score_list = matrix_to_list(tfidf_score)

print(score_list)

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.251, 0.0, 0.0, 0.0, 0.177, 0.177, 0.177, 0.177, 0.177, 0.177, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.353, 0.353, 0.177, 0.177, 0.177, 0.177, 0.177, 0.177, 0.177, 0.177, 0.177, 0.353, 0.177, 0.177, 0.177], [0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.153, 0.216, 0.216, 0.216, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.216, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/swarupsahoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  'stop_words.' % sorted(inconsistent))


In [12]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); call whichever this installation provides. list() keeps the
# displayed value a plain list of strings in both cases.
list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

['better',
 'better dirt',
 'better dirt cheap',
 'better dirt cheap macbook',
 'cheap',
 'cheap macbook',
 'dell',
 'dell laptop',
 'dell laptop good',
 'dell laptop good better',
 'dell suck',
 'dell suck dell',
 'dell suck dell worst',
 'dell worst',
 'dell worst oh',
 'dell worst oh wow',
 'dirt',
 'dirt cheap',
 'dirt cheap macbook',
 'good',
 'good better',
 'good better dirt',
 'good better dirt cheap',
 'laptop',
 'laptop good',
 'laptop good better',
 'laptop good better dirt',
 'macbook',
 'oh',
 'oh wow',
 'oh wow dell',
 'oh wow dell suck',
 'suck',
 'suck dell',
 'suck dell worst',
 'suck dell worst oh',
 'worst',
 'worst oh',
 'worst oh wow',
 'wow',
 'wow dell',
 'wow dell suck',
 'wow dell suck dell']