# Multilabel Classification using Tensorflow Core

### Dataset : Question from cross Validation stack exchange

API : 
```
kaggle datasets download -d stackoverflow/statsquestions
```

In [4]:
# ! pip install kaggle -q

In [2]:
import os

In [22]:
! cd ../../ && ls

'architecture implementations'	 LICENSE
 daily_logs.md			 README.md
 kaggle.json			'tensorflow from basic to advance'
'keras basic to advance'


In [23]:
os.environ["KAGGLE_CONFIG_DIR"] = "../../"

In [24]:
!kaggle datasets download -d stackoverflow/statsquestions

Downloading statsquestions.zip to /home/suman/Desktop/365 days of ML challange/architecture implementations/Logistic Regression
100%|████████████████████████████████████████| 158M/158M [00:26<00:00, 6.64MB/s]
100%|████████████████████████████████████████| 158M/158M [00:26<00:00, 6.19MB/s]


In [25]:
! chmod 600 ../../kaggle.json

In [30]:
! ls

'Binary Logistic Regression with Tensorflow core.ipynb'
'Multiclass Logistic Regression with tensorfow core.ipynb'
'Multilabel Classification.ipynb'
 statsquestions.zip
'Vanishing Gradient problem.ipynb'
'with Tensorflow core.ipynb'


In [31]:
import shutil

zip_file_path = "statsquestions.zip"
destination_directory = "dataset"

shutil.unpack_archive(zip_file_path, destination_directory, 'zip')

In [32]:
!ls

'Binary Logistic Regression with Tensorflow core.ipynb'
 dataset
'Multiclass Logistic Regression with tensorfow core.ipynb'
'Multilabel Classification.ipynb'
 statsquestions.zip
'Vanishing Gradient problem.ipynb'
'with Tensorflow core.ipynb'


In [3]:
os.listdir("dataset")

['database.sqlite', 'Answers.csv', 'Tags.csv', 'Questions.csv']

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
df_questions = pd.read_csv("dataset/Questions.csv", encoding='iso-8859-1')
df_tags = pd.read_csv("dataset/Tags.csv", encoding='iso-8859-1')

In [6]:
df_questions.shape, df_tags.shape

((85085, 6), (244228, 2))

In [7]:
df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n..."


In [8]:
df_tags.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,prior
2,1,elicitation
3,2,distributions
4,2,normality


In [9]:
df_tags.Tag.value_counts().head(10)

Tag
r                     13236
regression            10959
machine-learning       6089
time-series            5559
probability            4217
hypothesis-testing     3869
self-study             3732
distributions          3501
logistic               3316
classification         2881
Name: count, dtype: int64

In [10]:
df_tags.Tag.value_counts().sample(10)

Tag
pls                       109
geometric-distribution     47
abbreviation                4
pivot-table                 7
software                  151
cubic                       2
sql                        23
c#                         20
euclidean                  60
mcmc                      635
Name: count, dtype: int64

# Taking only the top 100 frequent tags

In [11]:
NUM_CLASS = 100

In [12]:
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(NUM_CLASS, columns="count")

In [13]:
most_common_tags

Unnamed: 0,Tag,count
986,r,13236
1020,regression,10959
669,machine-learning,6089
1220,time-series,5559
946,probability,4217
...,...,...
818,nonlinear-regression,514
240,cox-model,510
757,monte-carlo,504
959,proportion,503


In [14]:
MOST_COMMON_TAGS = list(set(most_common_tags.Tag.to_list()))

In [15]:
len(MOST_COMMON_TAGS)

100

In [16]:
", ".join(MOST_COMMON_TAGS)

'python, standard-deviation, goodness-of-fit, regression-coefficients, optimization, normal-distribution, conditional-probability, variance, correlation, interaction, ordinal, terminology, standard-error, probability, neural-networks, references, bayesian, categorical-data, anova, regression, prediction, expected-value, outliers, econometrics, bootstrap, spss, p-value, nonlinear-regression, random-variable, model-selection, covariance, dataset, deep-learning, data-visualization, algorithms, statistical-significance, survey, stochastic-processes, multiple-comparisons, classification, stata, clustering, feature-selection, distributions, t-test, self-study, multilevel-analysis, random-effects-model, cart, svm, chi-squared, logistic, arima, experiment-design, linear-model, proportion, maximum-likelihood, hypothesis-testing, autocorrelation, multivariate-analysis, mcmc, monte-carlo, sample-size, estimation, model, poisson, confidence-interval, data-transformation, repeated-measures, modelin

Now, let's remove tags which are not common

In [17]:
df_tags.Tag = df_tags.Tag.apply(lambda tag: tag if tag in MOST_COMMON_TAGS else None)

In [18]:
df_tags.head()

Unnamed: 0,Id,Tag
0,1,bayesian
1,1,
2,1,
3,2,distributions
4,2,


In [19]:
df_tags.dropna(inplace=True)

In [20]:
df_tags.head()

Unnamed: 0,Id,Tag
0,1,bayesian
3,2,distributions
7,4,distributions
8,4,statistical-significance
9,6,machine-learning


# Preprocessing

In [21]:
df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,<p>What are some of the ways to forecast demog...
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n..."


In [22]:
df_questions = df_questions[['Id', 'Title', 'Body']]
df_questions.head()

Unnamed: 0,Id,Title,Body
0,6,The Two Cultures: statistics vs. machine learn...,"<p>Last year, I read a blog post from <a href=..."
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demog...
2,22,Bayesian and frequentist reasoning in plain En...,<p>How would you describe in plain English the...
3,31,What is the meaning of p values and t values i...,<p>After taking a statistics course and then t...
4,36,Examples for teaching: Correlation does not me...,"<p>There is an old saying: ""Correlation does n..."


In [23]:
df_questions.iloc[0].Body

'<p>Last year, I read a blog post from <a href="http://anyall.org/">Brendan O\'Connor</a> entitled <a href="http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/">"Statistics vs. Machine Learning, fight!"</a> that discussed some of the differences between the two fields.  <a href="http://andrewgelman.com/2008/12/machine_learnin/">Andrew Gelman responded favorably to this</a>:</p>\n\n<p>Simon Blomberg: </p>\n\n<blockquote>\n  <p>From R\'s fortunes\n  package: To paraphrase provocatively,\n  \'machine learning is statistics minus\n  any checking of models and\n  assumptions\'.\n  -- Brian D. Ripley (about the difference between machine learning\n  and statistics) useR! 2004, Vienna\n  (May 2004) :-) Season\'s Greetings!</p>\n</blockquote>\n\n<p>Andrew Gelman:</p>\n\n<blockquote>\n  <p>In that case, maybe we should get rid\n  of checking of models and assumptions\n  more often. Then maybe we\'d be able to\n  solve some of the problems that the\n  machine learning people can

In [24]:
df_questions.iloc[0].Title

'The Two Cultures: statistics vs. machine learning?'

In [25]:
import re

HTML_TAG_PATTERN = "<.*?>"

test_text = "<HTML><a href='example.com'>click me</a></HTML>"
re.sub(HTML_TAG_PATTERN, "", test_text)

'click me'

In [26]:
def clean_text(text):
    HTML_TAG_PATTERN = "<.*?>"
    clean_text = re.sub(HTML_TAG_PATTERN, "", text).lower()
    return " ".join(re.sub(r"[^a-zA-Z0-9 ']", ' ', clean_text).split())

In [27]:
df_questions.iloc[:10].Body.apply(clean_text)[0]

"last year i read a blog post from brendan o'connor entitled statistics vs machine learning fight that discussed some of the differences between the two fields andrew gelman responded favorably to this simon blomberg from r's fortunes package to paraphrase provocatively 'machine learning is statistics minus any checking of models and assumptions' brian d ripley about the difference between machine learning and statistics user 2004 vienna may 2004 season's greetings andrew gelman in that case maybe we should get rid of checking of models and assumptions more often then maybe we'd be able to solve some of the problems that the machine learning people can solve but we can't there was also the statistical modeling the two cultures paper by leo breiman in 2001 which argued that statisticians rely too heavily on data modeling and that machine learning techniques are making progress by instead relying on the predictive accuracy of models has the statistics field changed over the last decade i

In [28]:
df_questions["Body"] = df_questions.Body.apply(clean_text)
df_questions["Text"] = df_questions.Title.apply(clean_text)+ ' ' + df_questions["Body"]

In [44]:
df_questions.Text[0]

"the two cultures statistics vs machine learning last year i read a blog post from brendan o'connor entitled statistics vs machine learning fight that discussed some of the differences between the two fields andrew gelman responded favorably to this simon blomberg from r's fortunes package to paraphrase provocatively 'machine learning is statistics minus any checking of models and assumptions' brian d ripley about the difference between machine learning and statistics user 2004 vienna may 2004 season's greetings andrew gelman in that case maybe we should get rid of checking of models and assumptions more often then maybe we'd be able to solve some of the problems that the machine learning people can solve but we can't there was also the statistical modeling the two cultures paper by leo breiman in 2001 which argued that statisticians rely too heavily on data modeling and that machine learning techniques are making progress by instead relying on the predictive accuracy of models has the

## Merge Two tables

In [45]:
df_tags[df_tags["Id"] == 2].Tag.values

array(['distributions'], dtype=object)

In [46]:
df_tags[df_tags["Id"] == -2].Tag.values

array([], dtype=object)

In [32]:
def extract_tags_by_id(q_id):
    return df_tags[df_tags["Id"] == q_id].Tag.values

In [33]:
extract_tags_by_id(10)

array(['ordinal'], dtype=object)

In [34]:
def add_tags_row(row):
    row["Tags"] = extract_tags_by_id(row["Id"])
    return row

In [35]:
df_questions.iloc[1]

Id                                                      21
Title                       Forecasting demographic census
Body     what are some of the ways to forecast demograp...
Text     forecasting demographic census what are some o...
Name: 1, dtype: object

In [36]:
add_tags_row(df_questions.iloc[1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["Tags"] = extract_tags_by_id(row["Id"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["Tags"] = extract_tags_by_id(row["Id"])


Id                                                      21
Title                       Forecasting demographic census
Body     what are some of the ways to forecast demograp...
Text     forecasting demographic census what are some o...
Tags                                         [forecasting]
Name: 1, dtype: object

In [37]:
df_questions = df_questions.apply(add_tags_row, axis=1)

In [48]:
df_questions.head()[["Id", "Title", "Tags"]]

Unnamed: 0,Id,Title,Tags
0,6,The Two Cultures: statistics vs. machine learn...,[machine-learning]
1,21,Forecasting demographic census,[forecasting]
2,22,Bayesian and frequentist reasoning in plain En...,[bayesian]
3,31,What is the meaning of p values and t values i...,"[hypothesis-testing, t-test, p-value, interpre..."
4,36,Examples for teaching: Correlation does not me...,[correlation]


# Tokenization

In [39]:
# ! pip install scikit-learn -q

In [57]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

In [41]:
multi_label_binarizer = MultiLabelBinarizer()
multi_label_binarizer.fit(df_questions.Tags)

In [54]:
LABELS = multi_label_binarizer.classes_
LABELS

array(['algorithms', 'anova', 'arima', 'autocorrelation', 'bayesian',
       'binary-data', 'binomial', 'bootstrap', 'cart', 'categorical-data',
       'chi-squared', 'classification', 'clustering',
       'conditional-probability', 'confidence-interval', 'correlation',
       'covariance', 'cox-model', 'cross-validation', 'data-mining',
       'data-transformation', 'data-visualization', 'dataset',
       'deep-learning', 'distributions', 'econometrics', 'estimation',
       'expected-value', 'experiment-design', 'factor-analysis',
       'feature-selection', 'forecasting', 'generalized-linear-model',
       'goodness-of-fit', 'hypothesis-testing', 'inference',
       'interaction', 'interpretation', 'least-squares', 'linear-model',
       'logistic', 'machine-learning', 'mathematical-statistics',
       'matlab', 'maximum-likelihood', 'mcmc', 'mean', 'missing-data',
       'mixed-model', 'model', 'model-selection', 'modeling',
       'monte-carlo', 'multilevel-analysis', 'multiple-co

In [43]:
MAX_LEN = 180
VOCAB_SIZE = 1000

tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True)
tokenizer.fit_on_texts(df_questions.Text)

In [51]:
def text_to_feature_vector(corpus):
    sequences = tokenizer.texts_to_sequences(corpus)
    return pad_sequences(sequences, MAX_LEN)

In [52]:
text_to_feature_vector([df_questions.Text.values[0]])

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
         60, 262, 345, 543, 276, 637, 324,   2, 355,   3, 411,  37, 262,
        345, 543, 276,  12,  63,   5,   1, 472,  76,   1,  60,   4,  15,
         37, 277,   4, 276,   7, 262,  72,   5, 143,   9, 112,  94,   1,
        173,  76, 543, 276,   9, 262, 684, 279,  10,  12, 151, 634,  55,
         84, 109,   5,   5, 143,   9, 747,  96, 769,  82, 634,  21, 404,
          4, 641,  63,   5,   1, 656,  12,   1, 543, 276, 348,  33, 641,
         32,  55, 446,  45, 102, 111,   1, 243, 841,   1,  60, 447,  54,
         10,  48,  12, 456,  22,  16, 841,   9,  12, 543, 276, 949,  19,
        978,  54, 455,  22,   1, 892, 454,   5, 143, 104,   1, 262, 199,
          1, 637,  10, 260,   4,  99,  49,   1,  60, 352,  34, 104, 262,
          4, 543, 276, 949, 172,  23, 550, 974,   9

In [315]:
def prediction_to_label(preds):
    label_prob = [(LABELS[i], prob[0]) for i, prob in enumerate(preds.numpy().tolist())]
    return dict(sorted(label_prob, key=lambda kv:kv[1], reverse=True))

In [316]:
dummy_preds = tf.random.uniform([10, 1])
dummy_preds

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[0.28406227],
       [0.36518085],
       [0.9222305 ],
       [0.56961286],
       [0.764382  ],
       [0.02735341],
       [0.5669421 ],
       [0.64930797],
       [0.0144248 ],
       [0.6560919 ]], dtype=float32)>

In [317]:
prediction_to_label(dummy_preds)

{'arima': 0.9222304821014404,
 'bayesian': 0.764382004737854,
 'categorical-data': 0.6560919284820557,
 'bootstrap': 0.6493079662322998,
 'autocorrelation': 0.5696128606796265,
 'binomial': 0.5669420957565308,
 'anova': 0.3651808500289917,
 'algorithms': 0.2840622663497925,
 'binary-data': 0.027353405952453613,
 'cart': 0.014424800872802734}

## Prepare the Input  data

In [128]:
x_train = text_to_feature_vector(df_questions.Text)
y_train = multi_label_binarizer.transform(df_questions.Tags)

In [130]:
x_train[20]

array([149,   2,  95,   4, 156,   3,   5,  51,   7, 325, 628, 130,   2,
       215, 130,   2, 108,   8,   9,  78,  11,  51, 678,   2,  48,  20,
         7,   1,   7, 386,   4,  21, 351, 450, 330,   2,  41,  69,   4,
         1, 713,  66, 405,  10, 256,  70,   9,  10,   1, 149,   1, 256,
        70,   7, 270, 140, 175,   1,  71,  75,  11,  51,   9,  82, 175,
         1,  71,   5, 210,   1, 149,   7,  96,  45,  19,  60,   1, 149,
         5, 713,   9,   1, 149,   5,  75,  10, 237,   4,  18,  44, 428,
        22,   1, 244,  55,  18,  10,  72, 393, 713,  55, 150,   4, 635,
        11, 185,   1,   2, 446, 222,   5,   3, 218, 154,  20,   7,  12,
        74,   1, 497,  10,   3, 393,   9, 633,   1,   5,  15, 840,   3,
       218, 428,  72, 484,  93, 212,  25,  15,  25, 226,   4,  21,   1,
       127, 503,  11,  24, 151,  11,   1, 642,  12,  20,   7,  10, 947,
        11,   1, 429,  85,  51, 147,  10,  24, 151,   7, 130,   2,   7,
       594,  11,  74,   1,  48,   7,  29, 203,  10,  24, 151], d

In [131]:
y_train[20]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [145]:
x_train.shape, y_train.shape

((85085, 180), (85085, 100))

In [149]:
x_train_sample = x_train[:10_000]
y_train_sample = y_train[:10_000]

x_train_sample.shape, y_train_sample.shape

((10000, 180), (10000, 100))

In [174]:
x = tf.cast(tf.transpose(x_train_sample), "float32")/VOCAB_SIZE
y = tf.constant(tf.transpose(y_train_sample))/VOCAB_SIZE

In [175]:
x.shape, y.shape

(TensorShape([180, 10000]), TensorShape([100, 10000]))

# Sigmoid Activation Function

In [176]:
def sigmoid(z):
    exp_z = tf.math.exp(-z)
    return 1 / (1 + exp_z)

# Loss function

In [177]:
def binary_cross_entropy(y_true, y_pred, from_logits=False):
    y_true = tf.cast(tf.constant(y_true), "float32")
    y_pred = tf.cast(tf.constant(y_pred), "float32")
    if from_logits:
        y_pred = sigmoid(y_pred)
    epsilon = 1e-7  # A small value to prevent log(1) condition
    y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
    loss = y_true * tf.math.log(y_pred) + (1 - y_true)*tf.math.log(1-y_pred)
    return -tf.reduce_mean(loss)

In [178]:
dummy_preds = tf.random.uniform([10])
dummy_preds

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([0.17625475, 0.9428749 , 0.9179132 , 0.34259653, 0.09808779,
       0.56494594, 0.575238  , 0.76359785, 0.49316752, 0.39830625],
      dtype=float32)>

In [179]:
dummy_y_true = tf.math.round(tf.random.uniform([10]))
dummy_y_true

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 0., 0., 1., 0., 0., 1., 1., 1.], dtype=float32)>

In [180]:
binary_cross_entropy(dummy_y_true, dummy_preds)

<tf.Tensor: shape=(), dtype=float32, numpy=0.9079709>

In [181]:
dummy_y_true = tf.math.round(dummy_preds)
dummy_y_true

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 0., 0.], dtype=float32)>

In [182]:
binary_cross_entropy(dummy_y_true, dummy_preds)

<tf.Tensor: shape=(), dtype=float32, numpy=0.34423557>

In [183]:
binary_cross_entropy(dummy_y_true, dummy_y_true)

<tf.Tensor: shape=(), dtype=float32, numpy=1.192093e-07>

# Model Building

In [251]:
class MultiLabelLogisticRegression(tf.Module):
    def __init__(self, n_classes, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.is_built = False
        self.n_class = n_classes
    
    def build(self, num_features):
        # randomly generate bias and weights
        self.weights = tf.Variable(tf.random.normal([self.n_class, num_features]), name="weights")
        self.bias = tf.Variable(tf.zeros([self.n_class, 1]), name="bias")
        print("build successfully")
        
    def predict(self, x):
        #compute model output
        # compute logits
        logits = self.weights @ x + self.bias
        return sigmoid(logits)
    
    def __call__(self, x):
        if not self.is_built:
            self.build(x.shape[0])
            self.is_built = True
        return self.predict(x)

In [252]:
model = MultiLabelLogisticRegression(len(LABELS))
output = model(x[:, :10])
output.shape, output[:, 0]

build successfully


(TensorShape([100, 10]),
 <tf.Tensor: shape=(100,), dtype=float32, numpy=
 array([5.41225433e-01, 8.06589782e-01, 8.24252546e-01, 4.90969121e-01,
        5.42748952e-03, 8.99103165e-01, 4.07781959e-01, 9.93144512e-01,
        2.53488198e-02, 8.43413651e-01, 5.29198647e-01, 5.62053740e-01,
        9.02662992e-01, 9.80902076e-01, 3.90235335e-01, 9.80073392e-01,
        2.35534966e-01, 9.03265119e-01, 5.06120205e-01, 5.77651244e-03,
        9.39405799e-01, 9.23120603e-02, 6.38845995e-06, 9.45330322e-01,
        1.67244478e-04, 2.72230804e-01, 9.99994516e-01, 1.75339375e-02,
        3.85065615e-01, 6.94571793e-01, 4.77428406e-01, 1.25839144e-01,
        1.21650375e-01, 5.95242195e-02, 9.98339891e-01, 1.88916340e-01,
        8.93396318e-01, 1.43894345e-01, 5.09555265e-03, 9.53627586e-01,
        2.47141004e-01, 9.46795285e-01, 4.38318551e-02, 9.98462319e-01,
        7.21878529e-01, 5.20805836e-01, 5.42146444e-01, 9.89803791e-01,
        4.13427383e-01, 9.99831200e-01, 7.49128819e-01, 1.2340

In [253]:
model.weights.shape

TensorShape([100, 180])

# Accuracy Metrics

In [254]:
def calculate_accuracy(y_true, y_pred, threshold=0.5):
    y_pred = tf.cast(y_pred >= 0.5, "int16")
    y_true = tf.cast(y_true, "int16")
    match = tf.cast(tf.equal(y_true, y_pred), "float32")
    return tf.reduce_mean(match)

# Train Function

In [255]:
# defining training function
def train(model, x_train, y_train, learning_rate=0.01):
    with tf.GradientTape() as tape:
        y_pred = model(x_train)
        acc = calculate_accuracy(y_train, y_pred)
        loss = binary_cross_entropy(y_train, y_pred)

    dw, db = tape.gradient(loss, [model.weights, model.bias])
    model.weights.assign_sub(learning_rate * dw)
    model.bias.assign_sub(learning_rate * db)
    return loss, acc

In [256]:
train(model, x, y, learning_rate=0.01)

(<tf.Tensor: shape=(), dtype=float32, numpy=1.3514203>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.502557>)

# Training Loop

In [262]:
def display_training_summary(loss, accuracy, epoch):
    if epoch % 10 == 0:
        print(f"EPOCH: {epoch} --- loss: {loss}---accuracy: {accuracy}")

In [263]:
#model initiate
model = MultiLabelLogisticRegression(len(LABELS))
LOSS_HISTORY = []
ACC_HISTORY = []

In [331]:
NUM_EPOCHS = 200
LEARNING_RATE = 1

#  Training Loop
for epoch in range(NUM_EPOCHS):
    loss, acc = train(model, x, y, LEARNING_RATE)
    LOSS_HISTORY.append(loss)
    ACC_HISTORY.append(acc)
    display_training_summary(loss, acc, epoch+1)

EPOCH: 10 --- loss: 0.2084408700466156---accuracy: 0.921949028968811
EPOCH: 20 --- loss: 0.20296968519687653---accuracy: 0.9244179725646973
EPOCH: 30 --- loss: 0.19774170219898224---accuracy: 0.9268820285797119
EPOCH: 40 --- loss: 0.1927422434091568---accuracy: 0.9291769862174988
EPOCH: 50 --- loss: 0.187957763671875---accuracy: 0.9313349723815918
EPOCH: 60 --- loss: 0.18337567150592804---accuracy: 0.9334579706192017
EPOCH: 70 --- loss: 0.17898434400558472---accuracy: 0.9354559779167175
EPOCH: 80 --- loss: 0.1747729331254959---accuracy: 0.9374570250511169
EPOCH: 90 --- loss: 0.170731320977211---accuracy: 0.9393569827079773
EPOCH: 100 --- loss: 0.1668502539396286---accuracy: 0.9411569833755493
EPOCH: 110 --- loss: 0.16312091052532196---accuracy: 0.942874014377594
EPOCH: 120 --- loss: 0.15953519940376282---accuracy: 0.9444860219955444
EPOCH: 130 --- loss: 0.1560855358839035---accuracy: 0.9460189938545227
EPOCH: 140 --- loss: 0.15276479721069336---accuracy: 0.9475749731063843
EPOCH: 150 -

# Testing

In [343]:
text = df_questions.iloc[1].Text
y_true_label = df_questions.iloc[1].Tags 

In [344]:
text

"forecasting demographic census what are some of the ways to forecast demographic census with some validation and calibration techniques some of the concerns census blocks vary in sizes as rural areas are a lot larger than condensed urban areas is there a need to account for the area size difference if let's say i have census data dating back to 4 5 census periods how far can i forecast it into the future if some of the census zone change lightly in boundaries how can i account for that change what are the methods to validate census forecasts for example if i have data for existing 5 census periods should i model the first 3 and test it on the latter two or is there another way what's the state of practice in forecasting census data and what are some of the state of the art methods"

In [345]:
y_true_label

array(['forecasting'], dtype=object)

In [1]:
y_true_label = multi_label_binarizer.transform([y_true_label])
y_true_label

NameError: name 'multi_label_binarizer' is not defined

In [347]:
vector_rep = text_to_feature_vector([text])
vector_rep

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,  38,  19,  63,   5,   1, 846,   4, 569,  17,
         63, 379,   9, 949,  63,   5,   1,  10, 821,  23,  19,   3, 568,
        822, 119,   7,  45,   3, 150,   4, 635,  11,   1, 725, 155, 173,
         27, 505, 161,   2,  18,  16, 900,   4,  47,  42,  31, 443,  33,
          2, 569,  20, 164,   1,  27,  63,   5,   1, 315,  10,  31,  33,
          2, 635,  11,  12, 315,  38,  19,   1, 297,   4,  11, 110,  27,
          2,  18,  16,  11,  42,  84,   2,  25,   1, 140,  28,   9,  43,
         20,  22,   1,  60,  34,   7,  45, 302, 101,   1, 448,   5,  10,
         16,   9,  38,  19,  63,   5,   1, 448,   5

In [348]:
vector_rep.shape

(1, 180)

In [349]:
input_x = tf.cast(tf.transpose(vector_rep), "float32")
input_x.shape

TensorShape([180, 1])

In [350]:
y_pred = model(input_x)

In [351]:
tf.math.round(tf.squeeze(y_pred))

<tf.Tensor: shape=(100,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
      dtype=float32)>

In [352]:
calculate_accuracy(y_true_label, y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=0.7842>

In [353]:
prediction_to_label(y_pred)

{'chi-squared': 1.0,
 'clustering': 1.0,
 'confidence-interval': 1.0,
 'covariance': 1.0,
 'distributions': 1.0,
 'generalized-linear-model': 1.0,
 'goodness-of-fit': 1.0,
 'inference': 1.0,
 'least-squares': 1.0,
 'machine-learning': 1.0,
 'mathematical-statistics': 1.0,
 'monte-carlo': 1.0,
 'normal-distribution': 1.0,
 'p-value': 1.0,
 'pca': 1.0,
 'sample-size': 1.0,
 'sampling': 1.0,
 'simulation': 1.0,
 'spss': 1.0,
 'standard-deviation': 1.0,
 'terminology': 1.0,
 'algorithms': 0.0,
 'anova': 0.0,
 'arima': 0.0,
 'autocorrelation': 0.0,
 'bayesian': 0.0,
 'binary-data': 0.0,
 'binomial': 0.0,
 'bootstrap': 0.0,
 'cart': 0.0,
 'categorical-data': 0.0,
 'classification': 0.0,
 'conditional-probability': 0.0,
 'correlation': 0.0,
 'cox-model': 0.0,
 'cross-validation': 0.0,
 'data-mining': 0.0,
 'data-transformation': 0.0,
 'data-visualization': 0.0,
 'dataset': 0.0,
 'deep-learning': 0.0,
 'econometrics': 0.0,
 'estimation': 0.0,
 'expected-value': 0.0,
 'experiment-design': 0.0,
