## import libraries and load dataset

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)

In [2]:
df_question = pd.read_hdf("/home/shubhamsingh/Desktop/desktop/files/ML/auto_tagging_data/auto_tagging_data_v2.h5")

### inspect data

In [3]:
df_question.sample(5, random_state = 11)

Unnamed: 0,Id,Title,Body,Tags
41763,92185,Why is Sampling Importance Resampling (SIR) better than Importance Sampling (IS)?,"<p>From what I understand, SIR is a mechanism for sampling from a distribution $p$ that works as follows:</p>\n\n<ol>\n<li>Approximate a target distribution $p$ using an importance sample $S$ fro...","[sampling, mcmc]"
4245,179778,optimization approach in logistic regression,<p>In logistic regression we need to maximise the log likelihood which boils down to minimising a function which is sum of multiple log functions. We normally use gradient descent approach there. ...,"[machine-learning, logistic, classification, optimization]"
37183,168679,Consequences of violating proportional hazards assumption in Cox model,"<p>What are the consequences of violating the Proportional Hazards assumption in a Cox Model? I've got a Model where two factors are highly significative, but all the estimated betas associated to...","[regression, survival, cox-model]"
55932,144226,Moments and density tails,"<p>Assume that the first $n$ moments $m_1,\dots\,m_n$ of a random variable $X\in\mathbb{R}$ are known, but not its probability density function $p(x)$. </p>\n\n<p>Does there exist a methodology to...","[probability, pdf]"
47629,142745,What is the demonstration of the variance of the difference of two dependent variables?,"<p>I know that the variance of the difference of two independent variables is the sum of variances, and I can prove it. I want to know where the covariance goes in the other case.</p>\n","[variance, covariance]"


In [4]:
df_question.head()

Unnamed: 0,Id,Title,Body,Tags
0,6,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",[machine-learning]
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,[forecasting]
2,22,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,[bayesian]
3,31,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....","[hypothesis-testing, t-test, p-value, interpretation]"
4,36,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",[correlation]


#### Let's combine the columns Title and Body as we have to extract tags from both of them 

In [5]:
df_question["Text"] = df_question["Title"]+ " " + df_question["Body"]

In [6]:
df_question.head()

Unnamed: 0,Id,Title,Body,Tags,Text
0,6,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",[machine-learning],"The Two Cultures: statistics vs. machine learning? <p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statisti..."
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,[forecasting],Forecasting demographic census <p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census ...
2,22,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,[bayesian],Bayesian and frequentist reasoning in plain English <p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n
3,31,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....","[hypothesis-testing, t-test, p-value, interpretation]","What is the meaning of p values and t values in statistical tests? <p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk b..."
4,36,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",[correlation],"Examples for teaching: Correlation does not mean causation <p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustra..."


### clean and pre-process data


In [7]:
df_question["Text"].head()

0    The Two Cultures: statistics vs. machine learning? <p>Last year, I read a blog post from <a href="http://anyall.org/">Brendan O'Connor</a> entitled <a href="http://anyall.org/blog/2008/12/statisti...
1    Forecasting demographic census <p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census ...
2                             Bayesian and frequentist reasoning in plain English <p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n
3    What is the meaning of p values and t values in statistical tests? <p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk b...
4    Examples for teaching: Correlation does not mean causation <p>There is an old saying: "Correlation does not mean causation". When I teach, I tend to use the following standard

In [8]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = ' '.join(text.split())
    return text

In [9]:
df_question["Text"] = df_question["Text"].apply(lambda x: clean_text(x))

In [10]:
#converting it into lower case
df_question["Text"] = df_question["Text"].str.lower()

In [11]:
df_question["Text"].head()

0    the two cultures statistics vs machine learning last year i read a blog post from brendan o connor entitled statistics vs machine learning fight that discussed some of the differences between the ...
1    forecasting demographic census what are some of the ways to forecast demographic census with some validation and calibration techniques some of the concerns census blocks vary in sizes as rural ar...
2                                       bayesian and frequentist reasoning in plain english how would you describe in plain english the characteristics that distinguish bayesian from frequentist reasoning
3    what is the meaning of p values and t values in statistical tests after taking a statistics course and then trying to help fellow students i noticed one subject that inspires much head desk bangin...
4    examples for teaching correlation does not mean causation there is an old saying correlation does not mean causation when i teach i tend to use the following standard examples

In [12]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [13]:
def remove_stopwords(text):
    clean_text = [w for w in text.split() if w not in stop_words]
    return ' '.join(clean_text)

In [14]:
df_question["Text_clean"] = df_question["Text"].apply(lambda x: remove_stopwords(x))

In [15]:
df_question["Text_clean"].head()

0    two cultures statistics vs machine learning last year read blog post brendan connor entitled statistics vs machine learning fight discussed differences two fields andrew gelman responded favorably...
1    forecasting demographic census ways forecast demographic census validation calibration techniques concerns census blocks vary sizes rural areas lot larger condensed urban areas need account area s...
2                                                                       bayesian frequentist reasoning plain english would describe plain english characteristics distinguish bayesian frequentist reasoning
3    meaning p values values statistical tests taking statistics course trying help fellow students noticed one subject inspires much head desk banging interpreting results statistical hypothesis tests...
4    examples teaching correlation mean causation old saying correlation mean causation teach tend use following standard examples illustrate point number storks birth rate denmark

### onehot encoding the targer values by using sklearn's multilabel binarizer transformer 

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

In [17]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_question["Tags"])

#transform target variable
Y = multilabel_binarizer.transform(df_question["Tags"])

In [18]:
#checking the shape
Y.shape

(76365, 100)

#### It has become an array of 100 columns, each column representing each tag 

### Now let's create features from text 

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [112]:
#feature extraction
tfidf_vectorizer = TfidfVectorizer(max_df = 0.8, max_features = 10000)

X = tfidf_vectorizer.fit_transform(df_question["Text_clean"])

### Build multilabel classification model

In [113]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size = 0.2, random_state = 9)

#### Now we have to train 100 models as we have 100 different tags as the number of tags varies for each row

In [114]:
from sklearn .linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

In [115]:
lr = LogisticRegression()

#classifier
clf = OneVsRestClassifier(lr)

In [116]:
clf.fit(xtrain, ytrain)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [117]:
y_pred = clf.predict(xtest)

In [118]:
print(y_pred[:3])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


#### This prediction is quite difficult to read and understand as the output is in one hot encoding, we need to transform them back to their normal form 

In [119]:
multilabel_binarizer.inverse_transform(y_pred[:3])

[('prediction',), ('distributions', 'mean', 'variance'), ('r',)]

#### Now lets check what were the actual tags 

In [120]:
multilabel_binarizer.inverse_transform(ytest[:3])

[('confidence-interval', 'regression'),
 ('distributions', 'mean', 'variance'),
 ('bayesian', 'r')]

### Measuring performance using f1_score 

In [121]:
f1_score(ytest, y_pred, average = "micro")

0.43527239150507857

In [122]:
#predicting probabilities
y_pred_prob = clf.predict_proba(xtest)

In [123]:
#setting threshhold value
t = 0.45

In [124]:
y_pred_2 = (y_pred_prob >= t).astype(int)

In [125]:
f1_score(ytest, y_pred_2, average = "micro")

0.45988232147633057

### Building the inference function for new input 

In [175]:
def infer_tags(q):
    q = clean_text(q)
    q = q.lower()
    q = remove_stopwords(q)
    q_test = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_test)
    return multilabel_binarizer.inverse_transform(q_pred)

In [176]:
new_q = "Regression line in ggplot doesn't match computed regression Im using R and created a chart using ggplot2. I then create a regression so I can make some predicitions I pass my data frame of to the predict function predict(regression, Measures) I'd expect the predictions to be the same as if I used the regression line on the chart, but they aren't the same. Why would this be the case? Is there a setting in ggplot or is my expectation incorrect?"


In [177]:
infer_tags(new_q)

[('r', 'regression')]