# StackOverflow tag prediction

*  This notebook has two parts: the first part uses an SGD classifier (one-vs-rest) and the second part uses a bidirectional LSTM (BiLSTM)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the read-only Kaggle input tree and echo the full path of every file
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
# from jupyterthemes import jtplot

In [None]:
# from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
# from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Bidirectional, Embedding, Dropout, Flatten
from keras.optimizers import SGD, Adam, Adagrad, Adadelta, RMSprop

In [None]:
# set plot rc parameters

# jtplot.style(grid=False)
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = '#464646'
#plt.rcParams['axes.edgecolor'] = '#FFFFFF'
plt.rcParams['figure.figsize'] = 10, 7
plt.rcParams['text.color'] = '#666666'
plt.rcParams['axes.labelcolor'] = '#666666'
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['xtick.color'] = '#666666'
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.color'] = '#666666'
plt.rcParams['ytick.labelsize'] = 14

# plt.rcParams['font.size'] = 16

sns.color_palette('dark')
%matplotlib inline

## Read data

In [None]:
# load only the three columns this notebook works with (question id, title, tags)
train = pd.read_csv(
    '../input/facebook-recruiting-iii-keyword-extraction/Train.zip',
    usecols=['Id', 'Title', 'Tags'],
)
train.shape

In [None]:
train.head()

In [None]:
# keep a single row per distinct title (the first occurrence is retained)
train = train.drop_duplicates(subset='Title')
train.shape

## EDA

### Tag Analysis

In [None]:
# count the whitespace-separated tags attached to every title
train['Tag_count'] = [len(str(tags).split()) for tags in train['Tags']]

In [None]:
# DataFrame.dropna() returns a new frame and leaves the original untouched, so
# the result has to be assigned back for any row to actually be removed.
# Only 'Tags' is checked: a question without tags has no label to learn from.
train = train.dropna(subset=['Tags'])
train.shape

In [None]:
train.isnull().sum()

In [None]:
# keep only the questions whose 'Tags' value is present
train = train.loc[train['Tags'].notnull()]
train.shape

### Distribution of tag count

In [None]:
# plot distribution of tag count
fig = plt.figure(figsize=[10,7])
# pass the series by keyword: recent seaborn releases no longer accept the
# variable as the first positional argument (that slot is `data`)
sns.countplot(x=train['Tag_count'])
plt.title('Distribution of tag count')
plt.ylabel('Frequency')
plt.xlabel('Tag count')
plt.show()

### Get list of all tags

In [None]:
# build a question x tag count matrix; tags are separated by whitespace
tag_vectorizer = CountVectorizer(tokenizer=lambda tags: str(tags).split())
tag_mat = tag_vectorizer.fit_transform(train['Tags'])

In [None]:
# get names of tags
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on releases that predate it.
# .tolist() keeps tag_names a plain list, as it was before.
if hasattr(tag_vectorizer, 'get_feature_names_out'):
    tag_names = tag_vectorizer.get_feature_names_out().tolist()
else:
    tag_names = tag_vectorizer.get_feature_names()
type(tag_names), len(tag_names)

In [None]:
tag_names[:10]

In [None]:
# get frequency of each tag: summing over axis 0 collapses the rows (questions)
# of the tag matrix, leaving one total per tag column
tag_freq = tag_mat.sum(axis=0)
# show the container type and the shape of its flattened (.A1) view
type(tag_freq), tag_freq.A1.shape

In [None]:
# tag -> frequency series, ordered from the most to the least frequent tag
tag_freq_ser = pd.Series(tag_freq.A1, index=tag_names).sort_values(ascending=False)
tag_freq_ser.head()

### Histogram of tags

In [None]:
# frequency of every tag, tags ordered from most to least frequent
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(tag_freq_ser.values, color=sns.xkcd_rgb['greenish cyan'])
ax.set_title('Tag frequency distribution')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tag ID')
plt.show()

*  It is hard to read anything from the full distribution
*  Let's plot only the top 500 tags instead

In [None]:
# same curve restricted to the 500 most frequent tags
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(tag_freq_ser.iloc[:500].values, color=sns.xkcd_rgb['greenish cyan'])
ax.set_title('Tag frequency distribution of top 500 Tags')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tag ID')
plt.show()

In [None]:
# same curve restricted to the 100 most frequent tags
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(tag_freq_ser.iloc[:100].values, color=sns.xkcd_rgb['greenish cyan'])
ax.set_title('Tag frequency distribution of top 100 Tags')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tag ID')
plt.show()

### Top words used as tags

In [None]:
# word cloud sized by tag frequency (at most 200 tags drawn)
wordcloud = WordCloud(background_color='black', max_words=200)
wordcloud.generate_from_frequencies(tag_freq_ser)
fig, ax = plt.subplots(figsize=(16, 16))
ax.set_title('WordCloud of Tags')
ax.axis('off')
ax.imshow(wordcloud)
plt.show()

### Frequency of top 50 tags

In [None]:
# Plot the most frequent tags.
# A single constant drives both the slice and the title so the two cannot
# drift apart (the old comment said 30 while the code plotted 50).
top_n = 50
top_tags = tag_freq_ser.iloc[:top_n]
fig = plt.figure(figsize=[20,10])
sns.barplot(x=top_tags.index,
            y=top_tags.values,
            color=sns.xkcd_rgb['greenish cyan'])
plt.title('Frequency of top {} Tags'.format(top_n))
plt.xlabel('Tags')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

## Part 1

## Data Pre-processing

### Clean text data

In [None]:
# clean text data
# remove non alphabetic characters
# remove stopwords and stemming
# Compiled once instead of on every call.
_NON_ALPHA_RE = re.compile(r'[^a-z]+')
# Filled on first use: loading the NLTK stop word corpus and constructing a
# stemmer for each of the millions of titles is pure overhead.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the shared (stop word set, stemmer) pair, building it on first use."""
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE['stemmer'] = PorterStemmer()
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['stemmer']


def clean_text(sentence):
    """Normalise one title for vectorisation.

    Steps: lower-case, replace every run of characters outside a-z with a
    single space, tokenize, drop English stop words, Porter-stem what is left.

    Parameters
    ----------
    sentence : str
        Raw title text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' if nothing survives).
    """
    stop_words, stemmer = _clean_text_resources()

    # remove non alphabetic sequences
    sentence = _NON_ALPHA_RE.sub(' ', sentence.lower()).strip()

    # tokenize, remove stop words, stem
    word_list = [stemmer.stem(word)
                 for word in word_tokenize(sentence)
                 if word not in stop_words]

    # list to sentence
    return ' '.join(word_list)

# enable Series.progress_apply, then overwrite every title with its cleaned form
tqdm.pandas()
train['Title'] = train['Title'].progress_apply(lambda title: clean_text(str(title)))

In [None]:
train.head()

### Reduce number of tags

In [None]:
def _tag_ranking(one_hot_tag):
    """Return the column indices of ``one_hot_tag`` ordered from the most to the least used tag."""
    # np.asarray(...).ravel() copes with both the np.matrix and the plain
    # ndarray that the different sparse/dense containers return from .sum()
    tag_sum = np.asarray(one_hot_tag.sum(axis=0)).ravel().astype(np.float64)
    # a stable sort on the negated counts keeps tied tags in their original
    # column order, exactly like sorted(..., reverse=True) did
    return np.argsort(-tag_sum, kind='stable')


def _coverage_percent(one_hot_tag, ranking, ntags):
    """Percentage of rows that have at least one of the first ``ntags`` columns of ``ranking`` set."""
    nq = one_hot_tag.shape[0]
    if nq == 0:
        # no questions at all: report 0% instead of dividing by zero
        return 0.0
    tags_per_question = np.asarray(one_hot_tag[:, ranking[:ntags]].sum(axis=1)).ravel()
    covered = np.count_nonzero(tags_per_question)
    return np.round(covered / nq * 100, 2)


# calculate number of questions covered by top n tags
def questions_covered(one_hot_tag, ntags):
    """Return the percentage of questions that carry at least one of the top ``ntags`` tags.

    Parameters
    ----------
    one_hot_tag : sparse or dense 2-D matrix
        Question x tag indicator matrix (rows are questions, columns are tags).
    ntags : int
        How many of the most frequent tags to keep.

    Returns
    -------
    float
        Coverage in percent, rounded to two decimals.
    """
    return _coverage_percent(one_hot_tag, _tag_ranking(one_hot_tag), ntags)


# get number of questions covered and tag count list
def questions_covered_list(one_hot_tag, window, start=100):
    """Evaluate question coverage for a growing number of top tags.

    Parameters
    ----------
    one_hot_tag : sparse or dense 2-D matrix
        Question x tag indicator matrix.
    window : int
        Step between two consecutive tag counts.
    start : int, optional
        Smallest tag count to evaluate (default 100, the previous fixed value).

    Returns
    -------
    (numpy.ndarray, list of float)
        The evaluated tag counts ``start, start + window, ...`` (strictly below
        the total number of tags) and the coverage percentage for each of them.
    """
    ntags = one_hot_tag.shape[1]
    # numbers of top tags to evaluate
    tag_counts = np.arange(start, ntags, window)
    # rank the tags once instead of re-sorting them for every tag count
    ranking = _tag_ranking(one_hot_tag)
    ques_covered_list = [_coverage_percent(one_hot_tag, ranking, int(count))
                         for count in tag_counts]

    return tag_counts, ques_covered_list


# get multinomial tag matrix (top n tags)
def topn_tags(one_hot_tag, ntags):
    """Return ``one_hot_tag`` restricted to its ``ntags`` most frequent tag columns.

    The returned columns are ordered from the most to the least frequent tag.
    """
    return one_hot_tag[:, _tag_ranking(one_hot_tag)[:ntags]]

In [None]:
# binary bag-of-tags: one row per title, one column per tag, 1 when the tag is present
tag_vectorizer = CountVectorizer(binary=True, tokenizer=lambda tags: str(tags).split())
y_multinomial = tag_vectorizer.fit_transform(train['Tags'])

In [None]:
# coverage curve: x = number of top tags kept, y = % of questions still labelled
x, y = questions_covered_list(y_multinomial, 100)
fig, ax = plt.subplots(figsize=(10, 7))
ax.set_title('Questions covered Vs Numbre of Tags')
ax.set_ylabel('Percentage of Questions covered')
ax.set_xlabel('Number of Tags')
ax.plot(x, y, color=sns.xkcd_rgb['greenish cyan'])
plt.show()

In [None]:
# print percent of question covered with number of tags
# y[i] is the coverage for x[i] tags and x starts at 100 (not 0), so the value
# is looked up through x; indexing y with idx/100 reported the coverage of
# idx + 100 tags and could run past the end of the list
coverage_by_ntags = dict(zip(np.asarray(x).tolist(), y))
print('#Tags\t%Ques')
for idx in range(500, 7500, 500):
    # skip tag counts that were not evaluated (fewer tags available than idx)
    if idx in coverage_by_ntags:
        print(idx, '\t', coverage_by_ntags[idx])

In [None]:
# keep only the columns of the 100 most frequent tags
# NOTE: y_multinomial is overwritten, so re-running this cell alone operates on
# the already reduced matrix rather than on the full tag matrix
y_multinomial = topn_tags(y_multinomial, 100)

In [None]:
# boolean mask of the questions that still carry at least one of the kept tags;
# the same mask filters the label matrix and the data frame so rows stay aligned
non_zero_idx = (y_multinomial.sum(axis=1) != 0).A1
y_multinomial = y_multinomial[non_zero_idx]
train = train.iloc[non_zero_idx]

In [None]:
y_multinomial.shape, train.shape

### Featurize data

In [None]:
# hold out 20% of the questions for testing
Xtrain, Xtest, Ym_train, Ym_test = train_test_split(
    train['Title'],
    y_multinomial,
    test_size=0.2,
    random_state=45,
)

# TF-IDF features: vocabulary and weights are learnt on the training titles
# only and then applied unchanged to the test titles
tfid_vec = TfidfVectorizer(tokenizer=lambda title: str(title).split())
Xtrain = tfid_vec.fit_transform(Xtrain)
Xtest = tfid_vec.transform(Xtest)

In [None]:
Xtrain.shape, Xtest.shape

In [None]:
Ym_train.shape, Ym_test.shape

## SGDClassifier one vs rest

In [None]:
# create model instance
# scikit-learn renamed the logistic loss from 'log' to 'log_loss' (1.1) and
# removed the old spelling in 1.3; pick whichever name this release supports
_sgd_losses = getattr(SGDClassifier, 'loss_functions', {'log_loss': None})
logistic_loss = 'log_loss' if 'log_loss' in _sgd_losses else 'log'
# one binary logistic-regression model per tag; random_state makes the
# stochastic optimisation (and therefore the scores below) reproducible
logreg_model1 = OneVsRestClassifier(SGDClassifier(loss=logistic_loss,
                                                  alpha=0.001,
                                                  penalty='l1',
                                                  random_state=45),
                                   n_jobs=-1)
# train model
logreg_model1.fit(Xtrain, Ym_train)
# predict tags
Ym_test_pred = logreg_model1.predict(Xtest)

# print model performance metrics
print("Accuracy :",metrics.accuracy_score(Ym_test,Ym_test_pred))
print("f1 score macro :",metrics.f1_score(Ym_test,Ym_test_pred, average = 'macro'))
print("f1 scoore micro :",metrics.f1_score(Ym_test,Ym_test_pred, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(Ym_test,Ym_test_pred))
# print("Precision recall report :\n",metrics.classification_report(Ym_test,Ym_test_pred))

*  Kaggle kernels don't have enough memory to train 5000 models (one per tag to predict)
*  I trained 100 models and got the results shown above, which are not that impressive
*  If you have a more powerful machine, you can try training 5000 models on it and check their performance
*  You can also try other models such as logistic regression, SVM, random forest, etc.

## Part 2

## Bidirectional LSTM

### Data Preparation

*  Let's just prepare our X variable
*  We can reuse the y variable generated in part 1
*  The data is already clean; we just need to build a word-to-number map and its inverse
*  We use that map to encode the word sequences and then embed them

In [None]:
# learn the word index from the cleaned titles (num_words is set to 20000)
t = Tokenizer(num_words=20000)
t.fit_on_texts(train['Title'].tolist())

In [None]:
# word -> index map from the fitted tokenizer, plus the inverse index -> word map
w2num = t.word_index
num2w = dict((index, word) for word, index in w2num.items())

In [None]:
# replace words with numbers in docs
# Tokenizer.word_index contains every word seen while fitting, regardless of
# num_words, so looking words up in it directly can produce ids at or above
# the embedding layer's vocabulary size. texts_to_sequences applies the
# num_words cap (rarer words are dropped), keeping every id in range.
docs = train['Title'].to_list()
docs2 = t.texts_to_sequences(docs)

In [None]:
# pad sequences so every encoded title has the same length
# NOTE(review): no maxlen is passed, so the padded width is derived from docs2
# itself (presumably the longest title) — confirm it matches the input length
# the model expects
docs2 = pad_sequences(docs2)

In [None]:
# hold out 25% of the encoded titles (and their tag rows) for validation
Xtrain, Xcv, Ytrain, Ycv = train_test_split(
    docs2,
    y_multinomial,
    test_size=0.25,
    random_state=21,
)

### Initialize model

In [None]:
# initialize BiLSTM model
# derive the layer sizes from the data instead of hard-coding them, so the
# model cannot silently disagree with the tokenizer / padding / label matrix
vocab_size = t.num_words        # vocabulary cap configured on the tokenizer
seq_len = Xtrain.shape[1]       # padded title length
n_tags = Ytrain.shape[1]        # number of tags to predict

model = Sequential()
# embedding layer
model.add(Embedding(vocab_size, 256, input_length=seq_len))
# BiLSTM layer 1
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.4))
# BiLSTM layer 2
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.4))
model.add(Flatten())
# one sigmoid unit per tag: a title may have several tags at once
model.add(Dense(n_tags, activation='sigmoid'))
# multi-label target -> every tag is an independent yes/no decision, which is
# what binary cross-entropy scores; categorical cross-entropy assumes exactly
# one active class per sample and does not match the sigmoid outputs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# train LSTM

In [None]:
# train for 10 epochs in mini-batches of 1024 titles
model.fit(
    Xtrain,
    Ytrain,
    batch_size=1024,
    epochs=10,
    verbose=1,
)

In [None]:
# predict tags: a tag is assigned when its predicted probability exceeds 0.5
Ycv_pred = (model.predict(Xcv) > 0.5).astype('int64')

# print model performance metrics
print("Accuracy :", metrics.accuracy_score(Ycv, Ycv_pred))
print("f1 score macro :", metrics.f1_score(Ycv, Ycv_pred, average='macro'))
print("f1 scoore micro :", metrics.f1_score(Ycv, Ycv_pred, average='micro'))
print("Hamming loss :", metrics.hamming_loss(Ycv, Ycv_pred))

*  The results are better than with SGD, but still not good enough
*  To improve them, the first thing we need is a more powerful machine
*  Next, we can train for more epochs
*  If you have the patience and resources, you can also try including the "body" of each question; here I've only used the titles
*  Another thing to try is a BiLSTM with attention, which might help a lot