# Predicting Tags for Questions in Stack Overflow

## Contents

* [Loading Data](#loadingData)
* [Data Preprocessing](#dataPreprocessing)
    * [Basic Data Analysis on Tags](#tagAnalysis)
* [Text Processing](#textProcessing)
* [Supervised ML Models](#supervisedModels)
    * [Logistic Regression](#logisticRegression)
    * [SGD Classifier](#SGD)
    * [Multilabel KNN](#MLKNN)
* [Unsupervised Learning Algorithms](#unsupervised)
    * [K-Means Clustering](#kmeans)
    * [Mean Shift](#meanShift)
* [References](#references)
   

In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import MLkNN

from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, silhouette_score

import warnings
warnings.filterwarnings("ignore")

<div id="loadingData">
<h2>Loading Data</h2>
</div>

In [None]:
# Read the zipped training CSV from the Kaggle input directory into a pandas
# DataFrame, then preview its first rows.
df = pd.read_csv("/kaggle/input/facebook-recruiting-iii-keyword-extraction/Train.zip")
df.head()

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
print("Dataframe shape : ", df.shape)

In [None]:
# The full training set is very large (6,034,195 records), so only a small
# leading subset is kept to make the rest of the notebook fast to run.
# .iloc is purely positional (integer-location) indexing.
df = df.iloc[:10000]  # first 10000 rows, every column
print("Shape of Dataframe after subsetting : ", df.shape)

<div id="dataPreprocessing">
<h2>Data Preprocessing</h2>
</div>

In [None]:
# Detect questions whose Title repeats an earlier row and drop them.
#
# DataFrame.duplicated('Title') returns a boolean Series aligned with df's own
# index: False for the first occurrence of each title, True for every repeat.
# Building the mask on df directly (rather than on a re-sorted copy) keeps the
# mask aligned with the frame and makes the surviving row deterministic: it is
# always the first occurrence in the original row order.
duplicate_pairs = df.duplicated('Title')
print("Total number of duplicate questions : ", duplicate_pairs.sum())
# ~mask keeps the unique rows; .copy() yields an independent frame so that the
# columns added later are not written into a slice of the original data.
df = df[~duplicate_pairs].copy()
print("Dataframe shape after duplicate removal : ", df.shape)

In [None]:
# Add a "tag_count" column. Each Tags value is split on whitespace and the
# number of resulting pieces is stored as that question's tag count.
df["tag_count"] = df["Tags"].map(lambda tag_string: len(tag_string.split()))
df.head()

<div id="tagAnalysis">
<h3>Basic Data Analysis on Tags</h3>
</div>

In [None]:
# Frequency table of tag_count: value_counts() returns a Series mapping each
# distinct number of tags to how many questions have that many.
df["tag_count"].value_counts()

In [None]:
# Summary statistics (max, min, mean) of the per-question tag counts.
print( "Maximum number of tags in a question: ", df["tag_count"].max())
print( "Minimum number of tags in a question: ", df["tag_count"].min())
print( "Average number of tags in a question: ", df["tag_count"].mean())

In [None]:
# Bar chart of how many questions have each number of tags.
# The column is passed through the keyword argument x: seaborn only accepts
# `data` positionally, so a positional Series is not treated as the x variable
# on current releases (older releases already warned about it).
sns.countplot(x=df["tag_count"])
plt.title("Number of tags in questions ")
plt.xlabel("Number of Tags")
plt.ylabel("Frequency")

**Observations**:
1. Maximum number of tags in a question: **5**
2. Minimum number of tags in a question: **1**
3. Average number of tags per question: **2.92**
4. Most of the questions have either 2 or 3 tags

In [None]:
# Build a bag-of-words matrix over the Tags column. The custom tokenizer splits
# on whitespace only, so every space-separated tag becomes one vocabulary entry.
# fit_transform() learns that vocabulary and returns the document-term matrix
# (one row per question, one column per distinct tag).
vectorizer = CountVectorizer(tokenizer=lambda tag_string: tag_string.split())
tag_bow = vectorizer.fit_transform(df['Tags'])

In [None]:
# tag_bow has one row per question and one column per distinct tag.
print("Number of questions :", tag_bow.shape[0])
print("Number of unique tags :", tag_bow.shape[1])

In [None]:
# Vocabulary learned by the vectorizer, i.e. the distinct tags in column order.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations. Either way `tags` is a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    tags = vectorizer.get_feature_names_out().tolist()
else:
    tags = vectorizer.get_feature_names()
print("Some of the tags :", tags[:10])

#### Frequency of each tag

In [None]:
# Sum the document-term matrix down its rows (axis=0) to get one total per tag;
# .A1 flattens the resulting single-row matrix into a 1-D numpy array.
column_totals = tag_bow.sum(axis=0)
freq = column_totals.A1
# Pair every tag with its total number of occurrences.
tag_to_count_map = {tag: count for tag, count in zip(tags, freq)}

In [None]:
list = []
for key, value in tag_to_count_map.items():
  list.append([key, value]) 

In [None]:
tag_df = pd.DataFrame(list, columns=['Tags', 'Counts'])
tag_df.head()

In [None]:
# Rank the tags from most to least frequent and draw the full frequency curve
# (x axis: rank of the tag, y axis: number of occurrences).
tag_df_sorted = tag_df.sort_values(by='Counts', ascending=False)
plt.plot(tag_df_sorted['Counts'].to_numpy())
plt.grid()
plt.title("Distribution of frequency of tags based on appeareance")
plt.xlabel("Tag numbers for most frequent tags")
plt.ylabel("Frequency")

In [None]:
# Same frequency curve, zoomed in on the 100 most frequent tags.
plt.plot(tag_df_sorted['Counts'].iloc[:100].to_numpy())
plt.grid()
plt.title("Top 100 tags : Distribution of frequency of tags based on appeareance")
plt.xlabel("Tag numbers for most frequent tags")
plt.ylabel("Frequency")

In [None]:
# Top-100 frequency curve with markers on every 5th rank (green) and every
# 25th rank (red); the red markers are annotated with their (rank, count).
plt.plot(tag_df_sorted['Counts'][0:100].values)
plt.scatter(x=np.arange(0,100,5), y=tag_df_sorted['Counts'][0:100:5], c='g', label="quantiles with 0.05 intervals")
plt.scatter(x=np.arange(0,100,25), y=tag_df_sorted['Counts'][0:100:25], c='r', label = "quantiles with 0.25 intervals")
for x,y in zip(np.arange(0,100,25), tag_df_sorted['Counts'][0:100:25]):
    # The label is passed positionally: Matplotlib renamed this keyword from
    # `s` to `text`, so only the positional form works on every version.
    plt.annotate("({} , {})".format(x,y), xy=(x,y), xytext=(x-0.01, y+30))

plt.title('first 100 tags: Distribution of frequency of tags based on appeareance')
plt.grid()
plt.xlabel("Tag numbers for most frequent tags")
plt.ylabel("Frequency")
plt.legend()

In [None]:
# Number of tags whose usage count exceeds each threshold; summing the boolean
# mask counts the rows that satisfy the comparison.
print("{} tags are used more than 25 times".format((tag_df_sorted["Counts"] > 25).sum()))
print("{} tags are used more than 50 times".format((tag_df_sorted["Counts"] > 50).sum()))

**Observations**:
1. 144 tags are used more than 25 times.
2. 59 tags are used more than 50 times.
3. C# is the most frequently used tag (778 times).
4. Since some tags occur much more frequently than others, the micro-averaged F1-score is the appropriate metric for this problem.

Word map for most frequent Tags

In [None]:
# Word cloud in which every tag is drawn with a size proportional to its count.
tupl = dict(tag_to_count_map.items())  # shallow copy of the tag -> count mapping
word_cloud = WordCloud(width=1600, height=800).generate_from_frequencies(tupl)
plt.figure(figsize=(12, 8))
plt.imshow(word_cloud)
plt.axis('off')  # hide the axes: only the image matters
plt.tight_layout(pad=0)

**Observations**:

"c#", "java", "php", "android", "javascript", "jquery", "C++" are some of the most frequent tags.

Bar plot of top 20 tags

In [None]:
# Bar chart of the 20 most frequent tags (tag_df_sorted is already ordered by
# descending count, so head(20) is the top 20).
top_tags = tag_df_sorted.head(20)
i = np.arange(len(top_tags))
top_tags.plot(kind='bar')
plt.title('Frequency of top 20 tags')
# Supply exactly one label per tick. Passing the whole Tags column would give
# far more labels than tick positions, which Matplotlib rejects.
plt.xticks(i, top_tags['Tags'])
plt.xlabel('Tags')
plt.ylabel('Counts')
plt.show()

<div id="textProcessing">
    <h2>Text Processing</h2>
</div>

In [None]:
# English stop words held in a set, and the Snowball stemmer; both are used by
# the text-cleaning loop in the next cell.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

In [None]:
# Clean every question and build the text used for modelling.
#
# For each row:
#   1. note whether the body contains a <code> section and accumulate the raw
#      length of title + body;
#   2. drop <code>...</code> sections, then strip the remaining HTML tags;
#   3. prepend the title three times so that title words weigh more than body
#      words;
#   4. keep letters only, lower-case, tokenize, remove stop words and
#      single-character tokens (except "c", the language name), and stem.
#
# The text is processed as str throughout. Wrapping title/body in
# str(x.encode('utf-8')) is a Python 2 leftover: on Python 3 it produces the
# "b'...'" repr, whose escape sequences (a literal backslash + n, \xe2, ...)
# turn into junk tokens such as "nword" or "xe" once non-letters are removed.

# Patterns are compiled once instead of on every row. DOTALL lets a match span
# line breaks (a <code> section or a tag broken across lines).
code_block_re = re.compile('<code>(.*?)</code>', flags=re.MULTILINE|re.DOTALL)
html_tag_re = re.compile('<.*?>', flags=re.DOTALL)
non_letter_re = re.compile(r'[^A-Za-z]+')

qus_list=[]
qus_with_code = 0
len_before_preprocessing = 0
len_after_preprocessing = 0
# Only Title and Body are needed here; iterating the two columns avoids
# rebinding the notebook-level name `tags` to a single row's tag string.
for title, body in zip(df["Title"], df["Body"]):
    if '<code>' in body:
        qus_with_code+=1
    len_before_preprocessing+=len(title) + len(body)
    body = code_block_re.sub('', body)
    body = html_tag_re.sub(' ', body)
    question = " ".join((title, title, title, body))
    question = non_letter_re.sub(' ', question)
    words = word_tokenize(question.lower())
    question = ' '.join(stemmer.stem(j) for j in words if j not in stop_words and (len(j)!=1 or j=='c'))
    qus_list.append(question)
    len_after_preprocessing += len(question)
df["question_with_title"] = qus_list
avg_len_before_preprocessing=(len_before_preprocessing*1.0)/df.shape[0]
avg_len_after_preprocessing=(len_after_preprocessing*1.0)/df.shape[0]
print( "Avg. length of questions(Title+Body) before preprocessing: ", avg_len_before_preprocessing)
print( "Avg. length of questions(Title+Body) after preprocessing: ", avg_len_after_preprocessing)
print ("% of questions containing code: ", (qus_with_code*100.0)/df.shape[0])

In [None]:
# Keep only the cleaned question text and the tags for the modelling steps.
preprocessed_df = df[["question_with_title","Tags"]]
print("Shape of preprocessed data :", preprocessed_df.shape)

In [None]:
# Encode the tags as a multilabel target matrix (questions x distinct tags).
# binary=True records presence (0/1) instead of occurrence counts. The flag has
# to be the boolean True: the string 'true' only ever worked by being truthy
# and is rejected by scikit-learn versions that validate parameter types.
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary=True)
y_multilabel = vectorizer.fit_transform(preprocessed_df['Tags'])

In [None]:
def tags_to_consider(n):
    """Return the label matrix restricted to the `n` most frequent tags.

    Column totals of the module-level `y_multilabel` matrix give the number of
    questions carrying each tag. Columns are ranked by that total, highest
    first (ties keep their original column order), and the first `n` ranked
    columns are selected, in ranked order.
    """
    per_tag_total = y_multilabel.sum(axis=0).tolist()[0]
    ranked_columns = sorted(
        range(len(per_tag_total)),
        key=per_tag_total.__getitem__,
        reverse=True,
    )
    return y_multilabel[:, ranked_columns[:n]]

def questions_covered_fn(numb):
    """Count the questions left without any tag when only the top `numb` tags are kept.

    Despite its name, the value returned is the number of *uncovered*
    questions: rows of the reduced label matrix whose entries sum to zero.
    """
    per_question_total = tags_to_consider(numb).sum(axis=1)
    uncovered_mask = per_question_total == 0
    return np.count_nonzero(uncovered_mask)

In [None]:
# For every tag budget 100, 200, ... (below the total number of tags) record
# the percentage of questions that keep at least one tag, rounded to 3 decimals.
total_tags = y_multilabel.shape[1]
total_qus = preprocessed_df.shape[0]
questions_covered = [
    np.round(((total_qus - questions_covered_fn(i)) / total_qus) * 100, 3)
    for i in range(100, total_tags, 100)
]

In [None]:
# Coverage curve: share of questions that retain at least one tag, as a
# function of how many of the most frequent tags are kept.
plt.plot(np.arange(100,total_tags, 100),questions_covered)
plt.xlabel("Number of tags")
# The plotted values are percentages, so the axis is labelled accordingly.
plt.ylabel("Percentage of questions covered partially")
plt.grid()
plt.show()
# Budgets start at 100 and grow in steps of 100, so index 9 is the 1000-tag entry.
print(questions_covered[9],"% of questions covered by 1000 tags")
# The count below is evaluated for 1000 tags; the message now says so.
print("Number of questions that are not covered by 1000 tags : ", questions_covered_fn(1000),"out of ", total_qus)

In [None]:
# Restrict the label matrix to the 1000 most frequent tags and report what
# share of all distinct tags that selection represents.
yx_multilabel = tags_to_consider(1000)
print("Number of tags in the subset :", y_multilabel.shape[1])
print("Number of tags considered :", yx_multilabel.shape[1],"(",(yx_multilabel.shape[1]/y_multilabel.shape[1])*100,"%)")

In [None]:
# Hold out 20% of the questions for testing. random_state fixes the shuffle so
# that the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(preprocessed_df, yx_multilabel, test_size = 0.2,random_state = 42)
print("Number of data points in training data :", X_train.shape[0])
print("Number of data points in test data :", X_test.shape[0])

In [None]:
# TF-IDF features over whitespace-separated tokens, using 1- to 3-grams.
# min_df is a float, i.e. a proportion: terms present in fewer than 0.9% of the
# training documents are dropped. The vocabulary is learned on the training
# split only and then reused, unchanged, to transform the test split.
vectorizer = TfidfVectorizer(min_df=0.009, max_features=200000, tokenizer = lambda x: x.split(), ngram_range=(1,3))
X_train_multilabel = vectorizer.fit_transform(X_train['question_with_title'])
X_test_multilabel = vectorizer.transform(X_test['question_with_title'])

In [None]:
# Sanity check: feature and label matrices must have matching row counts.
print("Training data shape X : ",X_train_multilabel.shape, "Y :",y_train.shape)
print("Test data shape X : ",X_test_multilabel.shape,"Y:",y_test.shape)

In [None]:
def print_score(y_test, y_pred):
    """Print accuracy, macro F1, micro F1 and Hamming loss for a set of predictions.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels, in the same format as `y_test`.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy :", accuracy)

    macro_f1 = metrics.f1_score(y_test, y_pred, average='macro')
    print("Macro f1 score :", macro_f1)

    micro_f1 = metrics.f1_score(y_test, y_pred, average='micro')
    print("Micro f1 scoore :", micro_f1)

    hamming = metrics.hamming_loss(y_test, y_pred)
    print("Hamming loss :", hamming)

<div id="supervisedModels">
    <h2>Supervised ML Models</h2>
</div>

<div id="logisticRegression">
    <h3>Logistic Regression</h3>
</div>

In [None]:
# Baseline: scikit-learn's LogisticRegression (L2 penalty) wrapped in
# OneVsRestClassifier, which fits one binary classifier per tag.
# NOTE: the next cell defines a class that is also named LogisticRegression;
# once that cell has run, re-running this one picks up that class instead of
# the scikit-learn estimator.
clf2 = OneVsRestClassifier(LogisticRegression(penalty='l2'))
clf2.fit(X_train_multilabel, y_train)
y_pred2 = clf2.predict(X_test_multilabel)
print_score(y_test, y_pred2)

In [None]:
 class LogisticRegression(object):
    
    def __init__(Logreg, alpha=0.01, n_iteration=100):   
        Logreg.alpha = alpha                            
        Logreg.n_iter = n_iteration
        
    def _sigmoid_function(Logreg, x): #This function is resonsible for calculating the sigmoid value with given parameter
        value = 1 / (1 + np.exp(-x))
        return value
    def _cost_function(Logreg,h,theta, y): # The fuctions calculates the cost value
        m = len(y)
        cost = (1 / m) * (np.sum(-y.T.dot(np.log(h)) - (1 - y).T.dot(np.log(1 - h))))
        return cost
    
    def _gradient_descent(Logreg,X,h,theta,y,m): # This function calculates the theta value by gradient descent
        gradient_value = np.dot(X.T, (h - y)) / m
        theta -= Logreg.alpha * gradient_value
        return theta

    def predict(Logreg, X): # this function calls the max predict function to classify the individul feauter
        X = np.insert(X, 0, 1, axis=1)
        X_predicted = [max((Logreg._sigmoid_function(i.dot(theta)), c) for theta, c in Logreg.theta)[1] for i in X ]
        return X_predicted
    

In [None]:
# Train and evaluate the LogisticRegression class defined in the previous cell
# (it shadows the scikit-learn import of the same name).
# NOTE(review): n_iteration=30000 is applied to every label separately, so this
# cell is presumably very slow on the full label matrix -- confirm before running.
logi = LogisticRegression(n_iteration=30000).fit(X_train_multilabel, y_train)
y_pred1 = logi.predict(X_test_multilabel)
print_score(y_test, y_pred1)

<div id="SGD">
    <h3>SGD Classifier</h3>
</div>

In [None]:
from datetime import datetime

# Hyper-parameter search over the regularisation strength alpha: for each
# candidate, train a one-vs-rest linear SVM (hinge loss, L1 penalty) with SGD
# and report its micro-averaged F1 on the test split. The whole search is timed.
start = datetime.now()
alpha = [10**-8, 10**-6, 10**-4, 10**-2]
for i in alpha:
    base_estimator = SGDClassifier(loss='hinge', alpha=i, penalty='l1')
    classifier = OneVsRestClassifier(base_estimator, n_jobs=-1)
    classifier.fit(X_train_multilabel, y_train)
    predictions = classifier.predict(X_test_multilabel)
    micro_f1 = f1_score(y_test, predictions, average='micro')
    print("For alpha value = {}, Micro f1 score = {}".format(i, micro_f1))

elapsed = datetime.now() - start
print("Total Time taken = {}".format(elapsed))

In [None]:
# Retrain the one-vs-rest SGD classifier with a fixed alpha and print the full
# set of scores. alpha=10**-4 is hard-coded here -- presumably the best value
# observed in the search above; update it by hand if the search result changes.
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=10**-4, penalty='l1'), n_jobs=-1)
classifier.fit(X_train_multilabel, y_train)
predictions = classifier.predict(X_test_multilabel)

print_score(y_test, predictions)

# The reported duration covers training, prediction and scoring.
print("Total training time: {}".format(datetime.now() - start))

<div id="MLKNN">
    <h3>Multilabel KNN</h3>
</div>

In [None]:
# Multilabel k-nearest-neighbours (scikit-multilearn's MLkNN) with k=10,
# trained on the TF-IDF features and scored on the test split.
knn = MLkNN(k=10)
knn.fit(X_train_multilabel, y_train)
y_pred4 = knn.predict(X_test_multilabel)
print_score(y_test, y_pred4)

In [None]:
# Compare the micro-averaged F1 (in percent) of the three supervised models.
# Each value is listed in the same order as its label:
#   y_pred2 -> logistic regression, predictions -> SGD, y_pred4 -> MLkNN.
# (Listing the MLkNN score second would show it under the "SGD" bar.)
names =["LR","SGD","KNN"]
values = [
    metrics.f1_score(y_test, y_pred2, average = 'micro')*100,
    metrics.f1_score(y_test, predictions, average = 'micro')*100,
    metrics.f1_score(y_test, y_pred4, average = 'micro')*100,
]
plt.ylim(0,50)
plt.bar(names,values)
values

<div id="unsupervised">
    <h2>Unsupervised Learning Algorithms</h2>
</div>

<div id="kmeans">
    <h3>K-Means clustering</h3>
</div>

In [None]:
# Choose the number of clusters by silhouette score: fit K-Means for every
# k from 2 to kmax and record the Euclidean silhouette of each clustering.
kmax = 10
sil = []
for k in range(2, kmax + 1):
    kmeans = KMeans(n_clusters=k).fit(X_train_multilabel)
    labels = kmeans.labels_
    sil.append(silhouette_score(X_train_multilabel, labels, metric='euclidean'))

# sil[0] belongs to k=2, so the best position is shifted by 2 to recover k.
maxpos = sil.index(max(sil))
n_clusters = maxpos + 2

In [None]:
# Final K-Means fit with the selected number of clusters, using k-means++
# seeding, up to 300 iterations per run and 100 restarts (n_init).
model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=100)
model.fit(X_train_multilabel)
    
# Plot the centroids. Only columns 0 and 1 of the centroid coordinates are
# drawn, i.e. the first two feature dimensions.
plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1],s=250, marker='*',c='red', edgecolor='black',label='centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.show()

<div id="meanShift">
    <h3>Mean Shift</h3>
</div>

In [None]:
from sklearn.cluster import MeanShift
# Mean Shift with default settings, fitted on the first 3000 training rows
# only; the sparse slice is converted to a dense array before fitting.
ms = MeanShift()
ms.fit(X_train_multilabel[:3000,].toarray())
cluster_centers = ms.cluster_centers_

In [None]:
# 3-D view of the training points and the Mean Shift centres along the first
# three feature columns.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older Matplotlib

# Matplotlib cannot draw scipy sparse column slices, so the three columns that
# are plotted are converted to a dense array first.
points = X_train_multilabel[:, :3].toarray()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(points[:, 0], points[:, 1], points[:, 2], marker='o')
ax.scatter(cluster_centers[:,0], cluster_centers[:,1], cluster_centers[:,2], marker='x', color='red', s=300, linewidth=5, zorder=10)
plt.show()

<div id="references">
    <h2>References</h2>
</div>

- https://www.kaggle.com/vikashrajluhaniwal/multi-label-classification-for-tag-predictions
- https://github.com/gauravtheP/Stackoverflow-Tag-Prediction/blob/master/SO-Tag-Prediction/SO-Tag-Prediction.ipynb