In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re
import os
import gensim
from sqlalchemy import create_engine 
import datetime as dt
import joblib

#import nltk
#nltk.download('stopwords')
#import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from datetime import datetime

# Taxonomy Creation: Stack ExchangeTag Prediction

In [2]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)
 
    return None

def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return:
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)
        
def checkTableExists(dbcon):
    cursr = dbcon.cursor()
    str = "select name from sqlite_master where type='table'"
    table_names = cursr.execute(str)
    print("Tables in the database:")
    tables =table_names.fetchall() 
    print(tables[0][0])
    return(len(tables))

def create_database_table(database, query):
    conn = create_connection(database)
    if conn is not None:
        create_table(conn, query)
        checkTableExists(conn)
    else:
        print("Error! cannot create the database connection.")
    conn.close()


In [29]:
#Taking entries to a dataframe from our saved data
write_db = 'Titlemoreweight.db'
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed LIMIT 3000000""", conn_r)
conn_r.commit()
conn_r.close()

In [30]:
print(preprocessed_data.shape)
preprocessed_data.head()

(2999999, 2)


Unnamed: 0,question,tags
0,dynam datagrid bind silverlight dynam datagrid...,c# silverlight data-binding
1,dynam datagrid bind silverlight dynam datagrid...,c# silverlight data-binding columns
2,java.lang.noclassdeffounderror javax servlet j...,jsp jstl
3,java.sql.sqlexcept microsoft odbc driver manag...,java jdbc
4,better way updat feed fb php sdk better way up...,facebook api facebook-php-sdk


In [31]:
print("number of data points in sample :", preprocessed_data.shape[0])
print("number of dimensions :", preprocessed_data.shape[1])

number of data points in sample : 2999999
number of dimensions : 2


__ Converting string Tags to multilable output variables __ 

In [32]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
multilabel_y = vectorizer.fit_transform(preprocessed_data['tags'])

__ Selecting 400 Tags __

In [33]:
def tags_to_choose(n):
    t = multilabel_y.sum(axis=0).tolist()[0]
    sorted_tags_i = sorted(range(len(t)), key=lambda i: t[i], reverse=True)
    multilabel_yn=multilabel_y[:,sorted_tags_i[:n]]
    return multilabel_yn

def questions_explained_fn(n):
    multilabel_yn = tags_to_choose(n)
    x= multilabel_yn.sum(axis=1)
    return (np.count_nonzero(x==0))

In [34]:
no_of_tags = 400
total_tags=multilabel_y.shape[1]
total_qs=preprocessed_data.shape[0]
ques_explained = questions_explained_fn(no_of_tags)
# we will be taking 5000 tags
multilabel_yx = tags_to_choose(no_of_tags)
print("number of questions that are not covered ", ques_explained ,"out of ", total_qs)
print("With ",no_of_tags,"tags we are covering ",np.round(((total_qs-ques_explained)/total_qs)*100,3),"% of questions")

number of questions that are not covered  346542 out of  2999999
With  400 tags we are covering  88.449 % of questions


In [35]:
train_datasize = 2000000
# Spliting the dataset into train and test
x_train=preprocessed_data.head(train_datasize)
x_test=preprocessed_data.tail(preprocessed_data.shape[0] - train_datasize)

y_train = multilabel_yx[0:train_datasize,:]
y_test = multilabel_yx[train_datasize:preprocessed_data.shape[0],:]

In [36]:
print("Number of data points in train data :", y_train.shape[0])
print("Number of data points in test data :", y_test.shape[0])

Number of data points in train data : 2000000
Number of data points in test data : 999999


## 4. Machine Learning Models 

### Converting into vectors

In [37]:
start = datetime.now()
# converting data into BoW Vectorizer
count_vect = CountVectorizer(ngram_range=(1,2), max_features=1000)
x_train_multilabel = count_vect.fit_transform(x_train['question'])
x_test_multilabel = count_vect.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:14:26.649355


In [38]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)

Dimensions of train data X: (2000000, 1000) Y : (2000000, 400)
Dimensions of test data X: (999999, 1000) Y: (999999, 400)


## 4.1 Applying  MultinomialNB with OneVsRest Classifier

In [39]:
from sklearn.naive_bayes import MultinomialNB

start = datetime.now()

classifier_1 = OneVsRestClassifier(MultinomialNB(alpha = 0.001))
classifier_1.fit(x_train_multilabel, y_train)
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:11:09.650740


In [40]:
predictions_1 = classifier_1.predict(x_test_multilabel)
print("Accuracy :",metrics.accuracy_score(y_test, predictions_1))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions_1))

precision = precision_score(y_test, predictions_1, average='micro')
recall = recall_score(y_test, predictions_1, average='micro')
f1 = f1_score(y_test, predictions_1, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions_1, average='macro')
recall = recall_score(y_test, predictions_1, average='macro')
f1 = f1_score(y_test, predictions_1, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions_1))

Accuracy : 0.002797002797002797
Hamming loss  0.05820874320874321
Micro-average quality numbers
Precision: 0.0585, Recall: 0.8198, F1-measure: 0.1092
Macro-average quality numbers
Precision: 0.0479, Recall: 0.8228, F1-measure: 0.0855
              precision    recall  f1-score   support

           0       0.24      0.62      0.34     69974
           1       0.39      0.77      0.51     80038
           2       0.36      0.86      0.51     87470
           3       0.32      0.84      0.47     63908
           4       0.35      0.84      0.50     40510
           5       0.29      0.90      0.44     44635
           6       0.13      0.72      0.23     27645
           7       0.29      0.84      0.43     39786
           8       0.22      0.70      0.33     23977
           9       0.29      0.93      0.44     45546
          10       0.12      0.57      0.19     23442
          11       0.15      0.80      0.26     24200
          12       0.23      0.72      0.35     23686
         

In [41]:
filename = 'NB_model.sav'
joblib.dump(classifier_1, filename)

['NB_model.sav']

In [46]:
# load the model from disk
filename = 'NB_model.sav'
classifier_1 = joblib.load(filename)

In [50]:
# example : output from the trained model
q_no = 101
ques_with_imp = dict()
tag_names = vectorizer.get_feature_names()

predictions = classifier_1.predict(x_test_multilabel[q_no])
pred_prob = (classifier_1.predict_proba(x_test_multilabel[q_no])[0])

print("Question ",q_no,": \n",list(x_test['question'])[q_no])

print("\nTaxonomy created (Predicted Tags with their importance ) : ")
pred_indx = np.nonzero(predictions.toarray()[0])

for i in pred_indx[0]:
    if pred_prob[i]*100 > 80:
        ques_with_imp[tag_names[i]] = np.round(pred_prob[i]*100,2)
ques_with_imp = sorted(ques_with_imp.items(), key=lambda kv: kv[1],reverse=True)

for i,j in ques_with_imp:
    print(j,"\t\t",i)
    
print("\nTrue Tags:")
test_indx = np.nonzero(y_test.toarray()[q_no])
for i in test_indx[0]:
    print(tag_names[i])

Question  101 : 
 java getconstructor java getconstructor java getconstructor wrote question comment code think easier understand way suggest thank advanc

Taxonomy created (Predicted Tags with their importance ) : 
99.96 		 .app

 True Tags:
.app
8087cw


## 4.2. Applying GridSearchCV for Logistic Regression with OneVsRest Classifier

#### Hyperparam tuning on lambda for Logistic regression

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

param = {"estimator__C": [10**-4, 10**-2, 10**0, 10**2]}
n_folds = 2
classifier_2 = OneVsRestClassifier(LogisticRegression(penalty='l1'))
rndm = RandomizedSearchCV(estimator=classifier_2, param_distributions=param, cv=n_folds, scoring='f1_micro',verbose=2, n_jobs=-1)
rndm.fit(x_train_multilabel, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed: 22.0min remaining:  7.3min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 74.9min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=OneVsRestClassifier(estimator=LogisticRegression(C=1.0,
                                                                              class_weight=None,
                                                                              dual=False,
                                                                              fit_intercept=True,
                                                                              intercept_scaling=1,
                                                                              l1_ratio=None,
                                                                              max_iter=100,
                                                                              multi_class='warn',
                                                                              n_jobs=None,
                                                                              penalty='l1',
       

In [15]:
print("Best estimator for the model :\n ",rndm.best_estimator_)
print("Best Score for the model : ",rndm.best_score_)

Best estimator for the model :
  OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l1',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)
Best Score for the model :  0.36279099485260147


In [16]:
#save model
joblib.dump(classifier_2, 'LR_model') 

['LR_model']

In [13]:
# load the model from disk
filename = 'LR_model'
classifier_2 = joblib.load(filename)

In [15]:
classifier_2.fit(x_train_multilabel, y_train)

predictions_2 = classifier_2.predict(x_test_multilabel)
print("Accuracy :",metrics.accuracy_score(y_test, predictions_2))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2))


precision = precision_score(y_test, predictions_2, average='micro')
recall = recall_score(y_test, predictions_2, average='micro')
f1 = f1_score(y_test, predictions_2, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions_2, average='macro')
recall = recall_score(y_test, predictions_2, average='macro')
f1 = f1_score(y_test, predictions_2, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions_2))

Accuracy : 0.1472
Hamming loss  0.00436925
Micro-average quality numbers
Precision: 0.5162, Recall: 0.2363, F1-measure: 0.3242
Macro-average quality numbers
Precision: 0.2739, Recall: 0.1665, F1-measure: 0.1918
              precision    recall  f1-score   support

           0       0.81      0.35      0.49       820
           1       0.49      0.03      0.06      1931
           2       0.50      0.12      0.19       544
           3       0.63      0.18      0.29       222
           4       0.80      0.43      0.56      1311
           5       0.84      0.44      0.58      1014
           6       0.76      0.32      0.45      1374
           7       0.82      0.53      0.65       702
           8       0.88      0.55      0.68      1424
           9       0.76      0.09      0.16      1037
          10       0.61      0.25      0.35       797
          11       0.67      0.37      0.47       156
          12       0.57      0.44      0.50        36
          13       0.73      0.3

In [28]:
# example : output from the trained model
q_no = 0
ques_with_imp = dict()
tag_names = vectorizer.get_feature_names()

predictions = classifier_2.predict(x_test_multilabel[q_no])
pred_prob = (classifier_2.predict_proba(x_test_multilabel[q_no])[0])

print("Question ",q_no,": \n",list(x_test['question'])[q_no])

print("\nTaxonomy created (Predicted Tags with their importance ) : ")
pred_indx = np.nonzero(predictions.toarray()[0])

for i in pred_indx[0]:
    ques_with_imp[tag_names[i]] = np.round(pred_prob[i]*100,2)
ques_with_imp = sorted(ques_with_imp.items(), key=lambda kv: kv[1],reverse=True)

for i,j in ques_with_imp:
    print(j,"\t\t",i)
    
print("\n True Tags:")
test_indx = np.nonzero(y_test.toarray()[q_no])
for i in test_indx[0]:
    print(tag_names[i])

Question  0 : 
 abort socket oper window phone abort socket oper window phone abort socket oper window phone use pseudo-synchron socket window phone applic socket code base sampl http msdn.microsoft.com en-us librari hh202858 vs.92 .aspx server send pattern somewhat unpredict start fixed-s header contain length rest messag first read header read specifi number byte socket sinc need send messag server well attempt duplex socket thread receiv anoth thread send caus lot problem loop like code work fine major time strang thing happen messag null nbecaus timeout receiv method see link sampl use manualresetev receiv request socket never actual cancel even though method return request wait around somewher data avail socket chomp header event handler noth data receiv sinc method return variabl method never use data basic disappear read request expect return header skip read byte header idea long messag like abl cancel outstand request socket time use anonym method like sampl sinc simplifi ever

# Conclusion

In [51]:
from prettytable import PrettyTable
table = PrettyTable()
table.field_names = [ "Model","hyperparameter", "Accuracy","HammingLoss" , "MicroAvg F1-Score","MacroAvg F1-Score"]
table.add_row(["MultinomialNB OneVsRestClassifier","alpha = 0.001", "0.0027" , "0.05820", "0.1092","0.0855"])
table.add_row(["Logistic Regression OneVsRestClassifier","C = 1"," 0.1472","0.0043","0.3242","0.1918"])
print(table)

+-----------------------------------------+----------------+----------+-------------+-------------------+-------------------+
|                  Model                  | hyperparameter | Accuracy | HammingLoss | MicroAvg F1-Score | MacroAvg F1-Score |
+-----------------------------------------+----------------+----------+-------------+-------------------+-------------------+
|    MultinomialNB OneVsRestClassifier    | alpha = 0.001  |  0.0027  |   0.05820   |       0.1092      |       0.0855      |
| Logistic Regression OneVsRestClassifier |     C = 1      |  0.1472  |    0.0043   |       0.3242      |       0.1918      |
+-----------------------------------------+----------------+----------+-------------+-------------------+-------------------+


<h1> Step by step procedure:</h1>

<h2> <font color='blue'>1.  Business Problem: </font></h2>
It covers the basic details which should be known before solving the case study.<br>
<p>
**1.1. Description:** describes the background details of the StackOverFlow website which is must to know to get the insights.<br>
**1.2. Problem Statemtent:** describes the problem which we are intended to solve.<br>
**1.3. Real World / Business Objectives and Constraints:** describes the objectives which we have to keep in mind while solving the problem. We need to give proper attention towards the constraints stated under this.
</p>

<h2> <font color='blue'>2. Machine Learning problem:</font></h2>
Looking into the problem as a Machine learning problem.
<p>
**2.1 Data Overview:** Understanding the data and the data fields.<br>
**2.2 Mapping the real-world problem to a Machine Learning Problem:** <br>
_2.2.1 Type of Machine Learning Problem:_ Understand the type of problem i.e. classification (binary classification, Multi-class classification, Multi-label classification), regression, etc<br>
_2.2.2 Performance Metric:_ Percieve the appropriate metric for this problem.
</p>


<h2> <font color='blue'>3. Exploratory Data Analysis:</font></h2>
<p>
**3.1 Data Loading and Cleaning **<br>
*3.1.1 Using Pandas with SQLite to Load the data:* As the size of dataset is too large thus using SQLite for faster implementation.<br>
*3.1.2 Counting the number of rows:* understanding data better<br>
*3.1.3 Checking for duplicates:* removing any duplicates if present<br>
</p>
<p>
**3.2 Analysis of Tags **<br>
*3.2.1 Total number of unique tags: * getting number of unique tags to use them as class labels.<br>
*3.2.2 Number of times a tag appeared:* get the frequencies for each tag<br>
*3.2.3 Most frequent tag:* keep the most frequent tag and ignore the others for better model.<br>
*3.2.4 Top 20 Tags:* to show the top 20 tags in the corpus
</p>
<p>
**3.3 Analysis of Titles**<br>
*3.3.1 Similarity between the words in Title and the Tags*
</p>
<p>
**3.4 Preprocessing** <br>
<ol>
<li> Sample data points</li>
<li> Separate out code-snippets from Body </li>
<li> Remove Special characters from Question title and description (not in code)</li>
<li> Remove stop words (Except 'C')</li>
<li> Remove HTML Tags</li>
<li> Convert all the characters into small letters</li>
<li> Use SnowballStemmer to stem the words</li>
</ol>
</p>

<h2> <font color='blue'> 4. Machine Learning Models</font></h2>
<br>
**Converting tags for multilabel problems:** Each tag is to be converted to a label. This allows us to apply Multi-Label Classification problem<br>
**4.1 Applying MultinomialNB with OneVsRest Classifier** <br>
**4.2. Applying GridSearchCV for Logistic Regression with OneVsRestClassifier**<br>
<p> As we are dealing with high dimension data and using OneVsRestClassifier thus Linear models work best.</p>