# Loading data

In [1]:
import pandas as pd
df=pd.read_csv('stackoverflowtags.csv')
df.head(2)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"


In [2]:
df.title.value_counts() # counts the number of occurances in the data

#NAME?                                                                      5
Conversion failed when converting date and/or time from character string    3
Input string was not in a correct format                                    2
Could not find default endpoint element that references contract            2
Object reference not set to an instance of an object                        2
                                                                           ..
Game trees and data structures to implement them in java?                   1
NSPredicate predicateWithFormat:(NSString*) inconsistency?                  1
document.location.href is not working in IE                                 1
segmentation fault in C, core dumped, gdb output                            1
Create a List of primitive int?                                             1
Name: title, Length: 99980, dtype: int64

# Droping Duplicates

In [3]:
df.title=df.title.drop_duplicates(keep='first') # drops the duplicates
df.title.value_counts()

How to draw a stacked dotplot in R?                                            1
Hibernate JPA - ManyToOne relationship not populated                           1
how to work treeview nodes contains in asp.net?                                1
Initialize script                                                              1
ASP.NET C# Must declare the scalar variable                                    1
                                                                              ..
Can jquery animate the css background property?                                1
how to access variable from php in jquery using ajax(here i am using Json)     1
Best way to handle session management in spring mvc filters or interceptors    1
Can't run the .java file                                                       1
Create a List of primitive int?                                                1
Name: title, Length: 99980, dtype: int64

# Text Cleaning

In [4]:
import re       # importing regular expressions used for cleaning texts

# importing natural language toolkit
# that helps in cleaning texts by using
# stopwords, SnowballStemmer, WordNetLemmatizer libraries

from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# function to remove html tags and other 
# unwanted stuff in the question asked by the user

def clean(s):
    s=str(s)
    s=s.lower()
    html=re.compile('<.*?>')   #removing html tags
    cleaned = re.sub(html,' ',s)
    fil=[]
    for i in cleaned.split():    # splits the text and repalces the unwanted characters with ''
        if i!='c++':
            cleaned=re.sub('[^A-Za-z]', '', i) #search the pattern !(A-Z & a-z) and replace with ''
            fil.append(cleaned)
        else:
            fil.append(i)
    return fil         # returns the splitted text with removed stopwords and html tags

stop=set(stopwords.words('english'))  #loading stopwords in english to compare and remove
sno=SnowballStemmer('english')
clean(df.title[0])                # function call to clean the text

['how', 'to', 'draw', 'a', 'stacked', 'dotplot', 'in', 'r']

# Steaming

In [5]:
# function to stem the data
# stemming means grouping the words
# after cleaning the data for further processing

def stem(s):
    fil=[]
    for i in s:
        if i not in stop:
            s=(sno.stem(i).encode('utf8'))  # encoding the data into a clean file
            fil.append(s)
    s=b' '.join(fil)
    return s

In [6]:
# creating the new columns 
# cleaned questions and cleaned tags
# after cleaning the data
ques=[]
for j in df.title:
    ques.append(stem(clean(j)))
df['cleanQues'] = ques

import re
ctags=[]
for i in df.tags:
    ctags.append(re.sub('[^A-Za-z#+-]', ' ', i)) # search the pattern !(A-Z & a-z) and replace with ''
df['cleanTags']=ctags

# After cleaning and stemming

In [7]:
df.head(10)

Unnamed: 0,title,tags,cleanQues,cleanTags
0,How to draw a stacked dotplot in R?,['r'],b'draw stack dotplot r',r
1,mysql select all records where a datetime fiel...,"['php', 'mysql']",b'mysql select record datetim field less speci...,php mysql
2,How to terminate windows phone 8.1 app,['c#'],b'termin window phone app',c#
3,get current time in a specific country via jquery,"['javascript', 'jquery']",b'get current time specif countri via jqueri',javascript jquery
4,Configuring Tomcat to Use SSL,['java'],b'configur tomcat use ssl',java
5,Awesome nested set plugin - how to add new chi...,['ruby-on-rails'],b'awesom nest set plugin add new children tre...,ruby-on-rails
6,How to create map from JSON response in Ruby o...,"['ruby', 'ruby-on-rails-3', 'json']",b'creat map json respons rubi rail ',ruby ruby-on-rails- json
7,rspec test if method is called,['ruby'],b'rspec test method call',ruby
8,SpringBoot Catalina LifeCycle Exception,"['java', 'spring', 'spring-mvc']",b'springboot catalina lifecycl except',java spring spring-mvc
9,How to import data from excel to mysql databas...,"['php', 'codeigniter']",b'import data excel mysql databas use php',php codeigniter


In [8]:
# creating the new dataset that only 
# consists cleaned questions and tags

d=pd.DataFrame()
d['text']=df.cleanQues
d['tags']=df.cleanTags
d.to_csv('datafinal',index=False)
df = pd.read_csv('datafinal')
df.head()

Unnamed: 0,text,tags
0,b'draw stack dotplot r',r
1,b'mysql select record datetim field less speci...,php mysql
2,b'termin window phone app',c#
3,b'get current time specif countri via jqueri',javascript jquery
4,b'configur tomcat use ssl',java


# Splitting Datasets

In [9]:
# splitting the dataset to train and test in 80% and 20%

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df.text, df.tags, test_size=0.2, random_state=9)

# Converting text and tags to vectors

In [11]:
# Used ti-idf , bow

# importing TfidfVectorizer , CountVectorizer from sklearn.feature_extraction
# to convert the text and tags to vectors
# so that we can train and test the dataset

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tfvectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2",
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_multilabel = tfvectorizer.fit_transform(x_train)
x_test_multilabel = tfvectorizer.transform(x_test)

vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary=True)
y_train_multilabel = vectorizer.fit_transform(y_train)
y_test_multilabel = vectorizer.transform(y_test)

# Traning Using One vs Rest

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='liblinear'))
classifier.fit(x_train_multilabel, y_train_multilabel)
predictions = classifier.predict(x_test_multilabel)

accuracy = metrics.accuracy_score(y_test_multilabel, predictions)
macro_f1_score = metrics.f1_score(y_test_multilabel, predictions, average='macro')
micro_f1_score = metrics.f1_score(y_test_multilabel, predictions, average='micro')
hamming_loss = metrics.hamming_loss(y_test_multilabel, predictions)

print("accuracy:", accuracy)
print("macro f1 score:", macro_f1_score)
print("micro f1 score:", micro_f1_score)
print("hamming loss:", hamming_loss)

accuracy: 0.2834
macro f1 score: 0.3844907670603581
micro f1 score: 0.5823199799280757
hamming loss: 0.0124855


# Classification reports

In [16]:
# classification report
print("Precision recall report :\n",metrics.classification_report(y_test_multilabel, predictions))

Precision recall report :
               precision    recall  f1-score   support

           0       0.77      0.40      0.52       327
           1       0.60      0.03      0.06        96
           2       0.95      0.47      0.63       555
           3       0.97      0.56      0.71       294
           4       0.83      0.06      0.11        83
           5       0.56      0.31      0.40       447
           6       0.90      0.34      0.50       950
           7       0.78      0.19      0.30       588
           8       0.86      0.54      0.66      3763
           9       0.94      0.39      0.55      1295
          10       0.42      0.05      0.09        96
          11       0.00      0.00      0.00       101
          12       0.98      0.59      0.73       140
          13       0.65      0.13      0.21       359
          14       0.76      0.48      0.59        87
          15       0.20      0.01      0.03       150
          16       0.50      0.11      0.19       132


In [17]:
# using dill saving the classification, tfvectorization nad vectorization 
# in three files and using them to predict the tags

import dill
model_data = 'model_data.sav'
tfidf_data = 'tfidf_data.sav'
bow_data = 'bow_data.sav'
dill.dump(classifier, open(model_data, 'wb'))
dill.dump(tfvectorizer, open(tfidf_data, 'wb'))
dill.dump(vectorizer, open(bow_data, 'wb'))