In [10]:
import pandas as pd
from nltk.tokenize import word_tokenize
from ast import literal_eval
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
import pickle

## Read Data

In [16]:
# Load the tab-separated training and validation splits in one pass.
train_data, validation_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('data/train.tsv', 'data/validation.tsv')
)

In [17]:
# The tags column is read from the TSV as text; literal_eval turns each cell
# into the Python object it spells out.
for frame in (train_data, validation_data):
    frame['tags'] = frame['tags'].apply(literal_eval)

In [18]:
# Features are the question titles, targets are the per-question tag values.
X_train = train_data['title'].values
y_train = train_data['tags'].values
X_val = validation_data['title'].values
y_val = validation_data['tags'].values

In [19]:
# Display the first training title as a quick sanity check of the loaded data.
X_train[0]

'How to draw a stacked dotplot in R?'

In [20]:
def buildVocabularyY(data):
    """Count how often each tag occurs across a collection of tag lists.

    Tags are lower-cased before counting, so ``'R'`` and ``'r'`` share a
    single entry.

    Parameters
    ----------
    data : iterable of iterables of str
        One inner iterable of tag strings per example.

    Returns
    -------
    dict
        Maps every lower-cased tag to its number of occurrences. Keys appear
        in the order the tags were first seen. Empty input yields ``{}``.
    """
    # Imported locally so this cell does not depend on the notebook's import cell.
    from collections import Counter

    tag_counts = Counter(tag.lower() for tags in data for tag in tags)
    # Hand back a plain dict so callers get the same type as before.
    return dict(tag_counts)

In [21]:
# Frequency of every (lower-cased) tag across the training labels.
YVocabulary = buildVocabularyY(y_train)

In [22]:
# Sorted list of every tag seen in training; preview the first ten entries.
tags_classes = sorted(YVocabulary)
tags_classes[:10]

['.net',
 'ajax',
 'algorithm',
 'android',
 'angularjs',
 'apache',
 'arrays',
 'asp.net',
 'asp.net-mvc',
 'c']

In [23]:
# Binarize the tag lists against the fixed, sorted tag vocabulary so the
# column order of the label matrix is determined by tags_classes.
# NOTE(review): tags_classes is built from lower-cased tags, while the label
# lists are passed here as-is; a tag containing upper-case characters would not
# match any class. Confirm the data is already lower-case.
mlb = MultiLabelBinarizer(classes=tags_classes)
y_train = mlb.fit_transform(y_train)
y_val = mlb.transform(y_val)

# Persist the fitted binarizer so predictions can be mapped back to tag names
# later. The context manager guarantees the file is flushed and closed.
with open("MultiLabelBinarizer.pickle", "wb") as mlb_file:
    pickle.dump(mlb, mlb_file)


In [25]:
# Display the label row of the second training example.
y_train[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [26]:
# One L2-regularised logistic regression is wrapped one-vs-rest for the tags.
logisticRegessionClassifier = LogisticRegression(penalty='l2', C=1.0)
ovrclassifier = OneVsRestClassifier(estimator=logisticRegessionClassifier)

# Titles are turned into token counts before reaching the classifier.
vec = CountVectorizer()
vec_clf = Pipeline(steps=[
    ('vectorizer', vec),
    ('pac', ovrclassifier),
])
vec_clf.fit(X_train, y_train)




Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...te=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [27]:
# Run the fitted pipeline on the validation titles.
predicted = vec_clf.predict(X_val)

In [29]:
# Map the binarized predictions and ground truth back to tag names.
y_val_pred_inversed = mlb.inverse_transform(predicted)
y_val_inversed = mlb.inverse_transform(y_val)

# Print the first four validation examples with true and predicted tags.
report_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
for idx in range(4):
    true_labels = ','.join(y_val_inversed[idx])
    predicted_labels = ','.join(y_val_pred_inversed[idx])
    print(report_template.format(X_val[idx], true_labels, predicted_labels))

Title:	Why odbc_exec always fail?
True labels:	php,sql
Predicted labels:	


Title:	Access a base classes variable from within a child class
True labels:	javascript
Predicted labels:	


Title:	Content-Type "application/json" not required in rails
True labels:	ruby,ruby-on-rails
Predicted labels:	ruby-on-rails


Title:	Sessions in Sinatra: Used to Pass Variable
True labels:	ruby,session
Predicted labels:	




In [30]:
# Persist the fitted pipeline to 'classifier.pkl'. compress=9 is forwarded to
# joblib.dump (presumably the strongest compression level - confirm in the
# joblib docs).
joblib.dump(vec_clf, 'classifier.pkl', compress=9)

['classifier.pkl']

In [31]:
# Load the test split and pull out its question titles.
test_data = pd.read_csv('data/test.tsv', sep='\t')
X_test = test_data['title'].values

In [36]:
# Reload the persisted pipeline and label binarizer from disk.
# NOTE: unpickling (joblib.load and pickle.load alike) can execute arbitrary
# code, so only load artifact files that come from a trusted source.
loadedClassifier = joblib.load('classifier.pkl')
# The context manager closes the file handle once the binarizer is read.
with open("MultiLabelBinarizer.pickle", "rb") as mlb_file:
    mbl_loaded = pickle.load(mlb_file)

In [50]:
# Ad-hoc question title used to try out the reloaded model.
question = "Getting error - type json does not exist - in Postgresql during rake db migrate"

In [51]:
# Classify the single ad-hoc question with the reloaded pipeline; predict()
# is given a one-element list here.
# NOTE: this rebinds `predicted`, replacing the validation predictions.
predicted = loadedClassifier.predict([question])
# Map the prediction back to tag names using the reloaded binarizer.
y_test_pred_inversed = mbl_loaded.inverse_transform(predicted)
y_test_pred_inversed

[('json', 'ruby-on-rails')]

Title:	Why odbc_exec always fail?
Predicted labels:	


Title:	Access a base classes variable from within a child class
Predicted labels:	


Title:	Content-Type "application/json" not required in rails
Predicted labels:	ruby-on-rails


Title:	Sessions in Sinatra: Used to Pass Variable
Predicted labels:	


Title:	Getting error - type "json" does not exist - in Postgresql during rake db migrate
Predicted labels:	json,ruby-on-rails


Title:	library not found for.....?
Predicted labels:	


Title:	.csproj File - Programmatic adding/deleting files
Predicted labels:	c#


Title:	TypeError: makedirs() got an unexpected keyword argument 'exists_ok'
Predicted labels:	python


Title:	How to Pan a div using JQuery
Predicted labels:	javascript,jquery


Title:	Hibernate intermediate/advanced tutorials
Predicted labels:	hibernate,java


