In [13]:
import importlib
import os
from joblib import dump, load

import numpy as np
import pandas as pd

import ml.explo as mlexplo
import ml.prepare as mlprepare

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn import metrics

data_path = "..\\data\\stackoverflow\\"
model_path = data_path + "models\\"
np.set_printoptions(precision=2, suppress=True)
pd.options.display.float_format = "{:,.2f}".format

In [3]:
y_dict = load(data_path + 'top50_y_dict.joblib')
title_vectorizer = load(model_path + 'top50_title_vectorizer.joblib')
body_vectorizer = load(model_path + 'top50_body_vectorizer.joblib')

In [4]:
model = load(data_path + 'baseline_python.joblib')

In [6]:
txt = 'I need help avoiding dict linesplit '
tv = body_vectorizer.transform([txt])
print ("Reverse : ", body_vectorizer.inverse_transform(tv))
# print ('Should I tag python?', clf.predict(tv))
print ('Should I tag python?', model.predict(tv))


Reverse :  [array(['need', 'help', 'dict'], dtype='<U674')]
Should I tag python? [1]


In [14]:
title_corpus = load(data_path + 'top50_title_corpus.joblib')
X_title = title_vectorizer.fit_transform(title_corpus)

body_corpus = load(data_path + 'top50_body_corpus.joblib')
X_body = body_vectorizer.fit_transform(body_corpus)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_body, y_dict['python'], random_state=42, test_size=0.2, shuffle=True)
params = {
    'alpha': np.arange(10)*.1+.1,
    'max_iter': [250],
    }
clf = GridSearchCV(
    RidgeClassifier(),
    params,
    n_jobs=6)
clf.fit(X_train, y_train)
clf.best_estimator_

RidgeClassifier(alpha=0.6, max_iter=250)

In [9]:
clf.best_estimator_.score(X_test, y_test)

0.8822

In [19]:
dump(clf.best_estimator_, data_path + 'baseline_python.joblib')

['..\\data\\stackoverflow\\baseline_python.joblib']

[array(['on', 'need', 'help', 'dict'], dtype='<U674')]

In [19]:
params = {
    'alpha': np.arange(10)*.1+.1,
    'max_iter': [250],
    }
clf = GridSearchCV(
    RidgeClassifier(),
    params,
    n_jobs=6)

baseline =  dict({})

for k in y_dict.keys():
    print("classifying:", k)
    X_train, X_test, y_train, y_test = train_test_split(X_body, y_dict[k], random_state=42, test_size=0.2, shuffle=True)
#     X_train, X_test, y_train, y_test = train_test_split(X_title, y_dict[k], random_state=42, test_size=0.2, shuffle=True)
#     clf = RidgeClassifier().fit(X_train, y_train)
    clf = GridSearchCV(
        RidgeClassifier(),
        params,
        n_jobs=6)
    clf.fit(X_train, y_train)
    baseline[k] = clf.best_estimator_
    print("GS best:", clf.best_estimator_)
    print("clf score: ", clf.best_estimator_.score(X_test, y_test))
    pred = clf.best_estimator_.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print("=========================")
dump(baseline, model_path + 'top50_body_RidgeC_baseline.joblib')

classifying: python
GS best: RidgeClassifier(alpha=0.6, max_iter=250)
clf score:  0.8822
accuracy:   0.882
confusion matrix:
[[7996  200]
 [ 978  826]]
classifying: other
GS best: RidgeClassifier(max_iter=250)
clf score:  0.8802
accuracy:   0.880
confusion matrix:
[[8498  125]
 [1073  304]]
classifying: javascript
GS best: RidgeClassifier(alpha=0.5, max_iter=250)
clf score:  0.9093
accuracy:   0.909
confusion matrix:
[[8714  143]
 [ 764  379]]
classifying: r
GS best: RidgeClassifier(alpha=0.4, max_iter=250)
clf score:  0.9472
accuracy:   0.947
confusion matrix:
[[9148   43]
 [ 485  324]]
classifying: pandas
GS best: RidgeClassifier(alpha=0.6, max_iter=250)
clf score:  0.963
accuracy:   0.963
confusion matrix:
[[9311   74]
 [ 296  319]]
classifying: java
GS best: RidgeClassifier(alpha=0.30000000000000004, max_iter=250)
clf score:  0.9541
accuracy:   0.954
confusion matrix:
[[9362   57]
 [ 402  179]]
classifying: c++
GS best: RidgeClassifier(alpha=0.30000000000000004, max_iter=250)
clf s

['..\\data\\stackoverflow\\models\\top50_body_RidgeC_baseline.joblib']

In [27]:
title_vectorizer = TfidfVectorizer(ngram_range=(1,1))
body_vectorizer = TfidfVectorizer(ngram_range=(1,1))
# vectorizer = TfidfVectorizer(ngram_range=(2,2))
X_title = title_vectorizer.fit_transform(corpus_title)
X_body = body_vectorizer.fit_transform(corpus_body)
print(vectorizer.get_feature_names())
print(X_title.toarray().shape)

['assign', 'atom', 'base', 'caus', 'code', 'cradl', 'cran', 'date', 'declar', 'domnodeinsert', 'element', 'expand', 'faster', 'filter', 'fix', 'flicker', 'function', 'get', 'hoist', 'import', 'input', 'insid', 'issu', 'jqueri', 'name', 'newton', 'packag', 'panda', 'put', 'python3', 'react', 'return', 'row', 'select', 'specif', 'sql', 'statement', 'subqueri', 'troubl', 'type', 'use', 'valu', 'vector', 'work', 'wrap']
(100, 382)


In [30]:
clf = RidgeClassifier().fit(X_title, y_dict['javascript'])

In [33]:
clf.score(X_title, y_dict['javascript'])

1.0

In [26]:
df = df_only.copy(deep=True)
corpus_title = [' '.join(map(str, w)) for w in df['Title'].tolist()]
corpus_body = [' '.join(map(str, w)) for w in df['Body'].tolist()]
corpus_tags = [' '.join(map(str, w)) for w in df['Tags'].tolist()]

In [29]:
set(corpus_title)

{'sqlalchemi sum true',
 'make templat alia templat class',
 'truncat style tag data string',
 'problem na spline use data frame tempor interpol r',
 'sum json array item common key',
 'panda duplic row specif valu column chang column valu duplic',
 'creat function return list factor appli function valu vector r',
 'clean split follow string',
 'panda assign group number time bin',
 'guarante load complet store occur',
 'tri make simpl updat sql error ora 01779 cannot modifi column map non key preserv tabl',
 'django convert valu two clumn absolut valu',
 'use generic method paramet java',
 'javascript destructur object contain field special charact',
 'remov duplic row panda possibl group',
 'creat function r compar row',
 'autofil sourc keystor password import pkcs12 keystor',
 'fix angl mathemat triangl squar',
 'use groupbi calcul cum sum panda datafram',
 'plot creat custom hover label plot 3d scatter figur',
 'paramiko junip echo command execut',
 'send raw xml request soap servi