In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, hstack
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import coverage_error
from sklearn.svm import SVC
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.neural_network import MLPClassifier
import scipy.sparse
from sklearn.cluster import KMeans

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from bs4 import BeautifulSoup

#from sklearn.preprocessing import StandardScaler

from scipy.sparse import coo_matrix, hstack


In [117]:
#Dataset loading
dataset = pd.read_csv('Cleaned_Posts.csv', nrows = None, index_col=0)
dataset.shape

(49399, 10)

In [118]:
#Any np.nan ?
dataset.isnull().sum()

TText             0
TText_NEG         0
PText             0
TCode         10054
PCode          9627
TTitle            0
TTitle_NEG        0
Title             0
PTags             0
Tags              0
dtype: int64

In [119]:
#np.nan cleaning
dataset['TCode'] = dataset['TCode'].fillna('None')
dataset['PCode'] = dataset['PCode'].fillna('None')

In [120]:
#Any np.nan ?
dataset.isnull().sum()

TText         0
TText_NEG     0
PText         0
TCode         0
PCode         0
TTitle        0
TTitle_NEG    0
Title         0
PTags         0
Tags          0
dtype: int64

In [121]:
#Tags cleaning
dataset['PTags'] = dataset['Tags'].apply(lambda x: [tag.name for tag in BeautifulSoup(x, 'html.parser').find_all()])

In [122]:
df = pd.DataFrame(dataset['PTags'].tolist()).stack().value_counts()
df = df[df>50]
df.shape

(383,)

In [123]:
existintags = set(df.index)
existintags

{'ajax',
 'algorithm',
 'amazon-ec2',
 'amazon-s3',
 'amazon-web-services',
 'android',
 'android-fragments',
 'android-gradle',
 'android-intent',
 'android-layout',
 'android-recyclerview',
 'android-studio',
 'angular',
 'angular-cli',
 'angular-material',
 'angular5',
 'angularjs',
 'animation',
 'apache',
 'apache-kafka',
 'apache-spark',
 'apache-spark-sql',
 'api',
 'arraylist',
 'arrays',
 'asp.net',
 'asp.net-core',
 'asp.net-core-2.0',
 'asp.net-mvc',
 'asp.net-web-api',
 'assembly',
 'asynchronous',
 'audio',
 'authentication',
 'automation',
 'aws-lambda',
 'axios',
 'azure',
 'bash',
 'batch-file',
 'beautifulsoup',
 'boost',
 'bootstrap-4',
 'browser',
 'button',
 'c',
 'c#',
 'c++',
 'c++11',
 'caching',
 'canvas',
 'cassandra',
 'chart.js',
 'charts',
 'checkbox',
 'class',
 'cmake',
 'cmd',
 'codeigniter',
 'concurrency',
 'cookies',
 'cordova',
 'cors',
 'cron',
 'css',
 'css3',
 'csv',
 'curl',
 'd3.js',
 'database',
 'dataframe',
 'datagridview',
 'datatable',
 'dat

In [124]:
#Downsampling
dataset = dataset.sample(5000)
dataset.shape

(5000, 10)

In [125]:

class CustomLDA(BaseEstimator, TransformerMixin):

    def __init__(self, **params):        
        self.vect_text_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        self.vect_title_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        self.vect_code_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        
        self.lda_params = {'n_components':10, 'n_jobs':2, 'random_state':0}
        
        self.clf_params = {'ntopwords':5, 'ntopics':5}
        
        self.params = {**{'vect_text_'+k:v for k,v in self.vect_text_params.items()},
                       **{'vect_title_'+k:v for k,v in self.vect_title_params.items()},
                       **{'vect_code_'+k:v for k,v in self.vect_code_params.items()},
                       **{'lda_'+k:v for k,v in self.lda_params.items()},
                       **{'clf_'+k:v for k,v in self.clf_params.items()},
                       **params}
        
        self.update_params()

    def predict(self, X, y=None):
        ntopwords = self.clf_params['ntopwords']
        ntopics = self.clf_params['ntopics']
        W = self.transform(X)
        components = self.components_()
        feature_names = self.get_feature_names()

        toreturn = []
        
        for document in W:
            #select the most important topics
            docprediction = []
            for topic in document.argsort()[:-ntopics-1:-1]:
                tags = [feature_names[i] for i in components[topic].argsort()[:-ntopwords-1:-1]]
                docprediction.append(tags)
            toreturn.append(docprediction)
        return toreturn
    
    def fit(self, X, y=None):
        dftext = X['TText']
        dftitle = X['TTitle']
        dfcode = X['TCode']
        #Text preparation
        self.textcvect = CountVectorizer(tokenizer=None, vocabulary=None, **self.vect_text_params)
        text = self.textcvect.fit_transform(dftext)
        
        #Title preparation
        self.titlecvect = CountVectorizer(tokenizer=None, vocabulary=None, **self.vect_title_params)
        title = self.titlecvect.fit_transform(dftitle)
        
        #Code preparation
        self.codecvect = CountVectorizer(tokenizer=None, vocabulary=None, **self.vect_code_params)
        code = self.codecvect.fit_transform(dfcode)
        
        #LDA preparation
        self.lda = LatentDirichletAllocation(learning_method = 'batch', **self.lda_params)
        complete = scipy.sparse.hstack((text, title, code))
        self.lda.fit(complete)
        
        return self
    
    def transform(self, X, y=None):
        dftext = X['TText']
        dftitle = X['TTitle']
        dfcode = X['TCode']
        
        #Text preparation
        textvect = self.textcvect.transform(dftext)
        
        #Title preparation
        titlevect = self.titlecvect.transform(dftitle)
    
        #Code preparation
        codevect = self.codecvect.transform(dfcode)
        
        lda = self.lda.transform(scipy.sparse.hstack((textvect, titlevect, codevect)))
        
        return lda
    
    def get_feature_names(self):
        textfnames = self.textcvect.get_feature_names()
        titlefnames = self.titlecvect.get_feature_names()
        codefnames = self.codecvect.get_feature_names()
        toreturn = []
        toreturn.extend(textfnames)
        toreturn.extend(titlefnames)
        toreturn.extend(codefnames)
        
        return toreturn
    
    def components_(self):        
        return self.lda.components_
    
    def get_params(self, deep=True):
        return self.params
    
    def set_params(self, **params):
        self.params = {**self.params, **params}
        self.update_params()
        return self
        
    def update_params(self):
        
        self.vect_text_params = {k[10:]:v for k,v in self.params.items() if k.startswith('vect_text_')}
        self.vect_title_params = {k[11:]:v for k,v in self.params.items() if k.startswith('vect_title_')}
        self.vect_code_params = {k[10:]:v for k,v in self.params.items() if k.startswith('vect_code_')}
        
        self.lda_params = {k[4:]:v for k,v in self.params.items() if k.startswith('lda_')}
        
        self.clf_params = {k[4:]:v for k,v in self.params.items() if k.startswith('clf_')}

        return self

    def score(self, X=None, y=None):
        numcomponent = self.lda_params['n_components']
        self.nametopics()
        topicsscore = [1/len(x) if len(x) !=0 else 0 for x in self.topicsnamed.values()]
        return numcomponent * np.array(topicsscore).mean()
    
    def nametopics(self):
        self.topicsnamed = {}
        self.namedtopics = {}
        ntopwords = self.clf_params['ntopwords']
        ntopics = self.clf_params['ntopics']
        components = self.components_()
        feature_names = self.get_feature_names()
        
        for topicnum, topiccomposition in enumerate(components):
            tags = [feature_names[i] for i in topiccomposition.argsort()[:-ntopwords-1:-1]]
            self.topicsnamed[topicnum] = [tag for tag in tags if tag in existintags]
            for tag in tags:
                if tag in existintags:
                    if tag in self.namedtopics.keys():
                        temp = list(self.namedtopics[tag])
                        temp.append(topicnum)
                        self.namedtopics[tag] = set(temp)
                    else:
                        self.namedtopics[tag] = set([topicnum])
        
        return self


customLDA = CustomLDA()


In [126]:
params = {'clf_ntopics': 3,
 'clf_ntopwords': 3,
 'lda_n_components': 50,
 'lda_n_jobs': 2,
 'lda_random_state': 0,
 'vect_code_max_df': 1.0,
 'vect_code_max_features': 15000,
 'vect_code_min_df': 1,
 'vect_code_ngram_range': (1, 3),
 'vect_text_max_df': 1.0,
 'vect_text_max_features': 20000,
 'vect_text_min_df': 1,
 'vect_text_ngram_range': (1, 3),
 'vect_title_max_df': 1.0,
 'vect_title_max_features': 70000,
 'vect_title_min_df': 1,
 'vect_title_ngram_range': (1, 2)}

customLDA = CustomLDA(**params)
customLDA.fit(dataset[['TText', 'TTitle', 'TCode']])

CustomLDA(clf_ntopics=3, clf_ntopwords=3, lda_n_components=50, lda_n_jobs=2,
     lda_random_state=0, vect_code_max_df=1.0,
     vect_code_max_features=15000, vect_code_min_df=1,
     vect_code_ngram_range=(1, 3), vect_text_max_df=1.0,
     vect_text_max_features=20000, vect_text_min_df=1,
     vect_text_ngram_range=(1, 3), vect_title_max_df=1.0,
     vect_title_max_features=70000, vect_title_min_df=1,
     vect_title_ngram_range=(1, 2))

In [127]:
#[1/len(x) if len(x) !=0 else 0 for x in customLDA.topicsnamed.values()]
customLDA.score()

16.0

In [142]:
#gridsearch to optimize the parameters

params = {'clf_ntopics': 3,
 'clf_ntopwords': 3,
 'lda_n_components': 100,
 'lda_n_jobs': 2,
 'lda_random_state': 0,
 'vect_code_max_df': 0.8,
 'vect_code_max_features': 15000,
 'vect_code_min_df': 5,
 'vect_code_ngram_range': (1, 2),
 'vect_text_max_df': 0.8,
 'vect_text_max_features': 15000,
 'vect_text_min_df': 10,
 'vect_text_ngram_range': (1, 3),
 'vect_title_max_df': 0.8,
 'vect_title_max_features': 15000,
 'vect_title_min_df': 5,
 'vect_title_ngram_range': (1, 2)}

customLDA = CustomLDA(**params)

parameters = {'vect_title_min_df': [5], 'vect_title_max_df': [0.8,], 'vect_title_max_features': [15000, 20000], 'vect_code_ngram_range': [(1,2), (1,3)],}

clf = GridSearchCV(customLDA, parameters, cv=2, return_train_score=True, refit=True, verbose = 3)
clf.fit(dataset[['TText', 'TTitle', 'TCode']])

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5 
[CV]  vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5, score=42.5, total=  33.8s
[CV] vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   33.8s remaining:    0.0s


[CV]  vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5, score=38.33333333333333, total=  34.4s
[CV] vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=20000, vect_title_min_df=5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s


[CV]  vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=20000, vect_title_min_df=5, score=42.5, total=  33.5s
[CV] vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=20000, vect_title_min_df=5 
[CV]  vect_code_ngram_range=(1, 2), vect_title_max_df=0.8, vect_title_max_features=20000, vect_title_min_df=5, score=38.33333333333333, total=  35.5s
[CV] vect_code_ngram_range=(1, 3), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5 
[CV]  vect_code_ngram_range=(1, 3), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5, score=36.0, total=  37.2s
[CV] vect_code_ngram_range=(1, 3), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5 
[CV]  vect_code_ngram_range=(1, 3), vect_title_max_df=0.8, vect_title_max_features=15000, vect_title_min_df=5, score=38.5, total=  38.7s
[CV] vect_code_ngram_range=(1, 3), vect_title_max_df=0.8, vect_title_max_features=20000, vect_title_min_df=5 

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.8min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=CustomLDA(clf_ntopics=3, clf_ntopwords=3, lda_n_components=100, lda_n_jobs=2,
     lda_random_state=0, vect_code_max_df=0.8,
     vect_code_max_features=15000, vect_code_min_df=5,
     vect_code_ngram_range=(1, 2), vect_text_max_df=0.8,
     vect_text_max_features=15000, vect_text_min_df=10,
     vect_text_ngram_range=(1, 3), vect_title_max_df=1.0,
     vect_title_max_features=70000, vect_title_min_df=1,
     vect_title_ngram_range=(1, 2)),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect_title_min_df': [5], 'vect_title_max_df': [0.8], 'vect_title_max_features': [15000, 20000], 'vect_code_ngram_range': [(1, 2), (1, 3)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [143]:
clf.best_params_

{'vect_code_ngram_range': (1, 2),
 'vect_title_max_df': 0.8,
 'vect_title_max_features': 15000,
 'vect_title_min_df': 5}

In [144]:
clf.best_score_

40.416666666666664

In [150]:
dataset.head(10)

Unnamed: 0_level_0,TText,TText_NEG,PText,TCode,PCode,TTitle,TTitle_NEG,Title,PTags,Tags
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
48096039,set send cooki via ajax store browser ask logi...,set send cooki via ajax store browser ask_NEG ...,I can set and send cookie via ajax which is s...,,,set send cooki via ajax,set send cooki via ajax,Set and Send cookie via ajax,"[ajax, curl, cookies]",<ajax><curl><cookies>
48089460,use api sync tabl data need queri data need ca...,use api sync tabl data need queri data need ca...,For now I am using the api to sync the tables...,,,configur kylin synchron tabl period,configur kylin synchron tabl period,Can we configure Kylin to synchronize all the ...,[kylin],<kylin>
48091983,'m use codeignit php framework ion_auth authen...,'m use codeignit php framework ion_auth authen...,I'm using Codeigniter as my PHP framework and...,,,implement share databas multi tenant applic php,implement share databas multi tenant applic php,Implementing shared database for multi tenant ...,"[php, mysql, codeigniter, saas]",<php><mysql><codeigniter><saas>
48077316,tri run code octav written basic matlab one fu...,tri run code octav written basic matlab one fu...,I am trying to run a code in Octave written b...,error :' sylvester ' undefined near line column,error: 'sylvester' undefined near line 14 colu...,sylvest function octav,sylvest function octav,Sylvester function in Octave,"[matlab, octave]",<matlab><octave>
48092975,use laravel 's artisan command creat databas t...,use laravel 's artisan command creat databas t...,I am using laravel's artisan command to creat...,php artisan migrate table_1 table_1_table_1_si...,php artisan migrate table_1 table_1_table_1_si...,laravel migrat error creat tabl foreign key,laravel migrat error creat tabl foreign key,Laravel migration error on creating table with...,"[php, mysql, laravel]",<php><mysql><laravel>
48144183,'m tri find id object insid array object _id f...,'m tri find id object insid array object _id f...,I'm trying to find the id of an object inside...,var CardSchema = new mongoose . Schema ({ beNa...,var CardSchema = new mongoose.Schema({\n beNa...,find _id array object mongodb databas,find _id array object mongodb databas,Find by _id on an array of objects in mongodb ...,"[mongodb, mongoose, mongoose-schema]",<mongodb><mongoose><mongoose-schema>
48186237,use angular want bind complet json object sele...,use angular want bind complet json object sele...,I am using with in Angular 5. I want to bin...,"< select class ="" w3 - input dropDown form - c...","<select class=""w3-input dropDown form-control""...",two way data bind work select,two way data bind work_NEG select_NEG,Two way data binding not working for <select>,"[typescript, angular5]",<typescript><angular5>
48051220,make 3d game player rotat view point via mous ...,make 3d game player rotat view point via mous ...,I am making a 3D game in which the player can...,private static void changeRotation () { angle ...,private static void changeRotation(){\n ang...,3d rotat ratio via mous placement screen java,3d rotat ratio via mous placement screen java,3D rotation ratios via mouse placement on scre...,"[java, rotation, angle]",<java><3d><rotation><angle>
48168060,current tri write xml file use path `` system....,current tri write xml file use path `` system....,"I am currently trying to write an XML file, w...","< bean class ="" org . springframework . beans ...","<bean class=""org.springframework.beans.factory...",spring 2.0 xml propertyplaceholderconfigur sys...,spring 2.0 xml propertyplaceholderconfigur sys...,Spring 2.0 XML PropertyPlaceholderConfigurer S...,"[java, xml, spring, properties, system]",<java><xml><spring><properties><system>
48150752,android studio editor xml,android studio editor xml,and this is what is in the android studio edi...,"<? xml version ="" . "" encoding ="" utf - ""? >< ...","<?xml version=""1.0"" encoding=""utf-8""?>\n<Relat...",android 3.0.1 editor render devic,android 3.0.1 editor render_NEG devic_NEG,Android 3.0.1 editor not the same with what is...,"[android, xml, android-studio]",<android><xml><android-studio>


In [165]:
article = [48168060, 48150752]
samplepost = dataset[['TText', 'TTitle', 'TCode']].loc[article]
print(dataset["PTags"].loc[article])
np.array(clf.predict(samplepost))

Id
48168060    [java, xml, spring, properties, system]
48150752             [android, xml, android-studio]
Name: PTags, dtype: object


array([[['use', 'like', 'queri'],
        ['folder', 'creat', 'way'],
        ['file', 'project', 'build']],

       [['android', 'id', 'app'],
        ['script', 'li', 'html'],
        ['char', 'unsigned', 'unsigned char']]],
      dtype='<U13')

In [197]:
dataset[['TText', 'TTitle', 'TCode', 'PTags']]

Unnamed: 0_level_0,TText,TTitle,TCode,PTags
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48053160,look around n't found someth help arduino uno ...,ca n't get data mysql databas arduino variabl,"<? php $ db_name ="" aquarium ""; $ user ="" root...","[php, mysql, http, get, arduino]"
48164249,environ python 3.6 ananconda 5.0.1,ca n't download newsgroup data via python code,,[scikit-learn]
48116439,'s scenario got two entiti three type modul ar...,unabl rest put oper onetomani relat spring boot,@ Entity @ Data class Project {@ NotEmpty priv...,"[spring, spring-data-rest, spring-restcontroll..."
48166535,whmcs version drop contain list client get cli...,databas capsul manag work,jQuery ( document ). ready ( function ($ ){ $(...,"[ajax, whmcs]"
48143597,question regard memori access applic run compu...,access memori app without run admin privileg,,"[memory, memory-management, access, memory-acc..."
48185162,given pseudo data -train data implement random...,groupbi model random forest algorithm get sing...,"rf = randomForest ( Default ~. , data = traind...","[r, machine-learning, random-forest, data-scie..."
48170117,vba experi hope way without use macro program ...,record api ticker data excel tabl,,"[excel, api, store, record, ticker]"
48178451,'m work exercis 14.2-4 clrs intro algorithm 3e...,asymptot run time print key red-black tree fal...,"RB - ENUMERATE ( x , a , b ) T = red - black t...","[algorithm, time-complexity, binary-tree, bina..."
48077459,code json data arraylist 'm take one parent he...,pass one json two differ data expand listview,timez _listDataHeader _listdataChild LinearLay...,"[android, expandablelistview]"
48151565,'m tri display imag folder imag doe exist want...,skip imag doe n't exist carousel,"< div class ="" w3 - content w3 - section "" sty...","[javascript, html]"


In [110]:
kmeans = KMeans(n_clusters=500, random_state=0).fit(X)

In [115]:
mask = kmeans.labels_ == 45
popularitydict = {}
for tags in list(dataset['PTags'][mask]):
    for tag in tags:
        if tag in popularitydict.keys():
            popularitydict[tag] += 1
        else:
            popularitydict[tag] = 1
df = pd.DataFrame(list(popularitydict.items()))
df.columns = ('Tag', 'Count')
df.set_index('Tag', inplace = True)
df.sort_values('Count', ascending=False)

Unnamed: 0_level_0,Count
Tag,Unnamed: 1_level_1
python,3
android,2
javascript,2
email,2
if-statement,1
r,1
physics,1
gmail,1
automation,1
mongodb,1


In [112]:
for index, text in dataset[['PText', 'Title']][mask].iterrows():
    print(text['Title'])
    print()
    print(text['PText'])
    print('\n\n')

Twilio - Using taskrouter.js and reservation.conference() how to not beep and end conference

 I am using task router to assign an incoming call task to a worker. When the worker gets the reservation I am starting a conference like this: There is not much documentation for how to handle a conference with taskrouter.js, but this seems to work to start the conference. There are 2 problems I am having: I can't stop the 'entering conference' beep to not play When both the worker and participant exit the conference the conference is not actually ended and therefore not putting the worker into the after work activity state. Any help would be appreciated.



How to set multiple scroll views size to change dynamically relative to each other?

 I have two scroll views in a vertical linear layout.
I want them to be relative to each other so that they fill the entire linear layout and compensate if one cant cover half the screen. Lets call that scroll views TOP and BOT.
If the screen can display 

In [75]:
CV= CountVectorizer(tokenizer=None, vocabulary=None)
bw = CV.fit_transform(dataset['TText'])
LDA = LatentDirichletAllocation(learning_method = 'batch')
mat = LDA.fit(bw)

In [77]:
LDA.components_.shape

(10, 3886)

In [81]:
len(CV.get_feature_names())

3886

In [87]:
temp = CV.transform(dataset['TText'].iloc[[4]])
temp.shape

(1, 3886)

In [90]:
ldatex = LDA.transform(temp)
ldatex.shape

(1, 10)

In [91]:
LDA.inverse_transform(ldatex)

AttributeError: 'LatentDirichletAllocation' object has no attribute 'inverse_transform'

In [158]:

class CustomNMF(BaseEstimator, TransformerMixin):

    def __init__(self, **params):        
        self.vect_text_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        self.vect_title_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        self.vect_code_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        
        self.nmf_params = {'n_components':10, 'random_state':0, 'init':'nndsvd', 'alpha':0, 'l1_ratio':0}
        
        self.clf_params = {'ntopwords':5, 'ntopics':5}
        
        self.params = {**{'vect_text_'+k:v for k,v in self.vect_text_params.items()},
                       **{'vect_title_'+k:v for k,v in self.vect_title_params.items()},
                       **{'vect_code_'+k:v for k,v in self.vect_code_params.items()},
                       **{'nmf_'+k:v for k,v in self.nmf_params.items()},
                       **{'clf_'+k:v for k,v in self.clf_params.items()},
                       **params}
        
        self.update_params()

    def predict(self, X, y=None):
        ntopwords = self.clf_params['ntopwords']
        ntopics = self.clf_params['ntopics']
        W = self.transform(X)
        components = self.components_()
        feature_names = self.get_feature_names()

        toreturn = []
        
        for document in W:
            #select the most important topics
            docprediction = []
            for topic in document.argsort()[:-ntopics-1:-1]:
                tags = [feature_names[i] for i in components[topic].argsort()[:-ntopwords-1:-1]]
                docprediction.append(tags)
            toreturn.append(docprediction)
        return toreturn
    
    def fit(self, X, y=None):
        dftext = X['TText']
        dftitle = X['TTitle']
        dfcode = X['TCode']
        #Text preparation
        self.textcvect = TfidfVectorizer(tokenizer=None, vocabulary=None, **self.vect_text_params)
        text = self.textcvect.fit_transform(dftext)
        
        #Title preparation
        self.titlecvect = TfidfVectorizer(tokenizer=None, vocabulary=None, **self.vect_title_params)
        title = self.titlecvect.fit_transform(dftitle)
        
        #Code preparation
        self.codecvect = TfidfVectorizer(tokenizer=None, vocabulary=None, **self.vect_code_params)
        code = self.codecvect.fit_transform(dfcode)
        
        #LDA preparation
        self.nmf = NMF(**self.nmf_params)
        complete = scipy.sparse.hstack((text, title, code))
        self.nmf.fit(complete)
        
        return self
    
    def transform(self, X, y=None):
        dftext = X['TText']
        dftitle = X['TTitle']
        dfcode = X['TCode']
        
        #Text preparation
        textvect = self.textcvect.transform(dftext)
        
        #Title preparation
        titlevect = self.titlecvect.transform(dftitle)
    
        #Code preparation
        codevect = self.codecvect.transform(dfcode)
        
        nmf = self.nmf.transform(scipy.sparse.hstack((textvect, titlevect, codevect)))
        
        return nmf
    
    def get_feature_names(self):
        textfnames = self.textcvect.get_feature_names()
        titlefnames = self.titlecvect.get_feature_names()
        codefnames = self.codecvect.get_feature_names()
        toreturn = []
        toreturn.extend(textfnames)
        toreturn.extend(titlefnames)
        toreturn.extend(codefnames)
        
        return toreturn
    
    def components_(self):        
        return self.nmf.components_
    
    def get_params(self, deep=True):
        return self.params
    
    def set_params(self, **params):
        self.params = {**self.params, **params}
        self.update_params()
        return self
        
    def update_params(self):
        
        self.vect_text_params = {k[10:]:v for k,v in self.params.items() if k.startswith('vect_text_')}
        self.vect_title_params = {k[11:]:v for k,v in self.params.items() if k.startswith('vect_title_')}
        self.vect_code_params = {k[10:]:v for k,v in self.params.items() if k.startswith('vect_code_')}
        
        self.nmf_params = {k[4:]:v for k,v in self.params.items() if k.startswith('nmf_')}
        
        self.clf_params = {k[4:]:v for k,v in self.params.items() if k.startswith('clf_')}

        return self

    def score(self, X=None, y=None):
        numcomponent = self.nmf_params['n_components']
        self.nametopics()
        topicsscore = [1/len(x) if len(x) !=0 else 0 for x in self.topicsnamed.values()]
        return numcomponent * np.array(topicsscore).mean()
    
    def nametopics(self):
        self.topicsnamed = {}
        self.namedtopics = {}
        ntopwords = self.clf_params['ntopwords']
        ntopics = self.clf_params['ntopics']
        components = self.components_()
        feature_names = self.get_feature_names()
        
        for topicnum, topiccomposition in enumerate(components):
            tags = [feature_names[i] for i in topiccomposition.argsort()[:-ntopwords-1:-1]]
            self.topicsnamed[topicnum] = [tag for tag in tags if tag in existintags]
            for tag in tags:
                if tag in existintags:
                    if tag in self.namedtopics.keys():
                        temp = list(self.namedtopics[tag])
                        temp.append(topicnum)
                        self.namedtopics[tag] = set(temp)
                    else:
                        self.namedtopics[tag] = set([topicnum])
        
        return self


customNMF = CustomNMF()


In [159]:
params = {'clf_ntopics': 3,
 'clf_ntopwords': 3,
 'nmf_n_components': 100,
 'nmf_random_state': 0,
 'vect_code_max_df': 0.8,
 'vect_code_max_features': 15000,
 'vect_code_min_df': 5,
 'vect_code_ngram_range': (1, 2),
 'vect_text_max_df': 0.8,
 'vect_text_max_features': 15000,
 'vect_text_min_df': 10,
 'vect_text_ngram_range': (1, 3),
 'vect_title_max_df': 0.8,
 'vect_title_max_features': 15000,
 'vect_title_min_df': 5,
 'vect_title_ngram_range': (1, 2)}

customNMF = CustomNMF(**params)
customNMF.fit(dataset[['TText', 'TTitle', 'TCode']])

CustomNMF(clf_ntopics=3, clf_ntopwords=3, nmf_alpha=0, nmf_init='nndsvd',
     nmf_l1_ratio=0, nmf_n_components=100, nmf_random_state=0,
     vect_code_max_df=0.8, vect_code_max_features=15000,
     vect_code_min_df=5, vect_code_ngram_range=(1, 2),
     vect_text_max_df=0.8, vect_text_max_features=15000,
     vect_text_min_df=10, vect_text_ngram_range=(1, 3),
     vect_title_max_df=0.8, vect_title_max_features=15000,
     vect_title_min_df=5, vect_title_ngram_range=(1, 2))

In [164]:
customNMF.score()
#customNMF.namedtopics

23.333333333333332

In [168]:
article = [48168060, 48150752]
samplepost = dataset[['TText', 'TTitle', 'TCode']].loc[article]
print(dataset["PTags"].loc[article])
np.array(customNMF.predict(samplepost))

Id
48168060    [java, xml, spring, properties, system]
48150752             [android, xml, android-studio]
Name: PTags, dtype: object


array([[['spring', 'spring', 'boot', 'spring boot', 'boot'],
        ['user', 'user', 'user', 'login', 'authent'],
        ['field', 'field', 'field', 'input field', 'xml']],

       [['android', 'android', 'devic', 'android studio', 'devic'],
        ['android', 'layout', 'xml', 'layout', 'layout_width'],
        ['visual', 'studio', 'visual studio', 'visual', 'studio']]],
      dtype='<U14')

In [167]:
customNMF.set_params(clf_ntopwords=5)

CustomNMF(clf_ntopics=3, clf_ntopwords=5, nmf_alpha=0, nmf_init='nndsvd',
     nmf_l1_ratio=0, nmf_n_components=100, nmf_random_state=0,
     vect_code_max_df=0.8, vect_code_max_features=15000,
     vect_code_min_df=5, vect_code_ngram_range=(1, 2),
     vect_text_max_df=0.8, vect_text_max_features=15000,
     vect_text_min_df=10, vect_text_ngram_range=(1, 3),
     vect_title_max_df=0.8, vect_title_max_features=15000,
     vect_title_min_df=5, vect_title_ngram_range=(1, 2))