In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, hstack
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.metrics import coverage_error
from sklearn.svm import SVC
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.neural_network import MLPClassifier
import scipy.sparse
from sklearn.cluster import KMeans

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from bs4 import BeautifulSoup

#from sklearn.preprocessing import StandardScaler

from scipy.sparse import coo_matrix, hstack


In [155]:
#Dataset loading
dataset = pd.read_csv('Cleaned_Posts.csv', nrows = None, index_col=0)
dataset.shape

(49399, 10)

In [156]:
#Any np.nan ?
dataset.isnull().sum()

TText             0
TText_NEG         0
PText             0
TCode         10054
PCode          9627
TTitle            0
TTitle_NEG        0
Title             0
PTags             0
Tags              0
dtype: int64

In [157]:
#np.nan cleaning
dataset['TCode'] = dataset['TCode'].fillna('None')
dataset['PCode'] = dataset['PCode'].fillna('None')

In [158]:
#Any np.nan ?
dataset.isnull().sum()

TText         0
TText_NEG     0
PText         0
TCode         0
PCode         0
TTitle        0
TTitle_NEG    0
Title         0
PTags         0
Tags          0
dtype: int64

In [165]:
#Downsampling
dataset = dataset.sample(10000)
dataset.shape

(10000, 10)

In [166]:
#Tags cleaning
dataset['PTags'] = dataset['Tags'].apply(lambda x: [tag.name for tag in BeautifulSoup(x, 'html.parser').find_all()])

In [168]:
df = pd.DataFrame(dataset['PTags'].tolist()).stack().value_counts()
df[df>50]

javascript           1091
python                873
java                  747
android               631
c#                    545
php                   533
html                  444
jquery                347
ios                   330
angular               324
css                   318
node.js               294
c++                   272
mysql                 253
r                     231
sql                   227
swift                 226
reactjs               220
python-3.x            193
sql-server            155
firebase              149
json                  146
django                146
laravel               141
arrays                139
angularjs             138
spring                135
typescript            133
excel                 133
asp.net               129
                     ... 
tensorflow             88
postgresql             87
asp.net-mvc            86
xml                    82
excel-vba              82
linux                  79
ruby                   78
windows     

In [162]:

class CustomLDA(BaseEstimator, TransformerMixin):

    def __init__(self, **params):        
        self.vect_text_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        self.vect_title_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        self.vect_code_params = {'ngram_range':(1,1), 'max_df':1., 'min_df':1, 'max_features':None}
        
        self.lda_params = {'n_components':10, 'n_jobs':2, 'random_state':0}
        
        self.clf_params = {'ntopwords':5, 'ntopics':5}
        
        self.params = {**{'vect_text_'+k:v for k,v in self.vect_text_params.items()},
                       **{'vect_title_'+k:v for k,v in self.vect_title_params.items()},
                       **{'vect_code_'+k:v for k,v in self.vect_code_params.items()},
                       **{'lda_'+k:v for k,v in self.lda_params.items()},
                       **{'clf_'+k:v for k,v in self.clf_params.items()},
                       **params}
        
        self.update_params()

    def predict(self, X, y=None):
        ntopwords = self.clf_params['ntopwords']
        ntopics = self.clf_params['ntopics']
        W = self.transform(X)
        components = self.components_()
        feature_names = self.get_feature_names()

        toreturn = []
        
        for document in W:
            #select the most important topics
            docprediction = []
            for topic in document.argsort()[:-ntopics-1:-1]:
                tags = [feature_names[i] for i in components[topic].argsort()[:-ntopwords-1:-1]]
                docprediction.append(tags)
            toreturn.append(docprediction)
        return toreturn
    
    def fit(self, X, y=None):
        dftext = X['TText']
        dftitle = X['TTitle']
        dfcode = X['TCode']
        #Text preparation
        self.textcvect = CountVectorizer(tokenizer=None, vocabulary=None, **self.vect_text_params)
        text = self.textcvect.fit_transform(dftext)
        
        #Title preparation
        self.titlecvect = CountVectorizer(tokenizer=None, vocabulary=None, **self.vect_title_params)
        title = self.titlecvect.fit_transform(dftitle)
        
        #Code preparation
        self.codecvect = CountVectorizer(tokenizer=None, vocabulary=None, **self.vect_code_params)
        code = self.codecvect.fit_transform(dfcode)
        
        #LDA preparation
        self.lda = LatentDirichletAllocation(learning_method = 'batch', **self.lda_params)
        complete = scipy.sparse.hstack((text, title, code))
        self.lda.fit(complete)
        
        return self
    
    def transform(self, X, y=None):
        dftext = X['TText']
        dftitle = X['TTitle']
        dfcode = X['TCode']
        
        #Text preparation
        textvect = self.textcvect.transform(dftext)
        
        #Title preparation
        titlevect = self.titlecvect.transform(dftitle)
    
        #Code preparation
        codevect = self.codecvect.transform(dfcode)
        
        lda = self.lda.transform(scipy.sparse.hstack((textvect, titlevect, codevect)))
        
        return lda
    
    def get_feature_names(self):
        textfnames = self.textcvect.get_feature_names()
        titlefnames = self.titlecvect.get_feature_names()
        codefnames = self.codecvect.get_feature_names()
        toreturn = []
        toreturn.extend(textfnames)
        toreturn.extend(titlefnames)
        toreturn.extend(codefnames)
        
        return toreturn
    
    def components_(self):        
        return self.lda.components_
    
    def get_params(self, deep=True):
        return self.params
    
    def set_params(self, **params):
        self.params = {**self.params, **params}
        self.update_params()
        return self
        
    def update_params(self):
        
        self.vect_text_params = {k[10:]:v for k,v in self.params.items() if k.startswith('vect_text_')}
        self.vect_title_params = {k[11:]:v for k,v in self.params.items() if k.startswith('vect_title_')}
        self.vect_code_params = {k[10:]:v for k,v in self.params.items() if k.startswith('vect_code_')}
        
        self.lda_params = {k[4:]:v for k,v in self.params.items() if k.startswith('lda_')}
        
        self.clf_params = {k[4:]:v for k,v in self.params.items() if k.startswith('clf_')}

        return self


customLDA = CustomLDA()


In [189]:
params = {'clf_ntopics': 3,
 'clf_ntopwords': 3,
 'lda_n_components': 100,
 'lda_n_jobs': 2,
 'lda_random_state': 0,
 'vect_code_max_df': 1.0,
 'vect_code_max_features': 15000,
 'vect_code_min_df': 1,
 'vect_code_ngram_range': (1, 3),
 'vect_text_max_df': 1.0,
 'vect_text_max_features': 20000,
 'vect_text_min_df': 1,
 'vect_text_ngram_range': (1, 3),
 'vect_title_max_df': 1.0,
 'vect_title_max_features': 70000,
 'vect_title_min_df': 1,
 'vect_title_ngram_range': (1, 2)}

customLDA = CustomLDA(**params)
customLDA.fit(dataset[['TText', 'TTitle', 'TCode']])

CustomLDA(clf_ntopics=3, clf_ntopwords=3, lda_n_components=100, lda_n_jobs=2,
     lda_random_state=0, vect_code_max_df=1.0,
     vect_code_max_features=15000, vect_code_min_df=1,
     vect_code_ngram_range=(1, 3), vect_text_max_df=1.0,
     vect_text_max_features=20000, vect_text_min_df=1,
     vect_text_ngram_range=(1, 3), vect_title_max_df=1.0,
     vect_title_max_features=70000, vect_title_min_df=1,
     vect_title_ngram_range=(1, 2))

In [199]:
article = [48118909]
samplepost = dataset[['TText', 'TTitle', 'TCode']].loc[article]

np.array(customLDA.predict(samplepost))

array([[['div', 'class', 'div div'],
        ['class', 'div', 'li'],
        ['like', 'list', 'use']]],
      dtype='<U7')

In [200]:
print(dataset["PTags"].loc[article])

Id
48118909    [html, css, hover]
Name: PTags, dtype: object


In [201]:
params = {'clf_ntopics': 3,
 'clf_ntopwords': 3,
 'lda_n_components': 100,
 'lda_n_jobs': 2,
 'lda_random_state': 0,
 'vect_code_max_df': 1.0,
 'vect_code_max_features': 15000,
 'vect_code_min_df': 1,
 'vect_code_ngram_range': (1, 3),
 'vect_text_max_df': 1.0,
 'vect_text_max_features': 20000,
 'vect_text_min_df': 1,
 'vect_text_ngram_range': (1, 3),
 'vect_title_max_df': 1.0,
 'vect_title_max_features': 70000,
 'vect_title_min_df': 1,
 'vect_title_ngram_range': (1, 2)}

{'clf_ntopics': 3,
 'clf_ntopwords': 3,
 'lda_n_components': 100,
 'lda_n_jobs': 2,
 'lda_random_state': 0,
 'vect_code_max_df': 1.0,
 'vect_code_max_features': 15000,
 'vect_code_min_df': 1,
 'vect_code_ngram_range': (1, 3),
 'vect_text_max_df': 1.0,
 'vect_text_max_features': 20000,
 'vect_text_min_df': 1,
 'vect_text_ngram_range': (1, 3),
 'vect_title_max_df': 1.0,
 'vect_title_max_features': 70000,
 'vect_title_min_df': 1,
 'vect_title_ngram_range': (1, 2)}

In [197]:
dataset[['TText', 'TTitle', 'TCode', 'PTags']]

Unnamed: 0_level_0,TText,TTitle,TCode,PTags
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48053160,look around n't found someth help arduino uno ...,ca n't get data mysql databas arduino variabl,"<? php $ db_name ="" aquarium ""; $ user ="" root...","[php, mysql, http, get, arduino]"
48164249,environ python 3.6 ananconda 5.0.1,ca n't download newsgroup data via python code,,[scikit-learn]
48116439,'s scenario got two entiti three type modul ar...,unabl rest put oper onetomani relat spring boot,@ Entity @ Data class Project {@ NotEmpty priv...,"[spring, spring-data-rest, spring-restcontroll..."
48166535,whmcs version drop contain list client get cli...,databas capsul manag work,jQuery ( document ). ready ( function ($ ){ $(...,"[ajax, whmcs]"
48143597,question regard memori access applic run compu...,access memori app without run admin privileg,,"[memory, memory-management, access, memory-acc..."
48185162,given pseudo data -train data implement random...,groupbi model random forest algorithm get sing...,"rf = randomForest ( Default ~. , data = traind...","[r, machine-learning, random-forest, data-scie..."
48170117,vba experi hope way without use macro program ...,record api ticker data excel tabl,,"[excel, api, store, record, ticker]"
48178451,'m work exercis 14.2-4 clrs intro algorithm 3e...,asymptot run time print key red-black tree fal...,"RB - ENUMERATE ( x , a , b ) T = red - black t...","[algorithm, time-complexity, binary-tree, bina..."
48077459,code json data arraylist 'm take one parent he...,pass one json two differ data expand listview,timez _listDataHeader _listdataChild LinearLay...,"[android, expandablelistview]"
48151565,'m tri display imag folder imag doe exist want...,skip imag doe n't exist carousel,"< div class ="" w3 - content w3 - section "" sty...","[javascript, html]"


In [110]:
kmeans = KMeans(n_clusters=500, random_state=0).fit(X)

In [115]:
mask = kmeans.labels_ == 45
popularitydict = {}
for tags in list(dataset['PTags'][mask]):
    for tag in tags:
        if tag in popularitydict.keys():
            popularitydict[tag] += 1
        else:
            popularitydict[tag] = 1
df = pd.DataFrame(list(popularitydict.items()))
df.columns = ('Tag', 'Count')
df.set_index('Tag', inplace = True)
df.sort_values('Count', ascending=False)

Unnamed: 0_level_0,Count
Tag,Unnamed: 1_level_1
python,3
android,2
javascript,2
email,2
if-statement,1
r,1
physics,1
gmail,1
automation,1
mongodb,1


In [112]:
for index, text in dataset[['PText', 'Title']][mask].iterrows():
    print(text['Title'])
    print()
    print(text['PText'])
    print('\n\n')

Twilio - Using taskrouter.js and reservation.conference() how to not beep and end conference

 I am using task router to assign an incoming call task to a worker. When the worker gets the reservation I am starting a conference like this: There is not much documentation for how to handle a conference with taskrouter.js, but this seems to work to start the conference. There are 2 problems I am having: I can't stop the 'entering conference' beep to not play When both the worker and participant exit the conference the conference is not actually ended and therefore not putting the worker into the after work activity state. Any help would be appreciated.



How to set multiple scroll views size to change dynamically relative to each other?

 I have two scroll views in a vertical linear layout.
I want them to be relative to each other so that they fill the entire linear layout and compensate if one cant cover half the screen. Lets call that scroll views TOP and BOT.
If the screen can display 

In [75]:
CV= CountVectorizer(tokenizer=None, vocabulary=None)
bw = CV.fit_transform(dataset['TText'])
LDA = LatentDirichletAllocation(learning_method = 'batch')
mat = LDA.fit(bw)

In [77]:
LDA.components_.shape

(10, 3886)

In [81]:
len(CV.get_feature_names())

3886

In [87]:
temp = CV.transform(dataset['TText'].iloc[[4]])
temp.shape

(1, 3886)

In [90]:
ldatex = LDA.transform(temp)
ldatex.shape

(1, 10)

In [91]:
LDA.inverse_transform(ldatex)

AttributeError: 'LatentDirichletAllocation' object has no attribute 'inverse_transform'