In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Load the Stack Overflow questions; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer="stackoverflow.csv", index_col=0)
df.head(n=5)

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [4]:
# Full text of the first question.
df["Text"].iat[0]

'aspnet site maps has anyone got experience creating sqlbased aspnet sitemap providersi have got the default xml file websitemap working properly with my menu and sitemappath controls but i will need a way for the users of my site to create and modify pages dynamicallyi need to tie page viewing permissions into the standard aspnet membership system as well'

In [5]:
# Raw Tags value of the first question.
df["Tags"].iat[0]

"['sql', 'asp.net']"

In [6]:
# Positional access to the first row / first column.
df.iat[0, 0]

'aspnet site maps has anyone got experience creating sqlbased aspnet sitemap providersi have got the default xml file websitemap working properly with my menu and sitemappath controls but i will need a way for the users of my site to create and modify pages dynamicallyi need to tie page viewing permissions into the standard aspnet membership system as well'

In [7]:
# Each Tags entry is still a str at this point and has to be parsed into a list.
type(df["Tags"].iloc[0])

str

In [8]:
import ast

In [9]:
# Parse the first stringified tag list with ast.literal_eval.
ast.literal_eval(df["Tags"].iloc[0])

['sql', 'asp.net']

In [10]:
# Type of the parsed value.
type(ast.literal_eval(df["Tags"].iloc[0]))

list

In [11]:
# Same parse of the first row's tag string, shown once more.
ast.literal_eval(df["Tags"].iat[0])

['sql', 'asp.net']

In [12]:
# Parse the stringified tag lists (e.g. "['sql', 'asp.net']") into Python lists.
# The isinstance guard keeps this cell idempotent: running it a second time,
# when the column already holds lists, would otherwise make ast.literal_eval
# raise ValueError because it only accepts strings (or AST nodes).
df["Tags"] = df["Tags"].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)

In [13]:
# Each Tags entry is now a list.
type(df["Tags"].iloc[0])

list

#### MultiLabelBinarizer usage example

In [14]:
# Illustrative MultiLabelBinarizer snippet, kept commented out: it is not
# executed, and `df_MLB` is not defined anywhere in the visible notebook.
# # Import MultiLabelBinarizer
# from sklearn.preprocessing import MultiLabelBinarizer
# # Instantiate MultiLabelBinarizer
# mlb = MultiLabelBinarizer()
# # Encode the multi-label data in MLB format
# genre_mlb = mlb.fit_transform(df_MLB['genre'])
# print(genre_mlb)
# print(mlb.classes_)

In [15]:
# Encode every row's tag list as a binary indicator row, one column per tag.
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df["Tags"])
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
# Put the first row's raw tags next to the learned classes and its encoded vector.
classes = multilabel.classes_
print(df["Tags"].iat[0])
print("classes:", classes)
print(y[0])
print(f"Total class: {len(classes)}")

['sql', 'asp.net']
classes: ['.net' 'android' 'asp.net' 'c' 'c#' 'c++' 'css' 'html' 'ios' 'iphone'
 'java' 'javascript' 'jquery' 'mysql' 'objective-c' 'php' 'python' 'ruby'
 'ruby-on-rails' 'sql']
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
Total class: 20


In [17]:
# Indicator matrix as a labelled table: one column per tag class.
pd.DataFrame(data=y, columns=classes)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [18]:
# TF-IDF over word-level 1- to 3-grams (unigrams, bigrams and trigrams), capped
# at 1000 features, with English stop words removed.
# NOTE(review): the vectorizer is fitted on every row of df["Text"], before the
# train/test split made later in the notebook, so its statistics also reflect
# the rows that end up in the test set. Consider fitting on training texts only.
tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    stop_words="english",
    max_features=1000,
)
X = tfidf.fit_transform(df["Text"])

In [19]:
# TF-IDF feature matrix produced by tfidf.fit_transform(df["Text"]).
X

<48976x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1239765 stored elements in Compressed Sparse Row format>

In [20]:
tfidf.vocabulary_
# vocabulary_ maps each term (a word or an n-gram) to its column index in X;
# the integers are feature positions, not frequencies.

{'aspnet': 75,
 'site': 801,
 'got': 392,
 'creating': 221,
 'default': 239,
 'xml': 998,
 'file': 347,
 'working': 988,
 'properly': 675,
 'menu': 554,
 'need': 576,
 'way': 975,
 'users': 942,
 'create': 217,
 'modify': 563,
 'pages': 618,
 'page': 617,
 'standard': 827,
 'adding': 34,
 'functionality': 374,
 'net': 580,
 'applications': 61,
 'little': 513,
 'game': 376,
 'written': 995,
 'uses': 943,
 'database': 229,
 'wanted': 973,
 'implement': 429,
 'function': 373,
 'mean': 550,
 'interface': 458,
 'class': 160,
 'implements': 433,
 'public': 683,
 'contains': 200,
 'called': 131,
 'make': 538,
 'thing': 886,
 'like': 500,
 'source': 812,
 'code': 166,
 'compile': 179,
 'use': 937,
 'just': 477,
 'add': 32,
 'tell': 878,
 'application': 60,
 'assembly': 76,
 'means': 551,
 'possible': 649,
 'new': 582,
 'language': 485,
 'extra': 330,
 'write': 993,
 'script': 763,
 'public class': 684,
 'source code': 813,
 'nested': 579,
 'classes': 161,
 'case': 137,
 'collection': 167,
 'us

In [21]:
# Features and labels need the same number of rows.
(X.shape, y.shape)

((48976, 1000), (48976, 20))

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Sizes of the train and test feature matrices.
(X_train.shape, X_test.shape)

((39180, 1000), (9796, 1000))

#### Model: Logistic regression

In [24]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score as js

In [25]:
# Base binary classifier that is later wrapped in OneVsRestClassifier.
lr = LogisticRegression(
    solver="lbfgs",
)

In [26]:
def jaccard_score(y_true, y_pred, zero_division=1.0):
    """Mean per-sample Jaccard similarity between two label matrices, in percent.

    Each row is treated as a set of labels: the row score is
    |intersection| / |union|, computed as sum(min) / sum(max) over the columns.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Binary indicator matrices of the true and the predicted labels.
    zero_division : float, default=1.0
        Score given to a row whose union is empty, i.e. there are neither true
        nor predicted labels. Without it the 0/0 division yields NaN and turns
        the whole mean into NaN. Two empty label sets agree exactly, hence the
        default of 1.0; pass 0.0 to count such rows as misses.

    Returns
    -------
    float
        Average of the per-row scores multiplied by 100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)
    # Pre-fill with the fallback and divide only where the union is non-empty,
    # so rows with an empty union keep `zero_division` instead of becoming NaN.
    per_sample = np.full(union.shape, zero_division, dtype=float)
    np.divide(intersection, union, out=per_sample, where=union > 0)
    return per_sample.mean() * 100

In [27]:
# Wrap the logistic regression one-vs-rest, fit it, then print the fitted estimator.
clf = OneVsRestClassifier(estimator=lr)
clf.fit(X_train, y_train)
print(clf)

OneVsRestClassifier(estimator=LogisticRegression())


In [28]:
# Predict label indicator rows for the held-out test features.
y_pred = clf.predict(X_test)

In [29]:
# Score the current predictions with the notebook's own jaccard_score helper.
jaccard_score(y_true=y_test, y_pred=y_pred)

49.19014563767524

#### Model: Linear SVM

In [30]:
from sklearn.svm import LinearSVC

# L1-penalised linear SVM with dual=False, wrapped one-vs-rest.
# Note: this rebinds `clf`, replacing the logistic-regression model above.
svm = LinearSVC(penalty="l1", C=1.5, dual=False)
clf = OneVsRestClassifier(estimator=svm)

In [31]:
# Fit the one-vs-rest SVM and print the fitted estimator.
clf.fit(X_train, y_train)
print(clf)

OneVsRestClassifier(estimator=LinearSVC(C=1.5, dual=False, penalty='l1'))


In [32]:
import pickle

# Evaluate the SVM on the held-out split.
y_pred = clf.predict(X_test)

print("jaccard score: ",jaccard_score(y_test, y_pred))

# Try the fitted vectorizer and classifier on one hand-written question.
x = ['how to write ml code in python and java i have data but do not know how to do it']

xt = tfidf.transform(x)

prediction=clf.predict(xt)
print("Prediction: ",prediction)

# Reuse `prediction` rather than calling clf.predict(xt) a second time.
print("Found classes: ",multilabel.inverse_transform(prediction))

# Persist the fitted artifacts. The vectorizer file must hold `tfidf`: the
# original cell dumped `clf` into both files, so the saved "tfidf" pickle was
# really a second copy of the classifier. `with` closes each file handle.
# NOTE(review): `multilabel` is needed to decode predictions back to tag names
# and is not saved here.
with open('svm_multilabel.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('tfidf-multilabel.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

jaccard score:  53.34269089424255
Prediction:  [[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0]]
Found classes:  [('java', 'python')]


In [33]:
# Second hand-written question, used as input for the prediction cell below.
x = ['how to write ml code in ruby and php i have data but do not know how to do it']

In [34]:
# Vectorize the sample with the already-fitted TF-IDF, then predict its tags.
xt = tfidf.transform(x)
prediction = clf.predict(xt)

# Raw indicator row first, then the decoded tag names.
print("Prediction: ", prediction)
print("Found classes: ", multilabel.inverse_transform(prediction))

Prediction:  [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0]]
Found classes:  [('php', 'ruby')]
