In [0]:
import pandas as pd
import re
import random
from ast import literal_eval
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score


In [171]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
url = 'https://github.com/snehavishwanatha/Predict_tags_on_stackoverflow_queries/raw/master/data/train.tsv'
df = pd.read_csv(url, sep='\t')

url1 = 'https://github.com/snehavishwanatha/Predict_tags_on_stackoverflow_queries/raw/master/data/validation.tsv'
df1 = pd.read_csv(url1, sep='\t')

In [0]:
df['tags'] = df['tags'].apply(literal_eval)
df1['tags'] = df1['tags'].apply(literal_eval)

In [197]:
print(df1.head(10))
print(len(df1))

                                               title                         tags
0                         Why odbc_exec always fail?                   [php, sql]
1  Access a base classes variable from within a c...                 [javascript]
2  Content-Type "application/json" not required i...        [ruby-on-rails, ruby]
3         Sessions in Sinatra: Used to Pass Variable              [ruby, session]
4  Getting error - type "json" does not exist - i...  [ruby-on-rails, ruby, json]
5                        library not found for.....?    [c++, iphone, ios, xcode]
6  .csproj File - Programmatic adding/deleting files                         [c#]
7  TypeError: makedirs() got an unexpected keywor...             [python, django]
8                      How to Pan a div using JQuery   [javascript, jquery, html]
9          Hibernate intermediate/advanced tutorials            [java, hibernate]
30000


In [0]:
train = df
test = df1

In [176]:
train.head(10)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]
5,Awesome nested set plugin - how to add new chi...,[ruby-on-rails]
6,How to create map from JSON response in Ruby o...,"[ruby, ruby-on-rails-3, json]"
7,rspec test if method is called,[ruby]
8,SpringBoot Catalina LifeCycle Exception,"[java, spring, spring-mvc]"
9,How to import data from excel to mysql databas...,"[php, codeigniter]"


In [177]:
test.head()

Unnamed: 0,title,tags
0,Why odbc_exec always fail?,"[php, sql]"
1,Access a base classes variable from within a c...,[javascript]
2,"Content-Type ""application/json"" not required i...","[ruby-on-rails, ruby]"
3,Sessions in Sinatra: Used to Pass Variable,"[ruby, session]"
4,"Getting error - type ""json"" does not exist - i...","[ruby-on-rails, ruby, json]"


In [0]:
x_train, y_train = train['title'].values, train['tags'].values
x_test, y_test = test['title'].values, test['tags'].values

In [0]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE," ",text,)
    text = re.sub(BAD_SYMBOLS_RE,"",text)
    text = re.sub(r'\s+'," ",text)
    text = " ".join([word for word in text.split(" ") if word not in STOPWORDS])
    return text

In [180]:
x_train = [text_prepare(x) for x in x_train]
x_train[0]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app',
 'get current time specific country via jquery',
 'configuring tomcat use ssl',
 'awesome nested set plugin add new children tree various levels',
 'create map json response ruby rails 3',
 'rspec test method called',
 'springboot catalina lifecycle exception',
 'import data excel mysql database using php',
 'obtaining object javalangclasst object parameterized type without constructing class q_uestion',
 'ipad selecting text inside text input tap',
 'jquerys function object',
 'eclipse c++ mingw lauch program terminated',
 'javascript call one prototype method another prototype method',
 'get intersection list sets',
 'longer able hide keyboard viewwilldisappear ios7',
 'fetch key json swift',
 'change pivot header template windows phone 8',
 'connectionstring encryption',
 'let ui refresh long running ui operation',
 'better way execute ruby file using python get

In [0]:
x_test = [text_prepare(x) for x in x_test]

In [182]:
y_train

array([list(['r']), list(['php', 'mysql']), list(['c#']), ...,
       list(['python', 'datetime', 'pandas']),
       list(['javascript', 'jquery']), list(['java', 'list', 'generics'])],
      dtype=object)

In [183]:
tfidf_converter = TfidfVectorizer(max_features=500, norm='l2', stop_words='english')
tfidf_converter.fit(df['title'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=500,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
train_tfidf =  tfidf_converter.transform(x_train)
test_tfidf =  tfidf_converter.transform(x_test)

In [185]:
tags_counts = Counter()

for tags in y_train:
    for tag in tags:
        tags_counts[tag] += 1
tags_counts

Counter({'.net': 3872,
         'ajax': 1767,
         'algorithm': 419,
         'android': 2818,
         'angularjs': 1353,
         'apache': 441,
         'arrays': 2277,
         'asp.net': 3939,
         'asp.net-mvc': 1244,
         'c': 3119,
         'c#': 19077,
         'c++': 6469,
         'class': 509,
         'cocoa-touch': 507,
         'codeigniter': 786,
         'css': 1769,
         'csv': 435,
         'database': 740,
         'date': 560,
         'datetime': 557,
         'django': 1835,
         'dom': 400,
         'eclipse': 992,
         'entity-framework': 649,
         'excel': 443,
         'facebook': 508,
         'file': 582,
         'forms': 872,
         'function': 487,
         'generics': 420,
         'google-maps': 408,
         'hibernate': 807,
         'html': 4668,
         'html5': 842,
         'image': 672,
         'ios': 3256,
         'iphone': 1909,
         'java': 18661,
         'javascript': 19078,
         'jquery': 7510,
    

In [186]:
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_trained = mlb.fit_transform(y_train)
y_trained[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [187]:
lr = LogisticRegression(C=1.0, penalty='l2', max_iter=300)
    
ovr = OneVsRestClassifier(lr)
model = ovr.fit(train_tfidf, y_trained)



In [0]:
predictions_for_test = ovr.predict(test_tfidf)

In [189]:
predictions_for_test[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [0]:
pred_inversed = mlb.inverse_transform(predictions_for_test)

In [191]:
model.predict_proba(test_tfidf)

array([[0.04097413, 0.00407445, 0.00461728, ..., 0.0023936 , 0.00616019,
        0.00513486],
       [0.05880381, 0.00230241, 0.00146285, ..., 0.00354927, 0.00812356,
        0.0121891 ],
       [0.00516707, 0.0334915 , 0.00087614, ..., 0.00093897, 0.00148021,
        0.00286439],
       ...,
       [0.07987029, 0.0203893 , 0.00217145, ..., 0.00442057, 0.00827259,
        0.00401999],
       [0.05197969, 0.00334852, 0.00192232, ..., 0.00106714, 0.00513187,
        0.00131743],
       [0.04497839, 0.01690949, 0.0019655 , ..., 0.00153242, 0.01806684,
        0.00428785]])

In [196]:
#NULL lables generated is high - noted
nc = 0 
for i in range(0,len(list(pred_inversed))):
  if not len(list(pred_inversed[i])):
    nc = nc + 1
  #print(y_test[i],list(pred_inversed[i]))
nc

12878

In [193]:
accuracy = 0
#print(len(list(pred_inversed)))
for i in range(0,len(list(pred_inversed))):
  #print(pred_inversed[i])
  for j in range(0,len(list(pred_inversed[i]))):
    #print(i, j, pred_inversed[i][j], y_test[i])
    if pred_inversed[i][j] in y_test[i]:
      accuracy = accuracy + 1
      #print("A", accuracy)
      break
print("ACCURACY", accuracy/len(list(pred_inversed)))

ACCURACY 0.5207333333333334
