In [159]:
import pandas as pd
import re
import random
from ast import literal_eval
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


In [160]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/sneha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [161]:
url = 'https://github.com/snehavishwanatha/Predict_tags_on_stackoverflow_queries/raw/master/data/train.tsv'
df = pd.read_csv(url, sep='\t')

url1 = 'https://github.com/snehavishwanatha/Predict_tags_on_stackoverflow_queries/raw/master/data/validation.tsv'
df1 = pd.read_csv(url1, sep='\t')

In [162]:
df['tags'] = df['tags'].apply(literal_eval)
df1['tags'] = df1['tags'].apply(literal_eval)

In [163]:
print(df1.head(10))
print(len(df1))

                                               title  \
0                         Why odbc_exec always fail?   
1  Access a base classes variable from within a c...   
2  Content-Type "application/json" not required i...   
3         Sessions in Sinatra: Used to Pass Variable   
4  Getting error - type "json" does not exist - i...   
5                        library not found for.....?   
6  .csproj File - Programmatic adding/deleting files   
7  TypeError: makedirs() got an unexpected keywor...   
8                      How to Pan a div using JQuery   
9          Hibernate intermediate/advanced tutorials   

                          tags  
0                   [php, sql]  
1                 [javascript]  
2        [ruby-on-rails, ruby]  
3              [ruby, session]  
4  [ruby-on-rails, ruby, json]  
5    [c++, iphone, ios, xcode]  
6                         [c#]  
7             [python, django]  
8   [javascript, jquery, html]  
9            [java, hibernate]  
30000


In [164]:
train = df
test = df1

In [165]:
train.head(10)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]
5,Awesome nested set plugin - how to add new chi...,[ruby-on-rails]
6,How to create map from JSON response in Ruby o...,"[ruby, ruby-on-rails-3, json]"
7,rspec test if method is called,[ruby]
8,SpringBoot Catalina LifeCycle Exception,"[java, spring, spring-mvc]"
9,How to import data from excel to mysql databas...,"[php, codeigniter]"


In [166]:
test.head()

Unnamed: 0,title,tags
0,Why odbc_exec always fail?,"[php, sql]"
1,Access a base classes variable from within a c...,[javascript]
2,"Content-Type ""application/json"" not required i...","[ruby-on-rails, ruby]"
3,Sessions in Sinatra: Used to Pass Variable,"[ruby, session]"
4,"Getting error - type ""json"" does not exist - i...","[ruby-on-rails, ruby, json]"


In [167]:
x_train, y_train = train['title'].values, train['tags'].values
x_test, y_test = test['title'].values, test['tags'].values

In [168]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE," ",text,)
    text = re.sub(BAD_SYMBOLS_RE,"",text)
    text = re.sub(r'\s+'," ",text)
    text = " ".join([word for word in text.split(" ") if word not in STOPWORDS])
    return text

In [169]:
x_train = [text_prepare(x) for x in x_train]
x_train[0]

'draw stacked dotplot r'

In [170]:
x_test = [text_prepare(x) for x in x_test]

In [171]:
y_train

array([list(['r']), list(['php', 'mysql']), list(['c#']), ...,
       list(['python', 'datetime', 'pandas']),
       list(['javascript', 'jquery']), list(['java', 'list', 'generics'])],
      dtype=object)

In [172]:
tfidf_converter = TfidfVectorizer(max_features=500, norm='l2', stop_words='english')
tfidf_converter.fit(df['title'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [173]:
train_tfidf =  tfidf_converter.transform(x_train)
test_tfidf =  tfidf_converter.transform(x_test)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [174]:
tags_counts = Counter()

for tags in y_train:
    for tag in tags:
        tags_counts[tag] += 1
tags_counts

Counter({'r': 1727,
         'php': 13907,
         'mysql': 3092,
         'c#': 19077,
         'javascript': 19078,
         'jquery': 7510,
         'java': 18661,
         'ruby-on-rails': 3344,
         'ruby': 2326,
         'ruby-on-rails-3': 692,
         'json': 2026,
         'spring': 1346,
         'spring-mvc': 618,
         'codeigniter': 786,
         'class': 509,
         'html': 4668,
         'ios': 3256,
         'c++': 6469,
         'eclipse': 992,
         'python': 8940,
         'list': 693,
         'objective-c': 4338,
         'swift': 1465,
         'xaml': 438,
         'asp.net': 3939,
         'wpf': 1289,
         'multithreading': 1118,
         'image': 672,
         'performance': 512,
         'twitter-bootstrap': 501,
         'linq': 964,
         'xml': 1347,
         'numpy': 502,
         'ajax': 1767,
         'django': 1835,
         'laravel': 525,
         'android': 2818,
         'rest': 456,
         'asp.net-mvc': 1244,
         'web-s

In [175]:
mlb = MultiLabelBinarizer()

y_trained = pd.DataFrame(mlb.fit_transform(y_train),
                   columns=tags_counts.keys())
y_trained.head()

Unnamed: 0,r,php,mysql,c#,javascript,jquery,java,ruby-on-rails,ruby,ruby-on-rails-3,...,windows,pointers,oop,datetime,servlets,session,cocoa-touch,apache,selenium,maven
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [176]:
clf = RandomForestClassifier(max_depth=600,random_state=50, 
                             criterion='entropy', warm_start=True)


In [177]:
model = clf.fit(train_tfidf, y_trained)  

In [178]:
predictions = clf.predict(test_tfidf)

In [179]:
predictions[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [180]:
pred_inversed = mlb.inverse_transform(predictions)
for i in range(0,30):
    print(x_test[i], y_test[i], pred_inversed[i])

odbc_exec always fail ['php', 'sql'] ()
access base classes variable within child class ['javascript'] ()
contenttype application json required rails ['ruby-on-rails', 'ruby'] ('ruby-on-rails',)
sessions sinatra used pass variable ['ruby', 'session'] ()
getting error type json exist postgresql rake db migrate ['ruby-on-rails', 'ruby', 'json'] ('java', 'json')
library found ['c++', 'iphone', 'ios', 'xcode'] ()
csproj file programmatic adding deleting files ['c#'] ()
typeerror makedirs got unexpected keyword argument exists_ok ['python', 'django'] ('forms', 'python')
pan div using jquery ['javascript', 'jquery', 'html'] ('javascript', 'jquery')
hibernate intermediate advanced tutorials ['java', 'hibernate'] ('hibernate', 'java')
c# properties access values another function class ['c#', '.net', 'class'] ()
use databinding 3d elements like visual3d uielement3d ['c#', 'wpf'] ('javascript',)
check video exists youtube using php ['php'] ('php',)
handle mouse dragging event jfreechart ['java']

In [181]:
#NULL lables generated is high - noted
nc = 0 
for i in range(0,len(list(pred_inversed))):
    if not len(list(pred_inversed[i])):
        nc = nc + 1
  #print(y_test[i],list(pred_inversed[i]))
nc

10888

In [182]:
accuracy = 0
#print(len(list(pred_inversed)))
for i in range(0,len(list(pred_inversed))):
  #print(pred_inversed[i])
  for j in range(0,len(list(pred_inversed[i]))):
    #print(i, j, pred_inversed[i][j], y_test[i])
    if pred_inversed[i][j] in y_test[i]:
      accuracy = accuracy + 1
      #print("A", accuracy)
      break
print("ACCURACY", accuracy/len(list(pred_inversed)))

ACCURACY 0.5376
