In [1]:
import re
import os
import nltk
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tqdm import tqdm
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from gensim.models.keyedvectors import KeyedVectors
word_vect = KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)

In [2]:
idToTagIndex = {}          #dict mapping post ID to a list of tag indices
tagToTagIndex = {}         #dict mapping tag to tag index

In [89]:
import math
def truncate(number, digits) -> float:
    stepper = 10.0 ** digits
    return math.trunc(stepper * number) / stepper

In [51]:
with open("data/Questions-ascii-1M.csv") as question_input:
    questions_data = pd.read_csv(question_input, engine='python')

    print(questions_data.columns)
    questions_data = questions_data[['Id', 'Body']]
    questions_data.insert(len(questions_data.columns), 'Code', "")
    
    a = re.compile(r'<pre><code>([^<]*)</code></pre>')
    b = re.compile(r'<.*?>')
    questions_data['Code'] = questions_data['Body'].apply(lambda x: ' '.join(re.findall(a, x)))

    def clean(text):
        x = re.sub(a, '', text)
        x = re.sub(b, '', x)
        x = x.replace('\n\n', '\n')
        return x
    questions_data['Body'] = questions_data['Body'].apply(clean)
    
#     questions_data['Body'] = questions_data['Body'].apply(nltk.tokenize.word_tokenize) #need to fix: don't convert
#     # C#, C++ to C
#     questions_data['Body'] = questions_data['Body'].apply(lambda x: [word for word in x if word.isalnum()])
#     questions_data['Body'] = questions_data['Body'].apply(lambda x: [word.lower() for word in x])

Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title',
       'Body'],
      dtype='object')


In [52]:
from collections import defaultdict

idToTagIndex = {}          #dict mapping post ID to a list of tag indices
tagToTagIndex = {}         #dict mapping tag to tag index
tagIndexToTag = {}
tagToFrequency = defaultdict(lambda: 0)

with open("data/tags-1M.csv") as tag_input:
    tag_data = pd.read_csv(tag_input)
    tagIndex = 0
    for index, row in tqdm(tag_data.iterrows()):
        currId = int(row[0])
        currTag = row[1]
        if currTag not in tagToTagIndex:
            tagToTagIndex[currTag] = tagIndex
            tagIndexToTag[tagIndex] = currTag
            currTagIndex = tagIndex
            tagIndex += 1

        else:
            currTagIndex = tagToTagIndex[currTag]  
        
        tagToFrequency[currTagIndex] += 1        
    
        if currId not in idToTagIndex.keys():
            idToTagIndex[currId] = [tagToTagIndex[row[1]]]
        else:
            idToTagIndex[currId].append(tagToTagIndex[row[1]])
            

152759it [00:14, 10367.46it/s]


In [53]:
print("Number of examples: ", len(idToTagIndex))

Number of examples:  53203


In [54]:
# find 10 most common tags

tagToFrequencyList = []

for key, value in tagToFrequency.items():
    temp = [key, value]
    tagToFrequencyList.append(temp)
    
tagToFrequencyList.sort(reverse=True, key=lambda x: x[1])

for tag in tagToFrequencyList[:10]:
    print(f"{tagIndexToTag[tag[0]]} ({tag[0]}): {tag[1]} times")
    
mostCommonTags = {}
for counter, tag in enumerate(tagToFrequencyList[:10]):   #currently takes top 10 tags
    mostCommonTags[tag[0]] = counter


c# (14): 6722 times
java (89): 3858 times
.net (15): 3598 times
php (76): 3223 times
asp.net (8): 3041 times
javascript (132): 2852 times
c++ (18): 2509 times
jquery (370): 2198 times
iphone (607): 2111 times
python (196): 2070 times


In [55]:
# list(idToTagIndex.values())[:10]
print(mostCommonTags)

{14: 0, 89: 1, 15: 2, 76: 3, 8: 4, 132: 5, 18: 6, 370: 7, 607: 8, 196: 9}


In [56]:
idToTenTags = {}

for postId, tags in idToTagIndex.items():
    containsTopTenTags = [mostCommonTags[tag] for tag in tags if tag in mostCommonTags.keys()]
    idToTenTags[postId] = containsTopTenTags
    
questions_data['Top-Tags'] = questions_data['Id'].apply(lambda x: idToTenTags[x])

print(questions_data)

            Id                                               Body  \
0           80  I've written a database generation script in S...   
1           90  Are there any really good tutorials explaining...   
2          120  Has anyone got experience creating SQL-based A...   
3          180  This is something I've pseudo-solved many time...   
4          260  I have a little game written in C#. It uses a ...   
...        ...                                                ...   
53198  2495810  I'm going to host an app on a shared host and ...   
53199  2495870  I use MouseMove event to move objects(say labe...   
53200  2495890  I have dragged a empty asp.net table onto my w...   
53201  2495910  I want to log some seemingly random errors I'm...   
53202  2496040  I have a php array that has a bunch of data th...   

                                                    Code Top-Tags  
0      Create Table tRole (\n      roleID integer Pri...       []  
1                                  

In [105]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(questions_data['Body'].values.tolist())
vectorizer.adapt(text_ds)
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'to', 'i', 'a', 'is', 'and', 'in', 'of']

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

questions_X = vectorizer(np.array([[s] for s in questions_data['Body'].values])).numpy()
questions_y = mlb.fit_transform(np.array(questions_data['Top-Tags'].values))
print(questions_y[:5])

train_X, test_X, train_y, test_y = train_test_split(questions_X, questions_y, train_size=0.75, random_state=200)

[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 0]]


In [59]:
output = vectorizer([["I tried running this line of code, but I'm receiving a null pointer exception"]])
output.numpy()

array([[   4,  124,  176,   12,  166,    9,   40,   15,   26, 1701,    5,
         393,  848,  291,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [60]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [61]:
test = ["this", "line", "of", "code"]
[word_index[w] for w in test] 

[12, 166, 9, 40]

In [62]:
num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    try:
        embedding_vector = word_vect.get_vector(word)
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    except:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18657 words (1343 misses)


In [63]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [64]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(mostCommonTags))(x) #change to all tags
preds = layers.Softmax(axis=-1)(preds)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "functional_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 200)         4000400   
_________________________________________________________________
conv1d_23 (Conv1D)           (None, None, 128)         128128    
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, None, 128)         0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, None, 128)         0         
_________________________________________________________________
conv1d_25 (Conv1D)           (None, None, 128)       

In [68]:
from sklearn.utils import class_weight

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

weights = {}

for i in range(10):
    weights[i] = (1 / tagToFrequencyList[i][1])

model.fit(train_X, train_y, batch_size=128, epochs=20, validation_data=(test_X, test_y), class_weight=weights)

[[0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
6722
3858
3598
3223
3041
2852
2509
2198
2111
2070
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
 23/312 [=>............................] - ETA: 47s - loss: 3.4699e-05 - accuracy: 0.4399

KeyboardInterrupt: 

In [104]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    ["I wanted to open my app without safari system alert but I found out that is impossible. so i decided to handle this alert event but I couldn't find the way. if I click [open], then safari open App, but if I click [cancel], then 'appCheckTimer' will be executed, then safari moves to 'some page's url'. if there is no way to not open this alert, I want to handle this alert's button event, when user click [cancel], I just want to stay that page. that alert is not opened by me, it's by safari So I can't handle it."]
#     ["Dropped my iphone again :("]
)

for i, prob in np.ndenumerate(probabilities):
    print('{:<16}  {:<16}'.format(tagIndexToTag[tagToFrequencyList[i[1]][0]], truncate(prob, 3)))
print(f"\nMost likely tag: {tagIndexToTag[tagToFrequencyList[np.argmax(probabilities)][0]]}")

c#                0.018           
java              0.029           
.net              0.009           
php               0.018           
asp.net           0.016           
javascript        0.284           
c++               0.003           
jquery            0.498           
iphone            0.117           
python            0.002           

Most likely tag: jquery
