In [17]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

Helper function to remove comment lines from code represented as string

In [18]:
def removeCommentLines(string):
    str = ''
    lines = string.split('\n')
    for line in lines:
        # Keep the Shebang line
        if line[0:2] == "#!":
            str += line
        # Also keep existing empty lines
        elif not line.strip():
            str += line
        # But remove comments from other lines
        else:
            line = line.split('#')
            stripped_string = line[0].rstrip()
            # Write the line only if the comment was after the code.
            # Discard lines that only contain comments.
            if stripped_string:
                str += stripped_string
                str += '\n'
    return str

Load json files

In [19]:
with open('index_to_code.json') as f:
    index_to_code = json.load(f)
with open('index_to_topics.json') as f:
    index_to_topics = json.load(f)
with open('all_topics.json') as f:
    topics = json.load(f)

Create dictionaries from json files and Split to train/test samples

In [20]:
index_to_code = {int(k):v for k,v in index_to_code.items()}
index_to_topics = {int(k):v for k,v in index_to_topics.items()}
code_train, code_test, topic_train, topic_test = train_test_split(index_to_code, index_to_topics, random_state=42, test_size=0.33, shuffle=True)

Convert to pandas dataframes

In [21]:
train_df = pd.DataFrame({'code':pd.Series(code_train),'topic':pd.Series(topic_train)})
test_df = pd.DataFrame({'code':pd.Series(code_test),'topic':pd.Series(topic_test)})

Remove commentaries

In [23]:
for idx, row in train_df.iterrows():
    row['code'] = removeCommentLines(row['code'])
    train_df.set_value(idx, 'code', row['code'])

for idx, row in test_df.iterrows():
    row['code'] = removeCommentLines(row['code'])
    test_df.set_value(idx, 'code', row['code'])

  This is separate from the ipykernel package so we can avoid doing imports until


  import sys


Use OneVsTheRest multi-label classifier

In [24]:
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])

Get dict of number (from 0 to number of topics) to name of category (e.g 1 -> i/o, 2 -> loop etc)

In [25]:
topicss = dict.fromkeys(topics, 0)
k = 0
for topic in topicss.keys():
    topicss[topic] = k
    k = k + 1

Prepare data to fit the model by replacing topic names to numbers

In [26]:
y = train_df['topic'].tolist()
y_test = test_df['topic'].tolist()

for lis in y:
    for i, topic in enumerate(lis):
        lis[i] = topicss[topic]

for lis in y_test:
    for i, topic in enumerate(lis):
        lis[i] = topicss[topic]
        
x = np.array(train_df['code'])

Use MultiLabelBinarizer to transform multi labels to matrix

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer
MLB = MultiLabelBinarizer()
y = MLB.fit_transform(y)


Fit-predict 

In [29]:
NB_pipeline.fit(x, y)
prediction = NB_pipeline.predict(np.array(test_df['code']))

Write prediction results to file, but at first convert themes back to human-readable format

In [16]:
#print(MLB.inverse_transform(prediction))
predictionn = MLB.inverse_transform(prediction)
# Swap key<->value in dict
topicss = dict((v,k) for k,v in topicss.items())

for j, topic_nums in enumerate(predictionn):
    topic_names = [v for v in topic_nums]
    topic_names = sorted(topic_names)
    for i, name in enumerate(topic_names):
        topic_names[i] = topicss[name]
    predictionn[j] = topic_names

for j, topic_nums in enumerate(y_test):
    topic_names = sorted(topic_nums)
    for i, name in enumerate(topic_names):
        topic_names[i] = topicss[name]
    y_test[j] = topic_names

import csv

with open("predicted.txt","w") as f:
    wr = csv.writer(f)
    wr.writerows(prediction)

with open("actual.txt","w") as f:
    wr = csv.writer(f)
    wr.writerows(y_test)

KeyError: 21

Count prediction accuracy by using accuracy_score

In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.


In [15]:
y_t = MultiLabelBinarizer().fit_transform(y_test)
print('Test accuracy is {}'.format(accuracy_score(y_t, prediction)))

Test accuracy is 4.9987503124218944e-05
