In [2]:
import csv
import glob
import re
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Polyglot
This project was designed to deepen our understanding of feature extraction, classification, and building a robust classifier. Specifically, the classifier that this project uses takes snippets of code and guesses the programming language of each snippet.

##### Files are read in and set as the training data.

In [3]:
# Maps a source-file extension (used below to build the glob pattern) to the
# language label attached to every file with that extension. Several
# extensions share one label, e.g. both 'gcc' and 'c' are labelled 'c'.
file_extensions = {
    'gcc': 'c',
    'c': 'c',
    'csharp': 'c#',
    'sbcl': 'common lisp',
    'clojure': 'clojure',
    'hs': 'haskell',
    'java': 'java',
    'javascript': 'javascript',
    'ocaml': 'ocaml',
    'perl': 'perl',
    'hack': 'php',
    'php': 'php',
    'python3': 'python',
    'jruby': 'ruby',
    'yarv': 'ruby',
    'scala': 'scala',
    'racket': 'scheme',
    'tcl': 'tcl',
}

In [4]:
def read_program_files(file_locations):
    """Read every file matching a glob pattern and return the file contents.

    Parameters
    ----------
    file_locations : str
        Glob pattern. The search is recursive, so ``**`` in the pattern
        matches any number of directory levels.

    Returns
    -------
    list of str
        The text of each matching file, ordered by sorted path. The list is
        empty when nothing matches the pattern.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the order of the returned texts reproducible across runs.
    paths = sorted(glob.glob(file_locations, recursive=True))
    texts = []
    for path in paths:
        # latin_1 assigns a character to every byte value, so decoding does
        # not fail on files that are not valid UTF-8.
        with open(path, encoding='latin_1') as f:
            texts.append(f.read())
    return texts

In [5]:
# Build the training set: X_train holds the text of each source file and
# y_train holds, at the same position, the language label for that file's
# extension.
X_train, y_train = [], []
for extension, language in file_extensions.items():
    sources = read_program_files('benchmarksgame/bench/**/*.{}'.format(extension))
    X_train.extend(sources)
    # One label per file that was read for this extension.
    y_train.extend([language] * len(sources))


##### A counter was used to check that no single language dominates the files used to train the classifier. Ruby (73 files) and JavaScript (25 files) stand out as needing fewer and more files, respectively, to balance the training set.

In [6]:
# Number of training files per language label, to check class balance.
Counter(y_train)

Counter({'c': 59,
         'c#': 41,
         'clojure': 38,
         'common lisp': 34,
         'haskell': 52,
         'java': 51,
         'javascript': 25,
         'ocaml': 35,
         'perl': 34,
         'php': 55,
         'python': 36,
         'ruby': 73,
         'scala': 43,
         'scheme': 29,
         'tcl': 52})

##### A pipeline is created that first converts the data into token counts with the CountVectorizer transformer and then classifies it with the MultinomialNB estimator.

In [7]:
# A token is any one of:
#   \w{2,}      a run of two or more word characters,
#   \s{2,}      a run of two or more whitespace characters,
#   [^\w\d\s]   a single character that is neither a word character nor
#               whitespace (punctuation, operators, brackets).
TOKEN_PATTERN = r'\w{2,}|\s{2,}|[^\w\d\s]'

# Step 1 turns each snippet into token counts; step 2 classifies those counts.
vectorizer = CountVectorizer(analyzer='word', token_pattern=TOKEN_PATTERN)
classifier = MultinomialNB()
pip = Pipeline([('vect', vectorizer), ('clf', classifier)])

In [8]:
# Fit the vectorizer and the classifier on the training snippets and their
# language labels.
pip.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{2,}|\\s{2,}|[^\\w\\d\\s]',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

##### The mean accuracy of the training files and labels provided to train the classifier.
This score should be close to 1.0, so the score of 0.98 for this training data shows that the classifier has fit the training files well and is ready to be tried on unseen files.

In [9]:
# Score on the same data the pipeline was fit on: this checks how well the
# training set was learned, not how well the model generalizes.
pip.score(X_train, y_train)

0.98325722983257224

##### Files are read in and set as testing data and labels. 

In [13]:
# Load the numbered test snippets test/1 ... test/N_TEST_FILES in numeric
# order. The scoring cell pairs X_test[i] with y_test[i], so the order and
# count of the snippets must not drift.
N_TEST_FILES = 32

X_test = []
for num in range(1, N_TEST_FILES + 1):
    X = read_program_files('test/{}'.format(num))
    # A missing file yields an empty list, which would otherwise be skipped
    # silently and shift every later snippet against its label.
    if len(X) != 1:
        raise FileNotFoundError(
            'expected exactly one test file at test/{}, found {}'.format(num, len(X)))
    X_test += X

In [15]:
# Read the expected language labels: one label per CSV row, taken from the
# second column. newline='' leaves newline handling to the csv module, as the
# csv documentation recommends for files passed to csv.reader.
with open('test.csv', 'r', newline='') as f:
    # csv.reader returns an empty list for a blank line; skip those rows
    # instead of failing on row[1].
    y_test = [row[1] for row in csv.reader(f) if row]

##### The mean accuracy of the test files and labels provided.
A decent score for testing data should be around 0.8. At 0.78, this score is slightly lower than would be preferred; however, this is most likely due to the overlapping similarities between many of the languages.

In [16]:
# Score on the snippets loaded from the test/ directory against the labels
# read from test.csv.
pip.score(X_test, y_test)

0.78125

##### The first list is the language the classifier predicts for each file. The second list is the programming language each file was actually written in. Comparing the two, it is clear that this classifier has the most difficulty distinguishing Java files: both Java files are predicted to be C.

In [17]:
# Print the predicted labels first and the true labels second so the two
# lists can be compared position by position.
predicted = pip.predict(X_test)
print(predicted)
print(y_test)

['clojure' 'clojure' 'clojure' 'clojure' 'python' 'python' 'ruby' 'python'
 'javascript' 'javascript' 'scala' 'scala' 'ruby' 'ruby' 'ruby' 'haskell'
 'haskell' 'tcl' 'scheme' 'scheme' 'scheme' 'c' 'c' 'scala' 'scala' 'tcl'
 'tcl' 'c' 'php' 'php' 'ocaml' 'ocaml']
['clojure', 'clojure', 'clojure', 'clojure', 'python', 'python', 'python', 'python', 'javascript', 'javascript', 'javascript', 'javascript', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'scheme', 'scheme', 'scheme', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']
