In [111]:
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import MultinomialNB

In [72]:
# experimenting with glob to import files
glob.glob('bench/binarytrees/*.*')

['bench/binarytrees/binarytrees.ats',
 'bench/binarytrees/binarytrees.ats-3.ats',
 'bench/binarytrees/binarytrees.clojure',
 'bench/binarytrees/binarytrees.clojure-2.clojure',
 'bench/binarytrees/binarytrees.clojure-6.clojure',
 'bench/binarytrees/binarytrees.csharp',
 'bench/binarytrees/binarytrees.csharp-2.csharp',
 'bench/binarytrees/binarytrees.dart',
 'bench/binarytrees/binarytrees.erlang',
 'bench/binarytrees/binarytrees.erlang-2.erlang',
 'bench/binarytrees/binarytrees.fpascal',
 'bench/binarytrees/binarytrees.fsharp',
 'bench/binarytrees/binarytrees.fsharp-2.fsharp',
 'bench/binarytrees/binarytrees.fsharp-3.fsharp',
 'bench/binarytrees/binarytrees.gcc',
 'bench/binarytrees/binarytrees.gcc-2.gcc',
 'bench/binarytrees/binarytrees.gcc-3.gcc',
 'bench/binarytrees/binarytrees.gcc-5.gcc',
 'bench/binarytrees/binarytrees.gcc-7.gcc',
 'bench/binarytrees/binarytrees.gcc-9.gcc',
 'bench/binarytrees/binarytrees.ghc',
 'bench/binarytrees/binarytrees.ghc-4.ghc',
 'bench/binarytrees/binarytr

In [73]:
#dictionary with names of languages and their file extentions
file_extentions = {"C": ["gcc", "c", "h"],
                    "C#": ["csharp"],
                    "Clojure": ["clj", "cljs", "cljs", "edn", "clojure"],
                    "Common Lisp": ["sbcl"],
                    "Haskell": ["hs", "lhs", "ghc"],
                    "Java": ["java", "class", "jar"],
                    "Javascript": ["js", "javascript"],
                    "OCaml": ["ocaml", "ml"],
                    "Perl": ["pl", "pm", "t", "pod", "perl"],
                    "PHP": ["php", "phtml", "php4", "php3", "php5", "phps", "hack"],
                    "Python": ["py", "pyw", "pyc", "pyo", "pyd", "python3", "Python2"],
                    "Ruby": ["rb", "rbw", "jruby", "yarv"],
                    "Scala": ["scala"],
                    "Scheme": ["scm", "ss", "racket"]}

In [74]:
file_extentions.items()


dict_items([('C', ['gcc', 'c', 'h']), ('Clojure', ['clj', 'cljs', 'cljs', 'edn', 'clojure']), ('Perl', ['pl', 'pm', 't', 'pod', 'perl']), ('Java', ['java', 'class', 'jar']), ('Python', ['py', 'pyw', 'pyc', 'pyo', 'pyd', 'python3', 'Python2']), ('OCaml', ['ocaml', 'ml']), ('PHP', ['php', 'phtml', 'php4', 'php3', 'php5', 'phps', 'hack']), ('Javascript', ['js', 'javascript']), ('Scala', ['scala']), ('Common Lisp', ['sbcl']), ('Ruby', ['rb', 'rbw', 'jruby', 'yarv']), ('C#', ['csharp']), ('Scheme', ['scm', 'ss', 'racket']), ('Haskell', ['hs', 'lhs', 'ghc'])])

In [80]:
def get_extention(ext):
    '''
    iterates through the extensions list.
    if it finds the ext in the lists of values,
    the key for those values is returned.
    '''
    for key, value in file_extentions.items():
        if ext in value:
            return key

In [81]:
def read_train_files():
    '''
    opens training files.
    for each file it uses the get_extention function
    to find the text after the "." and calls that the ext.
    ,and returns the value which is appended to the file content itself
    '''
    files = glob.glob('bench/binarytrees/*.*')
    texts = []
    for file in files:
        ext = get_extention(file.split(".")[-1])
        with open(file) as f:
            if ext != None:
                texts.append((f.read(), ext))
    return texts

In [82]:
read_train_files()

[(';; The Computer Language Benchmarks Game\n;; http://benchmarksgame.alioth.debian.org/\n;\n;; Adapted from the Java -server version\n;\n;; contributed by Marko Kocic\n;; modified by Kenneth Jonsson, restructured to allow usage of \'pmap\'\n;; modified by Andy Fingerhut to use faster primitive math ops, and\n;; deftype instead of defrecord for smaller tree nodes.\n;; modified by Rich Hickey for Clojure 1.3\n\n(ns binarytrees\n  (:gen-class))\n\n(set! *warn-on-reflection* true)\n(set! *unchecked-math* true)\n\n(definterface ITreeNode\n  (^long item [])\n  (left [])\n  (right []))\n\n;; These TreeNode\'s take up noticeably less memory than a similar one\n;; implemented using defrecord.\n\n(deftype TreeNode [left right ^long item]\n  ITreeNode\n  (item [this] item)\n  (left [this] left)\n  (right [this] right))\n\n(defn bottom-up-tree [^long item ^long depth]\n  (if (zero? depth)\n    (TreeNode. nil nil item)\n    (TreeNode.\n     (bottom-up-tree (dec (* 2 item))\n                     (d

In [83]:
train = read_train_files()
train = pd.DataFrame(train, columns = ["code", "language"])


In [87]:
train.head(7)

Unnamed: 0,code,language
0,;; The Computer Language Benchmarks Game\n;; h...,Clojure
1,;; The Computer Language Benchmarks Game\n;; h...,Clojure
2,;; The Computer Language Benchmarks Game\n;; h...,Clojure
3,﻿/*\n The Computer Language Benchmarks Ga...,C#
4,/* The Computer Language Benchmarks Game\n h...,C#
5,/* The Computer Language Benchmarks Game\n * h...,C
6,/* \n * The Computer Language Benchmarks Game ...,C


In [101]:
train.language.value_counts()

Ruby           9
PHP            7
C              6
Haskell        3
Scheme         3
Scala          3
OCaml          3
Clojure        3
Java           3
C#             2
Common Lisp    2
Perl           2
Javascript     1
Python         1
dtype: int64

In [116]:
X = train.loc[:,["code"]]
X.head(7)

Unnamed: 0,code
0,;; The Computer Language Benchmarks Game\n;; h...
1,;; The Computer Language Benchmarks Game\n;; h...
2,;; The Computer Language Benchmarks Game\n;; h...
3,﻿/*\n The Computer Language Benchmarks Ga...
4,/* The Computer Language Benchmarks Game\n h...
5,/* The Computer Language Benchmarks Game\n * h...
6,/* \n * The Computer Language Benchmarks Game ...


In [114]:
y = train.loc[:, "language"]
y.head(7)

0    Clojure
1    Clojure
2    Clojure
3         C#
4         C#
5          C
6          C
Name: language, dtype: object

In [117]:
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [118]:
vectorizer.fit(X, y)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [119]:
classifier = MultinomialNB
classifier.fit(X, y)

TypeError: fit() missing 1 required positional argument: 'y'

In [120]:
classifier.score(X, y)

TypeError: score() missing 1 required positional argument: 'y'