In [14]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

# ruby files
ruby_train_0 = r'ruby/ruby_train_0.jsonl'
ruby_train_1 = r'ruby/ruby_train_1.jsonl'
ruby_valid_0 = r'ruby/ruby_valid_0.jsonl'

# go files
go_train_0 = r'go/go_train_0.jsonl'
go_train_1 = r'go/go_train_1.jsonl'
go_train_2 = r'go/go_train_2.jsonl'
go_train_3 = r'go/go_train_3.jsonl'
go_train_4 = r'go/go_train_4.jsonl'
go_train_5 = r'go/go_train_5.jsonl'
go_train_6 = r'go/go_train_6.jsonl'
go_train_7 = r'go/go_train_7.jsonl'
go_train_8 = r'go/go_train_8.jsonl'
go_train_9 = r'go/go_train_9.jsonl'
go_train_10 = r'go/go_train_10.jsonl'
go_valid_0 = r'go/go_valid_0.jsonl'

# ruby dataframes
df_ruby_train_0 = pd.read_json(ruby_train_0, lines=True)
df_ruby_train_1 = pd.read_json(ruby_train_1, lines=True)
df_ruby_valid_0 = pd.read_json(ruby_valid_0, lines=True)

# go dataframes
df_go_train_0 = pd.read_json(go_train_0, lines=True)
df_go_train_1 = pd.read_json(go_train_1, lines=True)
df_go_train_2 = pd.read_json(go_train_2, lines=True)
df_go_train_3 = pd.read_json(go_train_3, lines=True)
df_go_train_4 = pd.read_json(go_train_4, lines=True)
df_go_train_5 = pd.read_json(go_train_5, lines=True)
df_go_train_6 = pd.read_json(go_train_6, lines=True)
df_go_train_7 = pd.read_json(go_train_7, lines=True)
df_go_train_8 = pd.read_json(go_train_8, lines=True)
df_go_train_9 = pd.read_json(go_train_9, lines=True)
df_go_train_10 = pd.read_json(go_train_1, lines=True)
df_go_valid_0 = pd.read_json(go_valid_0, lines=True)

In [7]:
ruby_test_0 = r'ruby/ruby_test_0.jsonl'
go_test_0 = r'go/go_test_0.jsonl'

df_ruby_test_0 = pd.read_json(ruby_test_0, lines=True)
df_go_test_0 = pd.read_json(go_test_0, lines=True)

In [8]:
df_test = pd.concat([df_ruby_test_0, df_go_test_0], axis=0)
df_test.shape


(16570, 12)

<div id="section">
    <h3>
        Main dataframe concatenation
    </h3>
    <p>
        Dataframes formed in the previous section are now concatenated to allow vectorizing using a tf-idf vactorizer. They are later input to the NB model. Data consists train + validation records, as mentioned in the problem definition.
    </p>
</div>

In [3]:
df = pd.concat([df_ruby_train_0, df_ruby_train_1, df_ruby_valid_0], axis=0)
df = pd.concat([df, df_go_train_0, df_go_train_1, df_go_train_2, df_go_train_3, df_go_train_4, 
                df_go_train_5, df_go_train_6, df_go_train_7, df_go_train_8, df_go_train_9,
                df_go_train_10], axis=0)
df.shape

(381000, 12)

In [4]:
df.columns

Index(['repo', 'path', 'func_name', 'original_string', 'language', 'code',
       'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url',
       'partition'],
      dtype='object')

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorizer(df):
    df['combined'] = df['code'] + df['docstring']
    vectorizer = TfidfVectorizer(max_features=1000)
    x = vectorizer.fit_transform(df['combined'])
    y = df['language']
    return x, y

In [10]:
x_train, y_train = vectorizer(df)

<381000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 9907340 stored elements in Compressed Sparse Row format>

<div id="section">
    <h3>
        Test dataframe concatenation
    </h3>
    <p>
        Similar to the training data, the test data is also vectorized and the classification reports are formed using this data.
    </p>
</div>

In [11]:
x_test, y_test = vectorizer(df_test)

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)

In [13]:
y_pred = nb_model.predict(x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

          go       0.86      0.99      0.92     14291
        ruby       0.15      0.02      0.03      2279

    accuracy                           0.85     16570
   macro avg       0.51      0.50      0.47     16570
weighted avg       0.76      0.85      0.80     16570

