# Create Labeled Code Snippets Dataset

The dataset was created and preprocessed following [this kernel](https://www.kaggle.com/amalhasni/creating-labeled-code-snippets-dataset) at kaggle and [this blog](https://towardsdatascience.com/classification-model-for-source-code-programming-languages-40d1ab7243c2) at towardsdatascience. The code for model was copied from [this](https://github.com/amal-hasni/spot-language) github repository.

In [1]:
%cd '/content/drive/Shareddrives/With Myself/huge-datasets-for-colab/labeled-code-snippets-dataset'

/content/drive/Shareddrives/With Myself/huge-datasets-for-colab/labeled-code-snippets-dataset


## Imports

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [22]:
from sklearn.metrics import confusion_matrix

## Data

In [4]:
data = pd.read_csv('data.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
data.head(10)

Unnamed: 0,content,language
0,// Generated by typings\n// Source: https://ra...,TypeScript
1,/*********************************************...,TypeScript
2,// instantiating a derived type can cause an i...,TypeScript
3,export class BootstrapUI {\n progress;\n\n ...,TypeScript
4,// Type definitions for Angular JS 1.5\n// Pro...,TypeScript
5,// Type definitions for JW Player\n// Project:...,TypeScript
6,// Copyright (c) 2016 Tracktunes Inc\n\n/**\n ...,TypeScript
7,"<?xml version=""1.0"" ?><!DOCTYPE TS><TS languag...",TypeScript
8,class AudioOutputBuffer {\n\n constructor(\...,TypeScript
9,"<?xml version=""1.0"" encoding=""utf-8""?>\n<!DOCT...",TypeScript


In [6]:
data['content'].iloc[0]

"// Generated by typings\n// Source: https://raw.githubusercontent.com/typed-typings/npm-mime/c7c5810698b7eaa421702a53644a8963d372f758/mime.d.ts\ndeclare module '~mime/mime' {\nclass Mime {\n  types: {\n    [extension: string]: string;\n  }\n\n  extensions: {\n    [extension: string]: string;\n  }\n\n  define (map: Map): void;\n  load (filename: string): void;\n  lookup (path: string, fallback?: string): string;\n  extension (mimeType: string): string;\n}\n\ninterface Map {\n  [type: string]: string[];\n}\n\nvar mime: Mime & {\n  default_type: string;\n  charsets: {\n    lookup (mimeType: string, fallback?: string): string;\n  }\n  Mime: typeof Mime;\n}\n\nexport = mime;\n}\ndeclare module 'mime/mime' {\nimport alias = require('~mime/mime');\nexport = alias;\n}\ndeclare module 'mime' {\nimport alias = require('~mime/mime');\nexport = alias;\n}\n"

In [7]:
data.isna().sum()

content     6440214
language    6444130
dtype: int64

In [8]:
len(data)

6575588

In [9]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [10]:
data.shape

(131458, 2)

## Processing, Model building and training

In [11]:
token_pattern = r'''([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]`"'])'''

In [13]:
def preprocess(x):
    return pd.Series(x).replace(r'\b([A-Za-z])\1+\b', '', regex=True).replace(r'\b[A-Za-z]\b', '', regex=True)

In [14]:
X, y = data.content, data.language
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
transformer = FunctionTransformer(preprocess)
vectorizer = TfidfVectorizer(token_pattern=token_pattern, max_features=3000)
clf = RandomForestClassifier(n_jobs=4)

In [21]:
vectorizer.get_feature_names()

['\t',
 ' ',
 '!',
 '!!',
 '!$',
 '!.',
 '!</',
 '!=',
 '!==',
 '!\\',
 '!_',
 '"',
 '#',
 '#!',
 '#!/',
 '##',
 '###',
 '####',
 '#-',
 '$',
 '$$',
 '$->',
 '$.',
 '$/',
 '$=',
 '$@',
 '$\\',
 '$_',
 '%',
 '%%',
 '%%%',
 '%.',
 '%/',
 '%:',
 '%\\',
 '&',
 '&#',
 '&&',
 '&&.',
 '&->',
 '&=',
 "'",
 '(',
 ')',
 '*',
 '**',
 '***',
 '**/',
 '*.',
 '*/',
 '*=',
 '*>',
 '*\\',
 '*_',
 '+',
 '++',
 '+++',
 '+.',
 '+/',
 '+//',
 '+=',
 '+\\',
 ',',
 '-',
 '-#',
 '-$',
 '--',
 '---',
 '-----------------------------------',
 '-->',
 '-.',
 '-:',
 '-=',
 '->',
 '->_',
 '-\\',
 '.',
 '.$',
 '.&',
 '.*',
 '.-',
 '.--',
 '..',
 '...',
 '...</',
 '...\\',
 '../',
 '../../',
 '../../../',
 '../../../../',
 '../../../../../',
 '../../../../../../',
 './',
 '.<',
 '.</',
 '.</>',
 '.=',
 '.>',
 '.\\',
 '.\\%',
 '._',
 '.__',
 '/',
 '/#',
 '/$',
 '/&',
 '/*',
 '/*!',
 '/**',
 '/+',
 '/-',
 '/.',
 '//',
 '//!',
 '//#',
 '//+',
 '///',
 '////',
 '/=',
 '/>',
 '/>.',
 '/><',
 '/></',
 '/>\\',
 '/?',
 '/\\

In [16]:
pipe_RF = Pipeline([
    ('preprocessing', transformer),
    ('vectorizer', vectorizer),
    ('clf', clf)]
)

In [17]:
best_params = {
    'clf__criterion': 'gini',
    'clf__max_features': 'sqrt',
    'clf__min_samples_split': 3,
    'clf__n_estimators': 300
}

In [18]:
pipe_RF.set_params(**best_params)

Pipeline(memory=None,
         steps=[('preprocessing',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function preprocess at 0x7f0858266950>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, validate=False)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='conte...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_d

In [19]:
pipe_RF.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function preprocess at 0x7f0858266950>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, validate=False)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='conte...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_d

In [20]:
print(f'Accuracy: {pipe_RF.score(X_test, y_test)}')

Accuracy: 0.9656549520766773


In [24]:
y_pred = pipe_RF.predict(X_test)

In [25]:
confusion_matrix(y_test, y_pred)

array([[ 622,    1,    0, ...,    0,    0,    0],
       [   0,  950,    0, ...,    0,    0,    0],
       [   1,    0, 1031, ...,    0,    1,    0],
       ...,
       [   1,    0,    0, ...,  484,    0,    0],
       [   0,    0,    1, ...,    0,  931,    0],
       [   0,    0,    0, ...,    0,    0,  322]])