In [37]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder

import re
import time

In [2]:
def get_data():
    file_name = './LanguageSamples.txt'
    rawdata = open(file_name, 'r')
    lines = rawdata.readlines()
    return lines

In [3]:
#data_lines = get_data()

In [4]:
all_samples = ''.join(get_data())


In [5]:
def clean_data(input_lines):
    #find matches for all data within the pre tags
    all_found = re.findall(r'<pre[\s\S]*?<\/pre>', input_lines, re.MULTILINE)
    
    #clean the string of various tags
    clean_string = lambda x: x.replace('&lt;', '<').replace('&gt;', '>').replace('</pre>', '').replace('\n', '')
    all_found = [clean_string(item) for item in all_found]
    
    #get the language for all of the pre tags
    get_language = lambda x: re.findall(r'<pre lang="(.*?)">', x, re.MULTILINE)[0]
    lang_items = [get_language(item) for item in all_found]
    
    #remove all of the pre tags that contain the languare
    remove_lang = lambda x: re.sub(r'<pre lang="(.*?)">', "", x)
    all_found = [remove_lang(item) for item in all_found]
    
    #return let text between the pre tags and their corresponding language
    return (all_found, lang_items)

In [6]:
cleaned_data, languages = clean_data(all_samples)

In [12]:
df = pd.DataFrame()

In [13]:
lb_enc = LabelEncoder()

In [17]:
df['lang_text'] = languages
df['language'] = lb_enc.fit_transform(df['lang_text'])
df['data'] = cleaned_data

In [18]:
df.head(10)

Unnamed: 0,lang_text,data,language
0,XML,"<?xml version=""1.0""?><DevelopmentStorage xmlns...",26
1,Swift,@objc func handleTap(sender: UITapGestureRecog...,23
2,JavaScript,"var my_dataset = [ { id: ""1"", te...",9
3,Javascript,var my_dataset = [ { id...,10
4,Javascript,"var my_dataset = [ { ""id"": 1, ...",10
5,C#,public class AppIntents_Droid : IAppIntent...,3
6,Python,# Import `tensorflow` and `pandas`import tenso...,17
7,Python,# Setup feature columnscolumns_feat = [ tf....,17
8,Python,# Define train functiondef train_function(inpu...,17
9,Python,# Define evaluation functiondef evaluation_fun...,17


In [129]:
lb_enc.classes_

array(['ASM', 'ASP.NET', 'Angular', 'C#', 'C++', 'CSS', 'Delphi', 'HTML',
       'Java', 'JavaScript', 'Javascript', 'ObjectiveC', 'PERL', 'PHP',
       'Pascal', 'PowerShell', 'Powershell', 'Python', 'Razor', 'React',
       'Ruby', 'SQL', 'Scala', 'Swift', 'TypeScript', 'VB.NET', 'XML'], dtype=object)

In [20]:
def test_models(X_train_input_raw, y_train_input, X_test_input_raw, y_test_input, models_dict):

    return_trained_models = {}
    
    return_vectorizer = FeatureUnion([('tfidf_vect', TfidfVectorizer())])
    
    X_train = return_vectorizer.fit_transform(X_train_input_raw)
    X_test = return_vectorizer.transform(X_test_input_raw)
    
    for key in models_dict:
        model_name = key
        model = models_dict[key]
        t1 = time.time()
        model.fit(X_train, y_train_input)
        t2 = time.time()
        predicted_y = model.predict(X_test)
        t3 = time.time()
        
        output_accuracy(y_test_input, predicted_y, model_name, t2 - t1, t3 - t2)        
        return_trained_models[model_name] = model
        
    return (return_trained_models, return_vectorizer)


In [21]:
def create_models():
    models = {}
    models['LinearSVC'] = LinearSVC()
    models['LogisticRegression'] = LogisticRegression()
    models['RandomForestClassifier'] = RandomForestClassifier()
    models['DecisionTreeClassifier'] = DecisionTreeClassifier()
    models['MultinomialNB'] = MultinomialNB()
    return models


In [22]:
X_input, y_input = shuffle(df['data'], df['language'], random_state=7)

In [23]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_input, y_input, test_size=0.7)

In [24]:
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train_raw, y_train, X_test_raw, y_test, models)

NameError: name 'output_accuracy' is not defined

In [116]:

test_code = [df['data'][17]]
transformed_test_code = fitted_vectorizer.transform(test_code)
trained_models['DecisionTreeClassifier'].predict(transformed_test_code) 

array([22])

In [86]:
cleaned_data[2:3]

['var my_dataset = [   {       id: "1",       text: "Chairman &amp; CEO",       title: "Henry Bennett"   },   {       id: "2",       text: "Manager",       title: "Mildred Kim"   },   {       id: "3",       text: "Technical Director",       title: "Jerry Wagner"   },   { id: "1-2", from: "1", to: "2", type: "line" },   { id: "1-3", from: "1", to: "3", type: "line" }];']

In [59]:
all_found = re.findall(r'<pre[\s\S]*?<\/pre>', all_samples, re.MULTILINE)

In [60]:
clean_string = lambda x: x.replace('&lt;', '<').replace('&gt;', '>').replace('</pre>', '').replace('\n', '')

In [61]:
all_found = [clean_string(item) for item in all_found]

In [62]:
get_language = lambda x: re.findall(r'<pre lang="(.*?)">', x, re.MULTILINE)[0]

In [63]:
lang_items = [get_language(item) for item in all_found]

In [64]:
remove_lang = lambda x: re.sub(r'<pre lang="(.*?)">', "", x)

In [65]:
all_found = [remove_lang(item) for item in all_found]

In [69]:
all_found[4]

'var my_dataset = [       {          "id": 1,          "text": "item: 1",       },       {          "id": 2,          "text": "item: 2",          "parent": 1,          "dir": "vertical"       },       {          "id": 3,          "text": "item: 3",          "parent": 14       },       {          "id": 7,          "text": "item: 7",          "parent": 14       },       {          "id": 14,          "text": "item: 14",          "parent": 2       },       {          "id": 19,          "text": "item: 19",          "parent": 2       }];'

In [14]:
cat_df = pd.DataFrame()
cat_df['lang_text'] = lang_items

In [15]:
lb_enc = LabelEncoder()
cat_df['language'] = lb_enc.fit_transform(cat_df['lang_text'])
cat_df.head(10)


Unnamed: 0,lang_text,language
0,XML,26
1,Swift,23
2,JavaScript,9
3,Javascript,10
4,Javascript,10
5,C#,3
6,Python,17
7,Python,17
8,Python,17
9,Python,17


In [16]:
cat_df.dtypes

lang_text    object
language      int64
dtype: object

In [17]:
lb_enc.classes_

array(['ASM', 'ASP.NET', 'Angular', 'C#', 'C++', 'CSS', 'Delphi', 'HTML',
       'Java', 'JavaScript', 'Javascript', 'ObjectiveC', 'PERL', 'PHP',
       'Pascal', 'PowerShell', 'Powershell', 'Python', 'Razor', 'React',
       'Ruby', 'SQL', 'Scala', 'Swift', 'TypeScript', 'VB.NET', 'XML'], dtype=object)

In [18]:
print(len(all_found))

2031


In [19]:
all_found[0]

'<?xml version="1.0"?><DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18">  <SQLInstance>(localdb)\\v11.0</SQLInstance>  <PageBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\PageBlobRoot</PageBlobRoot>  <BlockBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\BlockBlobRoot</BlockBlobRoot>  <LogPath>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\Logs</LogPath>  <LoggingEnabled>false</LoggingEnabled></DevelopmentStorage>'

In [20]:
test = '<pre>test</pre>more<pre>test1</pre'

In [21]:
#matches = p.findall(all_samples)

In [22]:
#print(matches)

In [23]:
# for m in matches:
#     print(m)

In [24]:
all_found = re.findall(r'<pre[\s\S]*?<\/pre>', all_samples, re.MULTILINE)

In [25]:
all_found

['<pre lang="XML">\n&lt;?xml version="1.0"?&gt;\n&lt;DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18"&gt;\n  &lt;SQLInstance&gt;(localdb)\\v11.0&lt;/SQLInstance&gt;\n  &lt;PageBlobRoot&gt;C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\PageBlobRoot&lt;/PageBlobRoot&gt;\n  &lt;BlockBlobRoot&gt;C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\BlockBlobRoot&lt;/BlockBlobRoot&gt;\n  &lt;LogPath&gt;C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\Logs&lt;/LogPath&gt;\n  &lt;LoggingEnabled&gt;false&lt;/LoggingEnabled&gt;\n&lt;/DevelopmentStorage&gt;</pre>',
 '<pre lang="Swift">\n@objc func handleTap(sender: UITapGestureRecognizer) {\n    if let tappedSceneView = sender.view as? ARSCNView {\n        let tapLocationInView = sender.location(in: tappedSceneView)\n        let planeHitTest = tappedSceneView.hitTest(tapLocationInView,\n            types: .existingPlaneUsingExtent)\n        if !pla

In [26]:
for m in all_found: 
    print(m)
    print('----------------')

<pre lang="XML">
&lt;?xml version="1.0"?&gt;
&lt;DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18"&gt;
  &lt;SQLInstance&gt;(localdb)\v11.0&lt;/SQLInstance&gt;
  &lt;PageBlobRoot&gt;C:\Users\Carl\AppData\Local\DevelopmentStorage\PageBlobRoot&lt;/PageBlobRoot&gt;
  &lt;BlockBlobRoot&gt;C:\Users\Carl\AppData\Local\DevelopmentStorage\BlockBlobRoot&lt;/BlockBlobRoot&gt;
  &lt;LogPath&gt;C:\Users\Carl\AppData\Local\DevelopmentStorage\Logs&lt;/LogPath&gt;
  &lt;LoggingEnabled&gt;false&lt;/LoggingEnabled&gt;
&lt;/DevelopmentStorage&gt;</pre>
----------------
<pre lang="Swift">
@objc func handleTap(sender: UITapGestureRecognizer) {
    if let tappedSceneView = sender.view as? ARSCNView {
        let tapLocationInView = sender.location(in: tappedSceneView)
        let planeHitTest = tappedSceneView.hitTest(tapLocationInView,
            types: .existingPlaneUsingExtent)
        if !planeHitTest.isEmpty {
  

In [27]:
x = all_found[0]


In [28]:
x

'<pre lang="XML">\n&lt;?xml version="1.0"?&gt;\n&lt;DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18"&gt;\n  &lt;SQLInstance&gt;(localdb)\\v11.0&lt;/SQLInstance&gt;\n  &lt;PageBlobRoot&gt;C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\PageBlobRoot&lt;/PageBlobRoot&gt;\n  &lt;BlockBlobRoot&gt;C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\BlockBlobRoot&lt;/BlockBlobRoot&gt;\n  &lt;LogPath&gt;C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\Logs&lt;/LogPath&gt;\n  &lt;LoggingEnabled&gt;false&lt;/LoggingEnabled&gt;\n&lt;/DevelopmentStorage&gt;</pre>'

In [29]:
x = x.replace('&lt;', '<').replace('&gt;', '>').replace('</pre>', '').replace('\n', '')

In [30]:
print(x)

<pre lang="XML"><?xml version="1.0"?><DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18">  <SQLInstance>(localdb)\v11.0</SQLInstance>  <PageBlobRoot>C:\Users\Carl\AppData\Local\DevelopmentStorage\PageBlobRoot</PageBlobRoot>  <BlockBlobRoot>C:\Users\Carl\AppData\Local\DevelopmentStorage\BlockBlobRoot</BlockBlobRoot>  <LogPath>C:\Users\Carl\AppData\Local\DevelopmentStorage\Logs</LogPath>  <LoggingEnabled>false</LoggingEnabled></DevelopmentStorage>


In [31]:
re.findall(r'<pre lang="(.*?)">', x, re.MULTILINE)

['XML']

In [32]:
x = re.sub(r'<pre lang="(.*?)">', "", x)

In [33]:
x

'<?xml version="1.0"?><DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18">  <SQLInstance>(localdb)\\v11.0</SQLInstance>  <PageBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\PageBlobRoot</PageBlobRoot>  <BlockBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\BlockBlobRoot</BlockBlobRoot>  <LogPath>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\Logs</LogPath>  <LoggingEnabled>false</LoggingEnabled></DevelopmentStorage>'

In [34]:
splitted = x.split('\n')

In [35]:
indicator = splitted[0]
x1 = splitted[1:]

In [36]:
x1

[]

In [37]:
res = ''.join(x1)

In [38]:
res

''

In [39]:
indicator

'<?xml version="1.0"?><DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18">  <SQLInstance>(localdb)\\v11.0</SQLInstance>  <PageBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\PageBlobRoot</PageBlobRoot>  <BlockBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\BlockBlobRoot</BlockBlobRoot>  <LogPath>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\Logs</LogPath>  <LoggingEnabled>false</LoggingEnabled></DevelopmentStorage>'

In [40]:
#<pre lang="XML">
#re.findall(r'<pre lang="\d">', indicator, re.)

In [41]:
re.findall(r'<pre lang="(.*?)">', indicator, re.MULTILINE)


[]

In [42]:
line = re.sub(r'<pre lang="(.*?)">', "", indicator)




In [43]:
line



'<?xml version="1.0"?><DevelopmentStorage xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="2009-03-18">  <SQLInstance>(localdb)\\v11.0</SQLInstance>  <PageBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\PageBlobRoot</PageBlobRoot>  <BlockBlobRoot>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\BlockBlobRoot</BlockBlobRoot>  <LogPath>C:\\Users\\Carl\\AppData\\Local\\DevelopmentStorage\\Logs</LogPath>  <LoggingEnabled>false</LoggingEnabled></DevelopmentStorage>'

In [44]:

cat_df['language']




0       26
1       23
2        9
3       10
4       10
5        3
6       17
7       17
8       17
9       17
10       9
11       9
12       9
13       9
14       9
15       7
16       7
17      22
18       9
19      19
20       2
21      19
22      19
23      19
24      19
25      22
26      22
27      19
28      22
29      22
        ..
2001     5
2002     5
2003     5
2004     5
2005     5
2006     5
2007    14
2008    11
2009    23
2010    23
2011    23
2012     6
2013    14
2014    14
2015    14
2016    14
2017    11
2018    11
2019    11
2020    11
2021    12
2022    12
2023    12
2024     6
2025     6
2026     6
2027    11
2028    11
2029    11
2030    11
Name: language, Length: 2031, dtype: int64

In [45]:
all_found[5]


'<pre lang="C#">\n    public class AppIntents_Droid : IAppIntents\n    {\n        public void HandleWebviewUri(string uri)\n        {\n            var appUri = Android.Net.Uri.Parse(uri);\n            var appIntent = new Intent(Intent.ActionView, appUri);\n            Application.Context.StartActivity(appIntent);\n        }\n    }</pre>'

In [25]:
def output_accuracy(actual_y, predicted_y, model_name, train_time, predict_time):
    print('Model Name: ' + model_name)
    print('Train time: ', round(train_time, 2))
    print('Predict time: ', round(predict_time, 2))
    print('Model Accuracy: {:.4f}'.format(accuracy_score(actual_y, predicted_y)))
    #print('Model Precision: {:.4f}'.format(precision_score(actual_y, predicted_y, average=None)))
    print('')
    print(classification_report(actual_y, predicted_y, digits=4))
    print("=========================================================================")
    

In [38]:
def test_models(X_train_input_raw, y_train_input, X_test_input_raw, y_test_input, models_dict):

    return_trained_models = {}
    
    return_vectorizer = FeatureUnion([('count_vect', CountVectorizer()), ('tfidf_vect', TfidfVectorizer())])
    
    X_train = return_vectorizer.fit_transform(X_train_input_raw)
    X_test = return_vectorizer.transform(X_test_input_raw)
    
    for key in models_dict:
        model_name = key
        model = models_dict[key]
        t1 = time.time()
        model.fit(X_train, y_train_input)
        t2 = time.time()
        predicted_y = model.predict(X_test)
        t3 = time.time()
        
        output_accuracy(y_test_input, predicted_y, model_name, t2 - t1, t3 - t2)        
        return_trained_models[model_name] = model
        
    return (return_trained_models, return_vectorizer)


In [39]:
def create_models():
    models = {}
    models['LinearSVC'] = LinearSVC()
    models['LogisticRegression'] = LogisticRegression()
    models['RandomForestClassifier'] = RandomForestClassifier()
    models['DecisionTreeClassifier'] = DecisionTreeClassifier()
    models['MultinomialNB'] = MultinomialNB()
    return models


In [40]:
X_input, y_input = shuffle(df['data'], df['language'], random_state=7)




In [41]:
#X_input, y_input = shuffle(df['', df['language'], random_state=7)

In [42]:
len(all_found)

NameError: name 'all_found' is not defined

In [43]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_input, y_input, test_size=0.7)




In [44]:
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train_raw, y_train, X_test_raw, y_test, models)




Model Name: LinearSVC
Train time:  0.02
Predict time:  0.0
Model Accuracy: 0.6835

             precision    recall  f1-score   support

          0     0.0000    0.0000    0.0000         3
          1     0.0000    0.0000    0.0000         1
          3     0.6713    0.8889    0.7649       108
          4     0.7867    0.7564    0.7712        78
          5     0.7778    0.7000    0.7368        10
          6     0.0000    0.0000    0.0000         3
          7     0.5185    0.5833    0.5490        24
          8     0.0000    0.0000    0.0000        11
          9     0.5517    0.5818    0.5664        55
         10     0.0000    0.0000    0.0000         2
         11     0.0000    0.0000    0.0000         6
         12     1.0000    0.5000    0.6667         2
         13     0.0000    0.0000    0.0000         1
         14     0.0000    0.0000    0.0000         4
         15     1.0000    1.0000    1.0000         2
         16     0.0000    0.0000    0.0000         1
         17    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Model Name: LogisticRegression
Train time:  0.17
Predict time:  0.0
Model Accuracy: 0.7089

             precision    recall  f1-score   support

          0     0.0000    0.0000    0.0000         3
          1     0.0000    0.0000    0.0000         1
          3     0.6644    0.9167    0.7704       108
          4     0.7973    0.7564    0.7763        78
          5     0.7778    0.7000    0.7368        10
          6     0.0000    0.0000    0.0000         3
          7     0.5625    0.7500    0.6429        24
          8     0.0000    0.0000    0.0000        11
          9     0.6111    0.6000    0.6055        55
         10     0.0000    0.0000    0.0000         2
         11     0.0000    0.0000    0.0000         6
         12     1.0000    0.5000    0.6667         2
         13     0.0000    0.0000    0.0000         1
         14     0.0000    0.0000    0.0000         4
         15     1.0000    1.0000    1.0000         2
         16     0.0000    0.0000    0.0000         1
      

  'recall', 'true', average, warn_for)


In [53]:

dummies = pd.get_dummies(cat_df['lang_text'])

