In [1]:
import json
from pathlib import Path
import pandas as pd
from datetime import datetime 
# Widen text columns so commit messages and diffs are readable in previews.
# The fully qualified key is used because abbreviated option names are resolved
# by pattern matching and can become ambiguous when pandas adds new options.
pd.set_option('display.max_colwidth', 300)

**Mining C data from GitHub repositories**

In [None]:
!pip install pydriller
from pydriller.repository_mining import RepositoryMining

since_date = datetime(2020, 11, 28)
to_date = datetime(2020, 11, 29)

repositories = ['https://github.com/s-matyukevich/raspberry-pi-os',
                'https://github.com/ggreer/the_silver_searcher',
                'https://github.com/boostorg/graph',
                'https://github.com/boostorg/asio.git',
                'https://github.com/boostorg/python.git']
               # 'https://github.com/vurtun/nuklear',
                #'https://github.com/pbatard/rufus',
                #'https://github.com/hashcat/hashcat',
                #'https://github.com/glfw/glfw']

messages, code, complexity, lines = [], [], [], []
for repository in repositories:
  for commit in RepositoryMining(repository).traverse_commits():
    for modified_file in commit.modifications:
      if modified_file.methods:
        messages.append(commit.msg)
        code.append(modified_file.diff)
        complexity.append(modified_file.complexity)
        lines.append(modified_file.nloc)

c_data = pd.DataFrame(columns = {'Message': [], 'Code': [], 'Lines': [], 'Complexity': []})
c_data['Message'] = messages
c_data['Code'] = code
c_data['Lines'] = lines
c_data['Complexity'] = complexity

c_data['Code'] = c_data['Code'].str.replace(r' +', ' ')
c_data['Code'] = c_data['Code'].str.replace(r'\t', '')
c_data['Message'] = c_data['Message'].str.replace(r' +', ' ')
 
c_data = c_data[c_data['Message'] != 'Initial commit']
c_data = c_data[c_data['Message'] != 'refactoring']
c_data = c_data[c_data['Message'] != 'Merge pull request']
c_data = c_data[c_data['Message'].str.len() > 20]
c_data = c_data[c_data['Message'].str.len() > 40]
c_data = c_data[c_data['Code'] != ' ']

c_data.drop_duplicates(inplace=True, subset=['Message'])
c_data.dropna(subset=['Code'], inplace=True)
print(c_data.shape)

(4194, 4)


In [None]:
# Preview the first five rows of the cleaned frame.
c_data.head(n=5)

Unnamed: 0,Message,Code,Lines,Complexity
60,lesson1 refactored. First version of lesson1 doc created,"@@ -5,6 +5,7 @@ void kernel_main(void)\n uart_init();\n uart_send_string(""Hello, world!\r\n"");\n \n-while (1)\n+while (1) {\n uart_send(uart_recv());\n+}\n }\n",9,2
163,complete 1.2 and 1.4\n\nThey are the same files. I mingled them. But the qemu only works with single core.,"@@ -0,0 +1,14 @@\n+#include ""uart.h""\n+\n+void kernel_main(unsigned long id) {\n+ if(id == 0) {\n+ uart_init();\n+ }\n+ uart_send_string(""Hello from processor "");\n+ uart_send(id + 48);\n+ uart_send_string("".\r\n"");\n+\n+ while (1) {\n+ uart_send(uart_recv());\n+ }\n+}\n",12,3
184,exercises/rs: Update lesson01/4 to use qemu support for mini UART,"@@ -1,4 +1,4 @@\n-#include ""uart.h""\n+#include ""mini_uart.h""\n \n void kernel_main(void) {\n uart_init();\n",8,2
197,"exercises/rs: add lesson 03 responses\n\nQEMU port does not work yet, I still need to figure out why. I will post\na patch later.","@@ -0,0 +1,48 @@\n+#include ""peripherals/irq.h""\n+#include ""entry.h""\n+#include ""printf.h""\n+#include ""timer.h""\n+#include ""utils.h""\n+\n+const char *entry_error_messages[] = {\n+ ""SYNC_INVALID_EL1t"", ""IRQ_INVALID_EL1t"",\n+ ""FIQ_INVALID_EL1t"", ""ERROR_INVALID_EL1T"",\n+\n+ ""SYNC_INVALID_EL1h"", ""IR...",32,4
215,"exercises/rs: add lesson 4 responses for 1, 2 and 4","@@ -0,0 +1,27 @@\n+#include ""entry.h""\n+#include ""mm.h""\n+#include ""printf.h""\n+#include ""sched.h""\n+\n+int copy_process(unsigned long fn, unsigned long arg) {\n+ preempt_disable();\n+ struct task_struct *p;\n+\n+ p = (struct task_struct *)get_free_page();\n+ if (!p)\n+ return 1;\n+ p->priority ...",24,2


In [None]:
# Persist the cleaned frame to disk as CSV.
c_data.to_csv(path_or_buf='pydriller_5k.csv')

In [5]:
!pip install pydriller
from pydriller.repository_mining import RepositoryMining

repositories = ['https://github.com/boostorg/math',
                'https://github.com/boostorg/log',
                'https://github.com/antirez/rax',
                'https://github.com/Pithikos/C-Thread-Pool',
                'https://github.com/google/flatbuffers',
                'https://github.com/oracle/python-cx_Oracle',
                'https://github.com/oracle/odpi',
                'https://github.com/oracle/dtrace-utils.git']

messages, code, complexity, lines = [], [], [], []
for repository in repositories:
  for commit in RepositoryMining(repository).traverse_commits():
    for modified_file in commit.modifications:
      if modified_file.methods:
        messages.append(commit.msg)
        code.append(modified_file.diff)
        complexity.append(modified_file.complexity)
        lines.append(modified_file.nloc)

c_data = pd.DataFrame(columns = {'Message': [], 'Code': [], 'Lines': [], 'Complexity': []})
c_data['Message'] = messages
c_data['Code'] = code
c_data['Lines'] = lines
c_data['Complexity'] = complexity

c_data['Code'] = c_data['Code'].str.replace(r' +', ' ')
c_data['Code'] = c_data['Code'].str.replace(r'\t', '')
c_data['Code'] = c_data['Code'].str.replace(r'\@@.*?@@', '')
c_data['Message'] = c_data['Message'].str.replace(r' +', ' ')
 
c_data = c_data[c_data['Message'] != 'Initial commit']
c_data = c_data[c_data['Message'] != 'refactoring']
c_data = c_data[c_data['Message'] != 'Merge pull request']
c_data = c_data[c_data['Message'].str.len() > 20]
c_data = c_data[c_data['Message'].str.len() > 40]
c_data = c_data[c_data['Code'] != ' ']

c_data.drop_duplicates(inplace=True, subset=['Message'])
c_data.dropna(subset=['Code'], inplace=True)
print(c_data.shape)

(6197, 4)


In [6]:
# Preview the first five rows of the filtered frame.
c_data.head(n=5)

Unnamed: 0,Message,Code,Lines,Complexity
0,"Special functions, octonions, quaternions by Hubert Holin\n\n\n[SVN r10404]","\n+// boost octonion.hpp header file\n+\n+// (C) Copyright Hubert Holin 2001. Permission to copy, use, modify, sell and\n+// distribute this software is granted provided this copyright notice appears\n+// in all copies. This software is provided ""as is"" without express or implied\n+// warranty, ...",2089,386
5,"Special functions, octonions, quaternions by Hubert Holin\n\n\n[SVN r10405]","\n+// test file for octonion.hpp\n+\n+// (C) Copyright Hubert Holin 2001. Permission to copy, use, modify, sell and\n+// distribute this software is granted provided this copyright notice appears\n+// in all copies. This software is provided ""as is"" without express or implied\n+// warranty, and ...",439,31
11,fix various compile problems\n\n\n[SVN r10484],"int test_main(int, char *[])\n numeric_limits<float>::epsilon());\n \n BOOST_CRITICAL_TEST(abs(\n- exp(::boost::quaternion<float>(0,4*::std::atan(1),0,0))+\n+ exp(::boost::quaternion<float>(0,4*::std::atan(1.0),0,0))+\n static_cast<float>(1)) <=\n numeric_limits<float>::epsilon());\n \n BOOST_C...",538,5
12,moved to namespace boost::math\nformatting changes\n\n\n[SVN r10568],"\n-// boost octonion.hpp header file\n+// boost octonion.hpp header file\n \n // (C) Copyright Hubert Holin 2001. Permission to copy, use, modify, sell and\n // distribute this software is granted provided this copyright notice appears\n\n \n namespace boost\n {\n-#define BOOST_OCTONION_ACCESSOR...",2092,386
17,moved to namespace boost::math\nformatting changed\n\n\n[SVN r10569],\n \n // explicit (if ludicrous) instanciation\n #ifndef __GNUC__\n-template class ::boost::octonion<int>;\n+template class ::boost::math::octonion<int>;\n #else\n // gcc 3.0 doesn't like the absolutely-qualified namespace\n-template class boost::octonion<int>;\n+template class boost::math::octo...,367,13


In [7]:
# Persist the filtered frame to disk as CSV.
c_data.to_csv(path_or_buf='pydriller_6k_filtered.csv')