In [2]:
from github import Github
import pandas as pd

In [3]:
%run settings.ipynb

token = %env token

github = Github("your token")

env: token='MY_TOKEN'


## Analyzing testing tools used by projects in the dataset

1. Analyze how many projects have package management files.
2. Retrieve package management files.
3. Analyze if there is any testing tool declared in the package management files.
4. Verify which testing tool is being used 


In [4]:
tools = pd.read_csv('dataset/de_para_tools.csv', sep=';')

tools['language'].unique()


array(['JavaScript', 'Python', 'Java', 'Go', 'Ruby', 'PHP', 'TypeScript',
       'C#'], dtype=object)

In [5]:
manager_file = {}

for t in tools['language'].unique():
    manager_file[t] = {"yes": 0, "total": 0}

manager_file

{'JavaScript': {'yes': 0, 'total': 0},
 'Python': {'yes': 0, 'total': 0},
 'Java': {'yes': 0, 'total': 0},
 'Go': {'yes': 0, 'total': 0},
 'Ruby': {'yes': 0, 'total': 0},
 'PHP': {'yes': 0, 'total': 0},
 'TypeScript': {'yes': 0, 'total': 0},
 'C#': {'yes': 0, 'total': 0}}

In [6]:
manager_package = {}

for t in tools['language'].unique():
    manager_package[t] = {}
    for p in tools[tools['language'] == t]['test_package']:
        manager_package[t][p] = 0

manager_package

{'JavaScript': {'chai': 0,
  'enzyme': 0,
  'enzyme-adapter-react-16': 0,
  'karma': 0,
  'selenium-webdriver': 0,
  'jest': 0,
  'supertest': 0,
  'enzyme-adapter-utils': 0,
  'protractor': 0,
  'ts-jest': 0,
  'nyc': 0,
  'jest-cli': 0,
  'cypress': 0,
  'ava': 0,
  'json-server': 0,
  'react-testing-library': 0,
  'cucumber': 0,
  'jsdom': 0,
  'Electron': 0,
  'Istanbul': 0,
  'Unexpected': 0,
  'Sinon.JS': 0,
  'testdouble.js': 0,
  'mocha': 0,
  'tape': 0,
  'selenium': 0,
  'Protractor': 0,
  'Nightwatch': 0,
  'Appium': 0,
  'TestCafe': 0,
  'Cypress': 0,
  'Puppeteer': 0,
  'PhantomJS': 0,
  'Percy': 0,
  'Happo': 0,
  'Gemini': 0},
 'Python': {'unittest': 0,
  'pytest': 0,
  'hypothesis': 0,
  'tox': 0,
  'mock': 0,
  'nose': 0,
  'doctest': 0},
 'Java': {'junit': 0,
  'testNG': 0,
  'EasyMock': 0,
  'mockito-all': 0,
  'mockito-core': 0,
  'scalatest': 0,
  'spring-test': 0,
  'hamcrest-all': 0},
 'Go': {'testing': 0,
  'gomega': 0,
  'ginkgo': 0,
  'assert': 0,
  'gucumber'

### TO-DO: Dataset update.

Make a filter to use only projects using defined programming languages according to the list

In [7]:
repo_full = pd.read_csv('dataset/repositories.csv', sep=';')
#repo_full.drop("Unnamed: 0", axis=1, inplace=True) 
repo_full.head()

Unnamed: 0,full_name,git_url,id,language
0,b4winckler/macvim,git://github.com/b4winckler/macvim.git,692798,C
1,php/php-src,git://github.com/php/php-src.git,1903522,C
2,antirez/redis,git://github.com/antirez/redis.git,156018,C
3,libgit2/libgit2,git://github.com/libgit2/libgit2.git,901662,C
4,torvalds/linux,git://github.com/torvalds/linux.git,2325298,C


### Total amount of projects in a given language

In [8]:
for index, row in repo_full.iterrows():
    if row['language'] in manager_file:
        manager_file[row['language']]["total"]+=1

manager_file

{'JavaScript': {'yes': 0, 'total': 81},
 'Python': {'yes': 0, 'total': 52},
 'Java': {'yes': 0, 'total': 38},
 'Go': {'yes': 0, 'total': 33},
 'Ruby': {'yes': 0, 'total': 23},
 'PHP': {'yes': 0, 'total': 21},
 'TypeScript': {'yes': 0, 'total': 15},
 'C#': {'yes': 0, 'total': 13}}

#### Package management file

In [9]:
tools[tools['language'] == 'Java']

Unnamed: 0,language,test_package,file_packages_installed
43,Java,junit,pom.xml
44,Java,testNG,pom.xml
45,Java,EasyMock,pom.xml
46,Java,mockito-all,pom.xml
47,Java,mockito-core,pom.xml
48,Java,scalatest,pom.xml
49,Java,spring-test,pom.xml
50,Java,hamcrest-all,pom.xml


### Total amount of projects that have a package management file.

In [10]:
import requests
import csv

resp = []
for index, row in repo_full.iterrows():
    packages = tools[tools['language'] == row['language']]
    arq = packages['file_packages_installed'].unique()
    if(row['language'] == 'Java'):
        if(len(arq) > 0):
            #Get a repository by its name
            repo = github.get_repo(row['full_name'])
            #print(row['full_name'])
            try:
                #Get a specific file
                content = repo.get_contents(str(arq[0]))
                print(row['full_name'])
                #Store how many projects have package management file
                manager_file[row['language']]["yes"]+=1

                #Get file content through API request
                #print('https://raw.githubusercontent.com/' + row['full_name'] + '/master/' +  content.path)
                response = requests.get('https://raw.githubusercontent.com/' + row['full_name'] + '/master/' +  content.path)
                response.encoding = 'utf-8'
                texto = response.text

                #Verify if it uses any testing package from the mapped tools
                is_test = 0
                for p in packages['test_package']:
                    if(texto.count(p)>0):
                        is_test = 1
                        manager_package[row['language']][p]+=1
                        r =  pd.DataFrame([{
                            'full_name': row['full_name'], 
                            'language': row['language'], 
                            'package': p
                        }])
                        resp.append(r)
                if(not is_test):
                    r =  pd.DataFrame([{
                        'full_name': row['full_name'], 
                        'language': row['language'], 
                        'package': None
                    }])
                resp.append(r)

            except:
                #print(row['full_name'], row['language'], arq)
                pass


#manager_file
#manager_package
resp

neo4j/neo4j
jenkinsci/jenkins
deeplearning4j/deeplearning4j
apache/hadoop
spring-projects/spring-boot
apache/flink
apache/incubator-shardingsphere
prestodb/presto
stanfordnlp/CoreNLP
libgdx/libgdx
dbeaver/dbeaver
eugenp/tutorials
naver/pinpoint
apache/storm
netty/netty
apache/incubator-druid
eclipse/che
dianping/cat
antlr/antlr4
b3log/symphony
alibaba/druid


[     full_name language package
 0  neo4j/neo4j     Java    None,            full_name language package
 0  jenkinsci/jenkins     Java   junit,            full_name language       package
 0  jenkinsci/jenkins     Java  mockito-core,            full_name language       package
 0  jenkinsci/jenkins     Java  mockito-core,                        full_name language package
 0  deeplearning4j/deeplearning4j     Java   junit,                        full_name language    package
 0  deeplearning4j/deeplearning4j     Java  scalatest,                        full_name language    package
 0  deeplearning4j/deeplearning4j     Java  scalatest,        full_name language package
 0  apache/hadoop     Java    None,                      full_name language package
 0  spring-projects/spring-boot     Java    None,       full_name language package
 0  apache/flink     Java   junit,       full_name language       package
 0  apache/flink     Java  mockito-core,       full_name language    package
 0  a

In [11]:
package_df = pd.concat(resp, ignore_index=True)

In [12]:
package_df.shape

(50, 3)

In [13]:
package_df = package_df.drop_duplicates(subset=['full_name', 'language', 'package'])

In [14]:
package_df.shape

(36, 3)

In [25]:
package_df.to_csv('dataset/miner_package.csv', sep=';', encoding = 'utf-8')

In [98]:
manager_package

{'C#': {'CoreClr': 0, 'NUnit': 0, 'NUnit3TestAdapter': 0},
 'Go': {'Testify': 0,
  'assert': 48,
  'biff': 0,
  'check': 60,
  'convey': 48,
  'e2e': 24,
  'ginkgo': 60,
  'go-check': 16,
  'go-testing-interface': 36,
  'gocheck': 4,
  'gomega': 64,
  'gucumber': 0,
  'require': 0,
  'spew': 65,
  'suite': 4,
  'testing': 55,
  'testutil': 4},
 'Java': {'EasyMock': 0,
  'hamcrest-all': 12,
  'junit': 70,
  'mockito-all': 4,
  'mockito-core': 28,
  'scalatest': 8,
  'spring-test': 8,
  'testNG': 0},
 'JavaScript': {'Appium': 0,
  'Cypress': 0,
  'Electron': 0,
  'Gemini': 4,
  'Happo': 0,
  'Istanbul': 0,
  'Nightwatch': 0,
  'Percy': 0,
  'PhantomJS': 0,
  'Protractor': 0,
  'Puppeteer': 0,
  'Sinon.JS': 0,
  'TestCafe': 0,
  'Unexpected': 0,
  'ava': 82,
  'chai': 105,
  'cucumber': 0,
  'cypress': 4,
  'enzyme': 48,
  'enzyme-adapter-react-16': 36,
  'enzyme-adapter-utils': 4,
  'jest': 92,
  'jest-cli': 44,
  'jsdom': 59,
  'json-server': 0,
  'karma': 81,
  'mocha': 126,
  'nyc': 4

In [13]:
manager_file

{'C#': {'total': 13, 'yes': 7},
 'Go': {'total': 33, 'yes': 16},
 'Java': {'total': 38, 'yes': 21},
 'JavaScript': {'total': 81, 'yes': 73},
 'PHP': {'total': 21, 'yes': 19},
 'Python': {'total': 52, 'yes': 16},
 'Ruby': {'total': 23, 'yes': 17},
 'TypeScript': {'total': 15, 'yes': 15}}

In [99]:
import requests
response = requests.get('https://raw.githubusercontent.com/octokit/octokit.rb/master/Gemfile')
response.encoding = 'utf-8'
texto = response.text
print(texto)
print(texto.count("gemspec"))
#file_response = open("dataset/manager_file/teste.txt","w+")
#file_response.write(texto, "w+")
#file_response.close()


KeyboardInterrupt: 

IOError: [Errno 2] File miner_test.csv does not exist: 'miner_test.csv'