In [7]:
import pandas as pd
from transformers import pipeline

ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
from markdown import Markdown
from io import StringIO
from bs4 import BeautifulSoup

def unmark_element(element, stream=None):
    if stream is None:
        stream = StringIO()
    if element.text:
        stream.write(element.text)
    for sub in element:
        unmark_element(sub, stream)
    if element.tail:
        stream.write(element.tail)
    return stream.getvalue()


# patching Markdown
Markdown.output_formats["plain"] = unmark_element
__md = Markdown(output_format="plain")
__md.stripTopLevelTags = False


def unmark(text):
    return __md.convert(text)

In [24]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text()
    return clean_text

In [25]:
def remove_excessive_newlines(text):
    cleaned_text = re.sub(r'\n+', '\n', text)
    cleaned_text = cleaned_text.strip()
    return cleaned_text

In [26]:
def summarize_text(text, max_length=130, min_length=30):
    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
    try:
        summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    except Exception as e:
        print("Cannot Summarize")
        return "No summary"
    
    return summary[0]['summary_text']

In [27]:
def summarize_helper(readme_file):
    with open(readme_file, "r", encoding='utf-8') as file:
        readme_content = file.read()

    plain_text_content = markdown_to_text(readme_content)
    
    summary = summarize_text(plain_text_content)
    
    return {'original_text': plain_text_content, 'summary': summary}

In [41]:
import pandas as pd

def tag_readme(readme_file):
    with open(readme_file, "r", encoding='utf-8') as file:
        readme_content = file.read()
    
    plain_text_content = remove_excessive_newlines(readme_content)
    plain_text_content = unmark(plain_text_content)
    plain_text_content = remove_html_tags(plain_text_content)

    print(plain_text_content)
    
    return plain_text_content
   
#     ner_results = ner_model(plain_text_content)

#     entities = {}
#     for result in ner_results:
#         entity = result['entity']
#         word = result['word']
#         if entity not in entities:
#             entities[entity] = {}
#         if word not in entities[entity]:
#             entities[entity][word] = 0
#         entities[entity][word] += 1

#     entity_totals = {entity: sum(words.values()) for entity, words in entities.items()}

#     detailed_data = []
#     for entity, words in entities.items():
#         for word, count in words.items():
#             detailed_data.append({'Entity': entity, 'Word': word, 'Word Count': count, 'Total Count': entity_totals[entity]})

  
#     detailed_df = pd.DataFrame(detailed_data)

#     total_data = [{'Entity': entity, 'Total Count': total_count} for entity, total_count in entity_totals.items()]

#     total_df = pd.DataFrame(total_data)
#     return [detailed_df, total_df]

In [42]:
import os
import shutil
import re

repos_path = os.path.join('data', 'repos', 'simulation')
analyzed_repos_path = os.path.join('data', 'analyzed_repos', 'simulation')
dj_file_path = os.path.join('apps', 'DJ', 'DesigniteJava.jar')


count = 0
for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    readme_file = os.path.join(folder_path, 'README.md')
    
    if os.path.exists(readme_file):
        dfs = tag_readme(readme_file)
        
        break
        df = dfs[0]
        ndf = dfs[1]
        df.to_csv(os.path.join(folder_path, "entities.csv"), index=False)
        ndf.to_csv(os.path.join(folder_path, "total_entities.csv"), index=False)
        print("entities written to ", folder_path)
    else:
        try:
            shutil.rmtree(folder_path)
            count +=1
            print(f"Folder '{folder_path}' has been deleted")
        except OSError as e:
            print(f"Error: {e}. Folder '{folder_path}' is not empty or cannot be deleted")

print("Total folders deleted", count)

[![slack](https://img.shields.io/badge/slack-aerie-brightgreen?logo=slack)](https://join.slack.com/t/nasa-ammos/shared_invite/zt-1mlgmk5c2-MgqVSyKzVRUWrXy87FNqPw)
<br>
<div align="center">
  <img alt="Aerie" height="85" src="docs/img/aerie-wordmark-with-background.svg">
</div>
<br>
Aerie is a software framework for modeling spacecraft. Its main features include:
- A Java-based mission modeling library
- A discrete-event simulator
- An embedded TypeScript DSL for defining and executing scheduling goals
- An embedded TypeScript DSL for defining and executing constraints
- An embedded TypeScript DSL for defining and executing activity command expansions
- An embedded TypeScript DSL for defining sequences
- A [GraphQL API](https://nasa-ammos.github.io/aerie-docs/api/introduction)
- A web-based [client application][ui-repo]
## Getting Started
To get started using Aerie for the first time please do our [fast track tutorial][fast-track] on our documentation website.
## Need Help?
- Join us on

In [18]:
import os
import shutil 

repos_path = os.path.join('data', 'repos', 'simulation')

summaries = []
for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    readme_file = os.path.join(folder_path, 'README.md')
    
    if os.path.exists(readme_file):
        summary = summarize_helper(readme_file)
        summaries.append(summary)
        print("The readme is summarized for ", folder_path)
    else:
        try:
            shutil.rmtree(folder_path)
            count +=1
            print(f"Folder '{folder_path}' has been deleted")
        except OSError as e:
            print(f"Error: {e}. Folder '{folder_path}' is not empty or cannot be deleted")
    

print("Total folders deleted", count)
df = pd.df = pd.DataFrame(summaries)
df.to_csv('summaries_sim.csv')
df

The readme is summarized for  data\repos\simulation\adf-sample-agent-java
The readme is summarized for  data\repos\simulation\aerie
The readme is summarized for  data\repos\simulation\AgentWorkbench


Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\amod


Token indices sequence length is longer than the specified maximum sequence length for this model (2039 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\amodeus
The readme is summarized for  data\repos\simulation\annotation-simulator
The readme is summarized for  data\repos\simulation\AutoInteraction-Library
The readme is summarized for  data\repos\simulation\Black-Scholes-Option-Pricing-Model


Token indices sequence length is longer than the specified maximum sequence length for this model (3211 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\BoomChess-Android


Token indices sequence length is longer than the specified maximum sequence length for this model (1200 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\cas


Token indices sequence length is longer than the specified maximum sequence length for this model (4070 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\cloudsimplus


Token indices sequence length is longer than the specified maximum sequence length for this model (1598 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\cloudsimplus-automation
The readme is summarized for  data\repos\simulation\cloudsimplus-examples


Token indices sequence length is longer than the specified maximum sequence length for this model (3521 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\cloudsimsdn


Token indices sequence length is longer than the specified maximum sequence length for this model (3922 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\computer_kurzweil


Token indices sequence length is longer than the specified maximum sequence length for this model (18523 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\DesSim


Token indices sequence length is longer than the specified maximum sequence length for this model (1601 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\ecosim


Your max_length is set to 130, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


The readme is summarized for  data\repos\simulation\end-game
The readme is summarized for  data\repos\simulation\eqasim-java


Token indices sequence length is longer than the specified maximum sequence length for this model (1640 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\EVLib


Token indices sequence length is longer than the specified maximum sequence length for this model (2169 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\Evolving-Protozoa


Token indices sequence length is longer than the specified maximum sequence length for this model (2270 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\finmath-lib


Token indices sequence length is longer than the specified maximum sequence length for this model (1389 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\FlockingSimulation
The readme is summarized for  data\repos\simulation\gama


Token indices sequence length is longer than the specified maximum sequence length for this model (1223 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\gips


Token indices sequence length is longer than the specified maximum sequence length for this model (1850 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\iflye
The readme is summarized for  data\repos\simulation\imp_city
The readme is summarized for  data\repos\simulation\JSimpleSim
The readme is summarized for  data\repos\simulation\Logisim


Token indices sequence length is longer than the specified maximum sequence length for this model (5217 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\mars-sim


Token indices sequence length is longer than the specified maximum sequence length for this model (2392 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\MATSim-UAM
The readme is summarized for  data\repos\simulation\microtrafficsim


Token indices sequence length is longer than the specified maximum sequence length for this model (4869 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\Mirage
The readme is summarized for  data\repos\simulation\MiSim


Token indices sequence length is longer than the specified maximum sequence length for this model (1221 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\mosaic


Token indices sequence length is longer than the specified maximum sequence length for this model (6068 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\multiphysics
The readme is summarized for  data\repos\simulation\NeuGen


Token indices sequence length is longer than the specified maximum sequence length for this model (1256 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\ORBrake
The readme is summarized for  data\repos\simulation\PAA-SimTool


Token indices sequence length is longer than the specified maximum sequence length for this model (2273 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\Pandemic-Simulator
The readme is summarized for  data\repos\simulation\pedroEngine
The readme is summarized for  data\repos\simulation\plg
The readme is summarized for  data\repos\simulation\PowerSystemDataModel


Token indices sequence length is longer than the specified maximum sequence length for this model (6016 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\PureEdgeSim


Token indices sequence length is longer than the specified maximum sequence length for this model (12878 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\QSPcc


Token indices sequence length is longer than the specified maximum sequence length for this model (1341 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\react-native-ble-peripheral


Token indices sequence length is longer than the specified maximum sequence length for this model (6116 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\riscj-computer-mod
The readme is summarized for  data\repos\simulation\Robot-Overlord-App
The readme is summarized for  data\repos\simulation\SASS
The readme is summarized for  data\repos\simulation\ShapeOfThingsThatWere
The readme is summarized for  data\repos\simulation\simulated-evolution


Token indices sequence length is longer than the specified maximum sequence length for this model (2965 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\simulation-construction-set-2


Token indices sequence length is longer than the specified maximum sequence length for this model (1092 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\sinalgo
The readme is summarized for  data\repos\simulation\singa
The readme is summarized for  data\repos\simulation\skiverse


Token indices sequence length is longer than the specified maximum sequence length for this model (2943 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\smockin


Your max_length is set to 130, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


The readme is summarized for  data\repos\simulation\supermarket-simulator


Token indices sequence length is longer than the specified maximum sequence length for this model (1292 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\synthea
The readme is summarized for  data\repos\simulation\tko-electronics-sim
The readme is summarized for  data\repos\simulation\TraficCERN


Token indices sequence length is longer than the specified maximum sequence length for this model (1169 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\simulation\TraJ
The readme is summarized for  data\repos\simulation\tumor-growth-simulation
The readme is summarized for  data\repos\simulation\universe-maker-app


Your max_length is set to 130, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


The readme is summarized for  data\repos\simulation\varsim
The readme is summarized for  data\repos\simulation\workcraft
Total folders deleted 0


Unnamed: 0,original_text,summary
0,adf-sample-agent-java Agent Development Framew...,The adf-sample-agent-java is a sample team imp...
1,\n\n\n\n\n\nAerie is a software framework for ...,Aerie is a software framework for modeling spa...
2,Agent.Workbench\nThe documentaton is also avai...,Agent.Workbench is an Eclipse / OSGI based app...
3,amodeus.amod \nThis repository allows to run a...,No summary
4,amodeus.amodeus \nAutonomous mobility-on-deman...,No summary
...,...,...
60,\n\n\n\nTraJ\nJava library for diffusion traj...,No summary
61,Tumor growth simulation\nTumor growth simulati...,Tumor growth simulation using 2D cellular auto...
62,Universe Maker | Planetary simulation game\n\n...,Universe Maker is an n-body simulation of a pl...
63,VarSim: A high-fidelity simulation validation ...,Varsim is a high-fidelity simulation validatio...


In [19]:
import os
import shutil 

repos_path = os.path.join('data', 'repos', 'traditional')

summaries = []
for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    readme_file = os.path.join(folder_path, 'README.md')
    
    if os.path.exists(readme_file):
        summary = summarize_helper(readme_file)
        summaries.append(summary)
        print("The readme is summarized for ", folder_path)
    else:
        try:
            shutil.rmtree(folder_path)
            count +=1
            print(f"Folder '{folder_path}' has been deleted")
        except OSError as e:
            print(f"Error: {e}. Folder '{folder_path}' is not empty or cannot be deleted")
    

print("Total folders deleted", count)
df = pd.df = pd.DataFrame(summaries)
df.to_csv('summaries_trad.csv')
df

The readme is summarized for  data\repos\traditional\alchemy


Token indices sequence length is longer than the specified maximum sequence length for this model (3359 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\android-http-server
The readme is summarized for  data\repos\traditional\Android-Web-Server


Your max_length is set to 130, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


The readme is summarized for  data\repos\traditional\AndroidWebAutoLaunch
The readme is summarized for  data\repos\traditional\AnywhereAlertConfirm
The readme is summarized for  data\repos\traditional\APKMirror


Token indices sequence length is longer than the specified maximum sequence length for this model (1149 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\app


Token indices sequence length is longer than the specified maximum sequence length for this model (1134 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\Arkhota


Token indices sequence length is longer than the specified maximum sequence length for this model (4193 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\automotion-java
The readme is summarized for  data\repos\traditional\avaje-jex
The readme is summarized for  data\repos\traditional\botwall4j


Token indices sequence length is longer than the specified maximum sequence length for this model (7871 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\boyka-framework
The readme is summarized for  data\repos\traditional\bring


Your max_length is set to 130, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)


The readme is summarized for  data\repos\traditional\BUbiNG
The readme is summarized for  data\repos\traditional\buildvu-microservice-example


Token indices sequence length is longer than the specified maximum sequence length for this model (1415 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\capacitor-firebase
The readme is summarized for  data\repos\traditional\CatLogging


Token indices sequence length is longer than the specified maximum sequence length for this model (2509 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\chaos-server


Token indices sequence length is longer than the specified maximum sequence length for this model (3922 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\dataviewer
The readme is summarized for  data\repos\traditional\dicom-web-pacs


Token indices sequence length is longer than the specified maximum sequence length for this model (2062 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\DiscoveryDesktop


Token indices sequence length is longer than the specified maximum sequence length for this model (47553 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\DiskMirror
The readme is summarized for  data\repos\traditional\dotenv


Token indices sequence length is longer than the specified maximum sequence length for this model (3337 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\e-commerce
The readme is summarized for  data\repos\traditional\espresso


Token indices sequence length is longer than the specified maximum sequence length for this model (1715 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\fili


Token indices sequence length is longer than the specified maximum sequence length for this model (1394 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\firefly


Token indices sequence length is longer than the specified maximum sequence length for this model (1136 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\Gamp-Forest-site
The readme is summarized for  data\repos\traditional\gcn-intellij-tooling
The readme is summarized for  data\repos\traditional\gwt-ol
The readme is summarized for  data\repos\traditional\Hacktoberfest-2022


Token indices sequence length is longer than the specified maximum sequence length for this model (1082 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\HtmlNative


Token indices sequence length is longer than the specified maximum sequence length for this model (1582 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\java-reverse-tcp
The readme is summarized for  data\repos\traditional\java-web-exercises


Token indices sequence length is longer than the specified maximum sequence length for this model (4165 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\javafx-dataviewer-wrapper
The readme is summarized for  data\repos\traditional\JCrawl


Token indices sequence length is longer than the specified maximum sequence length for this model (1412 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\Jerrydog


Token indices sequence length is longer than the specified maximum sequence length for this model (1566 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\JumpGo-for-Android
The readme is summarized for  data\repos\traditional\Learning_platform


Token indices sequence length is longer than the specified maximum sequence length for this model (9456 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\Liberg


Token indices sequence length is longer than the specified maximum sequence length for this model (2638 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\linwin-sploit


Token indices sequence length is longer than the specified maximum sequence length for this model (1487 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\log-requests-to-sqlite


Your max_length is set to 130, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


The readme is summarized for  data\repos\traditional\log_generator


Token indices sequence length is longer than the specified maximum sequence length for this model (6984 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\LoopAuth


Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


The readme is summarized for  data\repos\traditional\Lucid-Browser
The readme is summarized for  data\repos\traditional\magic-starter
The readme is summarized for  data\repos\traditional\mailspider
The readme is summarized for  data\repos\traditional\metl
The readme is summarized for  data\repos\traditional\modern-app-dev


Token indices sequence length is longer than the specified maximum sequence length for this model (8427 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\molecule


Token indices sequence length is longer than the specified maximum sequence length for this model (1139 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\MotoGP-API
The readme is summarized for  data\repos\traditional\ng-objects


Your max_length is set to 130, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


The readme is summarized for  data\repos\traditional\OpenChess
The readme is summarized for  data\repos\traditional\OpenKettleWebUI
The readme is summarized for  data\repos\traditional\openlibs.easywebframework


Token indices sequence length is longer than the specified maximum sequence length for this model (4308 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\openLinwin


Token indices sequence length is longer than the specified maximum sequence length for this model (2524 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\parsec
The readme is summarized for  data\repos\traditional\parsec-libraries


Token indices sequence length is longer than the specified maximum sequence length for this model (5831 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\ph-oton


Token indices sequence length is longer than the specified maximum sequence length for this model (1146 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\qaf
The readme is summarized for  data\repos\traditional\quarkus-primefaces
The readme is summarized for  data\repos\traditional\quarkus-quinoa
The readme is summarized for  data\repos\traditional\quarkus-web-bundler
The readme is summarized for  data\repos\traditional\react-native-social-fab


Token indices sequence length is longer than the specified maximum sequence length for this model (3556 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\redant


Token indices sequence length is longer than the specified maximum sequence length for this model (1307 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\reqn
The readme is summarized for  data\repos\traditional\retail-banking


Token indices sequence length is longer than the specified maximum sequence length for this model (2329 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\rife2


Token indices sequence length is longer than the specified maximum sequence length for this model (3614 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\riptide
The readme is summarized for  data\repos\traditional\RiseLoader
The readme is summarized for  data\repos\traditional\routekit
The readme is summarized for  data\repos\traditional\rtsp-websocket-server


Your max_length is set to 130, but your input_length is only 64. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


The readme is summarized for  data\repos\traditional\seed


Token indices sequence length is longer than the specified maximum sequence length for this model (1844 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\Setool-Main


Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\sfe4j


Token indices sequence length is longer than the specified maximum sequence length for this model (2126 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\shoulder-framework
The readme is summarized for  data\repos\traditional\simplejmx
The readme is summarized for  data\repos\traditional\skeleton-starter-flow
The readme is summarized for  data\repos\traditional\skeleton-starter-flow-spring


Token indices sequence length is longer than the specified maximum sequence length for this model (1594 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\SORMAS-Project


Token indices sequence length is longer than the specified maximum sequence length for this model (1442 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\spring-s3-properties-loader


Your max_length is set to 130, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)


The readme is summarized for  data\repos\traditional\swim-java-bindings
The readme is summarized for  data\repos\traditional\teachingCodeRepo


Your max_length is set to 130, but your input_length is only 128. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)


The readme is summarized for  data\repos\traditional\urlpattern
The readme is summarized for  data\repos\traditional\UserCenter
The readme is summarized for  data\repos\traditional\venom


Token indices sequence length is longer than the specified maximum sequence length for this model (7283 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\vertx-ddns


Token indices sequence length is longer than the specified maximum sequence length for this model (1089 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\vtm


Token indices sequence length is longer than the specified maximum sequence length for this model (2049 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\w2j-cli
The readme is summarized for  data\repos\traditional\web-budget


Token indices sequence length is longer than the specified maximum sequence length for this model (4032 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\Web-DICOM-Management


Token indices sequence length is longer than the specified maximum sequence length for this model (4304 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\WebAndAppUITesting
The readme is summarized for  data\repos\traditional\webby
The readme is summarized for  data\repos\traditional\webinloop
The readme is summarized for  data\repos\traditional\WeBlocker
The readme is summarized for  data\repos\traditional\webrest-starter


Token indices sequence length is longer than the specified maximum sequence length for this model (4493 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\WebSocketaaS


Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\wicket-iziToast
The readme is summarized for  data\repos\traditional\Wicket-tutorial-examples


Token indices sequence length is longer than the specified maximum sequence length for this model (3764 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\wrappedelements


Token indices sequence length is longer than the specified maximum sequence length for this model (3230 > 1024). Running this sequence through the model will result in indexing errors


Cannot Summarize
The readme is summarized for  data\repos\traditional\YangMVC
Total folders deleted 0


Unnamed: 0,original_text,summary
0,\nAlchemy:\n\nalchemyÊòØ‰ª•Jhipster‰∏∫ËÑöÊâãÊû∂ÂºÄÂèëÁöÑwebÁ≥ªÁªüÔºåËÉΩÂ§ü...,Alchemy is a web-based web app. It uses the Fl...
1,Android HTTP Server\nSmall but powerful multit...,No summary
2,Android Web Server (FireFly)\nSimple and Small...,Android Web Server (FireFly) is a TCP/IP Web S...
3,Auto Web Launcher\nA simple App to launch webp...,A simple App to launch webpages at full screen...
4,React Native Anywhere Alert Confirm\nOverview\...,React Native Anywhere Alert Confirm provides a...
...,...,...
96,Introduction\nThe project is the backend solut...,No summary
97,wicket-iziToast\nApache Wicket utilities for u...,No summary
98,Wicket-tutorial-examples\nThis repository cont...,Wicket-tutorial-examples contains the example ...
99,wrappedelements\n\n\n\nA convenient framework ...,No summary


In [5]:
import os
import shutil 

repos_path = os.path.join('data', 'repos', 'traditional')
analyzed_repos_path = os.path.join('data', 'analyzed_repos', 'traditional')
dj_file_path = os.path.join('apps', 'DJ', 'DesigniteJava.jar')


count = 0
for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    readme_file = os.path.join(folder_path, 'README.md')
    
    if os.path.exists(readme_file):
        df = tag_readme(readme_file)
        df.to_csv(os.path.join(folder_path, "entities.csv"), index=False)
        print("entities written to ", folder_path)
    else:
        try:
            shutil.rmtree(folder_path)
            count +=1
            print(f"Folder '{folder_path}' has been deleted")
        except OSError as e:
            print(f"Error: {e}. Folder '{folder_path}' is not empty or cannot be deleted")

print("Total folders deleted", count)

AttributeError: 'list' object has no attribute 'to_csv'

In [49]:
import os
import shutil 

repos_path = os.path.join('data', 'repos', 'traditional')
cat_path = os.path.join('data', 'categories', 'traditional')

for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    entity_path = os.path.join(folder_path, 'entities.csv')
    
    try:
        entity = pd.read_csv(entity_path)
    except Exception as e:
        print("An error occurred:", e, " at ", folder_path)
        continue

    max_row_index = df['Total Count'].idxmax()
    max_row = df.iloc[max_row_index]
    
    entity_folder = os.path.join(cat_path, max_row['Entity'])
    
    if not os.path.exists(entity_folder):
        os.mkdir(entity_folder)
    
    new_folder_path = os.path.join(entity_folder, folder)
    
    if os.path.exists(new_folder_path):
        print(new_folder_path, " Already present")
        continue
    
    try:
        shutil.copytree(folder_path, new_folder_path)
    except OSError as e:
        print(f"Error: {e}. Folder '{folder_path}' cannot be copied")

        
    print("Copied ", folder_path, "to ", new_folder_path)
    

data\categories\traditional\I-ORG\alchemy  Already present
data\categories\traditional\I-ORG\android-http-server  Already present
data\categories\traditional\I-ORG\Android-Web-Server  Already present
data\categories\traditional\I-ORG\AndroidWebAutoLaunch  Already present
data\categories\traditional\I-ORG\AnywhereAlertConfirm  Already present
data\categories\traditional\I-ORG\APKMirror  Already present
data\categories\traditional\I-ORG\app  Already present
data\categories\traditional\I-ORG\Arkhota  Already present
data\categories\traditional\I-ORG\automotion-java  Already present
data\categories\traditional\I-ORG\avaje-jex  Already present
data\categories\traditional\I-ORG\botwall4j  Already present
data\categories\traditional\I-ORG\boyka-framework  Already present
data\categories\traditional\I-ORG\bring  Already present
data\categories\traditional\I-ORG\BUbiNG  Already present
data\categories\traditional\I-ORG\buildvu-microservice-example  Already present
data\categories\traditional\I-

Copied  data\repos\traditional\simplejmx to  data\categories\traditional\I-ORG\simplejmx
Copied  data\repos\traditional\skeleton-starter-flow to  data\categories\traditional\I-ORG\skeleton-starter-flow
Copied  data\repos\traditional\skeleton-starter-flow-spring to  data\categories\traditional\I-ORG\skeleton-starter-flow-spring
Copied  data\repos\traditional\SORMAS-Project to  data\categories\traditional\I-ORG\SORMAS-Project
Copied  data\repos\traditional\spring-s3-properties-loader to  data\categories\traditional\I-ORG\spring-s3-properties-loader
Copied  data\repos\traditional\swim-java-bindings to  data\categories\traditional\I-ORG\swim-java-bindings
Copied  data\repos\traditional\teachingCodeRepo to  data\categories\traditional\I-ORG\teachingCodeRepo
Copied  data\repos\traditional\urlpattern to  data\categories\traditional\I-ORG\urlpattern
An error occurred: No columns to parse from file  at  data\repos\traditional\UserCenter
Copied  data\repos\traditional\venom to  data\categories\t

In [50]:
import os
import shutil 

repos_path = os.path.join('data', 'repos', 'simulation')
cat_path = os.path.join('data', 'categories', 'simulation')

for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    entity_path = os.path.join(folder_path, 'entities.csv')
    
    try:
        entity = pd.read_csv(entity_path)
    except Exception as e:
        print("An error occurred:", e, " at ", folder_path)
        continue

    max_row_index = df['Total Count'].idxmax()
    max_row = df.iloc[max_row_index]
    
    entity_folder = os.path.join(cat_path, max_row['Entity'])
    
    if not os.path.exists(entity_folder):
        os.mkdir(entity_folder)
    
    new_folder_path = os.path.join(entity_folder, folder)
    
    if os.path.exists(new_folder_path):
        print(new_folder_path, " Already present")
        continue
    
    try:
        shutil.copytree(folder_path, new_folder_path)
    except OSError as e:
        print(f"Error: {e}. Folder '{folder_path}' cannot be copied")

        
    print("Copied ", folder_path, "to ", new_folder_path)

Copied  data\repos\simulation\adf-sample-agent-java to  data\categories\simulation\I-ORG\adf-sample-agent-java
Copied  data\repos\simulation\aerie to  data\categories\simulation\I-ORG\aerie
Error: [('data\\repos\\simulation\\AgentWorkbench\\eclipseProjects\\de.enflexit.awb\\bundles\\de.enflexit.awb.ws.restapi\\xCodgen\\target\\generated-sources\\openapi\\src\\gen\\java\\de\\enflexit\\awb\\ws\\restapi\\gen\\InstallationDetailsApiService.java', 'data\\categories\\simulation\\I-ORG\\AgentWorkbench\\eclipseProjects\\de.enflexit.awb\\bundles\\de.enflexit.awb.ws.restapi\\xCodgen\\target\\generated-sources\\openapi\\src\\gen\\java\\de\\enflexit\\awb\\ws\\restapi\\gen\\InstallationDetailsApiService.java', "[Errno 2] No such file or directory: 'data\\\\categories\\\\simulation\\\\I-ORG\\\\AgentWorkbench\\\\eclipseProjects\\\\de.enflexit.awb\\\\bundles\\\\de.enflexit.awb.ws.restapi\\\\xCodgen\\\\target\\\\generated-sources\\\\openapi\\\\src\\\\gen\\\\java\\\\de\\\\enflexit\\\\awb\\\\ws\\\\restap

Copied  data\repos\simulation\amod to  data\categories\simulation\I-ORG\amod
Copied  data\repos\simulation\amodeus to  data\categories\simulation\I-ORG\amodeus
Copied  data\repos\simulation\annotation-simulator to  data\categories\simulation\I-ORG\annotation-simulator
Copied  data\repos\simulation\AutoInteraction-Library to  data\categories\simulation\I-ORG\AutoInteraction-Library
Copied  data\repos\simulation\Black-Scholes-Option-Pricing-Model to  data\categories\simulation\I-ORG\Black-Scholes-Option-Pricing-Model
Copied  data\repos\simulation\BoomChess-Android to  data\categories\simulation\I-ORG\BoomChess-Android
Copied  data\repos\simulation\cas to  data\categories\simulation\I-ORG\cas
Copied  data\repos\simulation\cloudsimplus to  data\categories\simulation\I-ORG\cloudsimplus
Copied  data\repos\simulation\cloudsimplus-automation to  data\categories\simulation\I-ORG\cloudsimplus-automation
Copied  data\repos\simulation\cloudsimplus-examples to  data\categories\simulation\I-ORG\clou

In [5]:
import pandas as pd

a = pd.read_csv("C:/Users/riasa/Downloads/Most starred Github Repositories.csv")
a = a[a['language'] == 'Java']
a

Unnamed: 0,rank,item,repo_name,stars,forks,language,repo_url,username,issues,last_commit,description
13,14,top-100-stars,CS-Notes,134809,43285,Java,https://github.com/CyC2018/CS-Notes,CyC2018,105,2021-06-30T00:42:05Z,:books: ÊäÄÊúØÈù¢ËØïÂøÖÂ§áÂü∫Á°ÄÁü•ËØÜ„ÄÅLeetcode„ÄÅËÆ°ÁÆóÊú∫Êìç‰ΩúÁ≥ªÁªü„ÄÅËÆ°ÁÆóÊú∫ÁΩëÁªú„ÄÅÁ≥ªÁªüËÆæËÆ°
23,24,top-100-stars,JavaGuide,107379,36867,Java,https://github.com/Snailclimb/JavaGuide,Snailclimb,54,2021-07-13T15:35:53Z,„ÄåJavaÂ≠¶‰π†+Èù¢ËØïÊåáÂçó„Äç‰∏Ä‰ªΩÊ∂µÁõñÂ§ßÈÉ®ÂàÜ Java Á®ãÂ∫èÂëòÊâÄÈúÄË¶ÅÊéåÊè°ÁöÑÊ†∏ÂøÉÁü•ËØÜ„ÄÇÂáÜÂ§á Jav...
51,52,top-100-stars,java-design-patterns,68846,21461,Java,https://github.com/iluwatar/java-design-patterns,iluwatar,268,2021-07-12T13:48:25Z,Design patterns implemented in Java
56,57,top-100-stars,LeetCodeAnimation,65586,12545,Java,https://github.com/MisterBooo/LeetCodeAnimation,MisterBooo,10,2021-06-24T03:37:31Z,Demonstrate all the questions on LeetCode in t...
72,73,top-100-stars,spring-boot,56209,34038,Java,https://github.com/spring-projects/spring-boot,spring-projects,500,2021-07-13T22:19:54Z,Spring Boot
...,...,...,...,...,...,...,...,...,...,...,...
1295,96,Java,Arduino,11892,6891,Java,https://github.com/arduino/Arduino,arduino,862,2021-06-24T03:28:45Z,open-source electronics platform
1296,97,Java,AndroidViewAnimations,11864,2399,Java,https://github.com/daimajia/AndroidViewAnimations,daimajia,60,2021-06-05T04:58:04Z,Cute view animation collection.
1297,98,Java,JustAuth,11782,2099,Java,https://github.com/justauth/JustAuth,justauth,27,2021-07-06T15:53:10Z,üèÜGitee ÊúÄÊúâ‰ª∑ÂÄºÂºÄÊ∫êÈ°πÁõÆ üöÄ:100: Â∞èËÄåÂÖ®ËÄåÁæéÁöÑÁ¨¨‰∏âÊñπÁôªÂΩïÂºÄÊ∫êÁªÑ‰ª∂„ÄÇÁõÆÂâçÂ∑≤ÊîØÊåÅGi...
1298,99,Java,hadoop,11766,7286,Java,https://github.com/apache/hadoop,apache,442,2021-07-14T01:27:16Z,Apache Hadoop
