## Connect to Google Cloud
#### Skip this part if you want to manually improt the dataset

In [None]:
from google.colab.auth import authenticate_user

In [None]:
authenticate_user()

In [None]:
path = "gs://nlp_599/data/docstring.csv"

In [None]:
!gsutil cp $path ./docstring.csv

Copying gs://nlp_599/data/docstring.csv...
- [1 files][ 92.4 MiB/ 92.4 MiB]                                                
Operation completed over 1 objects/92.4 MiB.                                     


## Install all dependencies and packages

In [5]:
!pip install datasets
!pip install transformers
!pip install pytorch_lightning
# Basic data preprocess package
import pandas as pd
from pandas.core import strings
from sklearn.model_selection import train_test_split
from datasets import Dataset
# Torch an transformer
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from transformers import T5ForConditionalGeneration
# Import NLTK
import nltk 
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from typing import List, Tuple
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Connect to huggingface
from huggingface_hub import Repository
from pathlib import Path
import subprocess

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.8.4.post0-py3-none-any.whl (800 kB)
[K     |████████████████████████████████| 800 kB 4.2 MB/s 
Collecting tensorboardX>=2.2
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 76.9 MB/s 
Collecting lightning-utilities!=0.4.0,>=0.3.0
  Downloading lightning_utilities-0.4.2-py3-none-any.whl (16 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 94.2 MB/s 
Installing collected packages: torchmetrics, tensorboardX, lightning-utilities, pytorch-lightning
Successfully installed lightni

## Data set snippet

In [None]:
# If you dont have google cloud connected above, just mannualy import dataset.
dataset = pd.read_csv("docstring.csv")
dataset

Unnamed: 0,comment,code_words,doc_id,split_name,tokenized_code
0,&apos;Pulls all flashed messages from the sess...,def get _ flashed _ messages ( with _ categori...,train0,train,{def} {get} {flashed} {messages} {with} {categ...
1,&apos;Patch a resource DCNL : param id : the i...,"def resource _ patch ( context , data _ dict )...",train1,train,{def} {resource} {patch} {context} {data} {dic...
2,&apos;Reorders a test suite by test type . DCN...,"def reorder _ suite ( suite , classes , revers...",train2,train,{def} {reorder} {suite} {suite} {classes} {rev...
3,&apos;Delete dhcp options by id or name . DCNL...,def delete _ dhcp _ options ( dhcp _ options _...,train3,train,{def} {delete} {dhcp} {options} {dhcp} {option...
4,&apos;Yield images of the laplacian pyramid fo...,"def pyramid _ laplacian ( image , max _ layer ...",train4,train,{def} {pyramid} {laplacian} {image} {max} {lay...
...,...,...,...,...,...
70855,&apos; : return : A string with $ length % s a...,def create _ format _ string ( length ) : DCNL...,train70855,train,{def} {create} {format} {string} {length} {:} ...
70856,&apos;Construct a L { Team } that spawns threa...,"def pool ( currentLimit , thread@@ Factory = T...",train70856,train,{def} {pool} {current} {limit} {thread} {@} {@...
70857,&apos;Get all groups for a specific project _ ...,def instance _ group _ get _ all _ by _ projec...,train70857,train,{def} {instance} {group} {get} {all} {by} {pro...
70858,&apos;DEPRECATED . Please use one of nflgame.@...,"def combine ( games , plays = False ) : DCNL D...",train70858,train,{def} {combine} {games} {plays} {false} {:} {d...


### Split the training, testing and validation

In [None]:
training_data, test_data = train_test_split(dataset, test_size=0.2, random_state=25)
training_data, validating_data = train_test_split(training_data, test_size=0.15, random_state=25)

In [None]:
test_data.head()

Unnamed: 0,comment,code_words,doc_id,split_name,tokenized_code
19316,&apos;Get the number of CPU processes on the c...,def get _ num _ cpus ( ) : DCNL DCSP return mu...,train19316,train,{def} {get} {num} {cpus} {:} {dcnl} {dcsp} {re...
31156,&apos;In@@ fo page ( link from main header ) &...,def info ( request ) : DCNL DCSP return render...,train31156,train,{def} {info} {request} {:} {dcnl} {dcsp} {retu...
19086,&apos;A decorator that tests that the decorate...,def returns _ arg ( function ) : DCNL DCSP def...,train19086,train,{def} {returns} {arg} {function} {:} {dcnl} {d...
59453,&apos;Return a taxon identifier according to N...,"def saf@@ ename ( name , mrbayes = False ) : D...",train59453,train,{def} {saf} {@} {@} {ename} {name} {mrbayes} {...
61981,&apos;Return ``True`` if the url is a safe red...,"def is _ safe _ url ( url , host = None ) : DC...",train61981,train,{def} {is} {safe} {url} {url} {host} {none} {:...


## Use the codeT5 small pretrained model

In [None]:
# Tokenize the pretrained model
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

prefix = "Summarize: "
max_input_length = 256
max_target_length = 128

def preprocess_examples(examples):
  codes = examples['code_words']
  docstrings = examples['comment']
  
  inputs = [prefix + code for code in codes]
  model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

  labels = tokenizer(docstrings, max_length=max_target_length, padding="max_length", truncation=True).input_ids
  labels_with_ignore_index = []
  for labels_example in labels:
    labels_example = [label if label != 0 else -100 for label in labels_example]
    labels_with_ignore_index.append(labels_example)
  model_inputs["labels"] = labels_with_ignore_index

  return model_inputs

Downloading:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

In [None]:
training_data = Dataset.from_pandas(training_data)
testing_data = Dataset.from_pandas(test_data)
validating_data = Dataset.from_pandas(validating_data)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 4.6 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 90.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 76.9 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 91.5 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3

In [None]:
training_data.describe()

Dataset({
    features: ['comment', 'code_words', 'doc_id', 'split_name', 'tokenized_code', '__index_level_0__'],
    num_rows: 60231
})

In [None]:
testing_data.describe()

Dataset({
    features: ['comment', 'code_words', 'doc_id', 'split_name', 'tokenized_code', '__index_level_0__'],
    num_rows: 14172
})

In [None]:
validating_data.describe()

Dataset({
    features: ['comment', 'code_words', 'doc_id', 'split_name', 'tokenized_code', '__index_level_0__'],
    num_rows: 10629
})

In [None]:
training_data = training_data.map(preprocess_examples, batched=True)
testing_data = testing_data.map(preprocess_examples, batched=True)
validating_data = validating_data.map(preprocess_examples, batched=True)

  0%|          | 0/61 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
training_data.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
testing_data.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
validating_data.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

train_dataloader = DataLoader(training_data, shuffle=True, batch_size=8)
valid_dataloader = DataLoader(validating_data, batch_size=4)
test_dataloader = DataLoader(testing_data, batch_size=4)

batch = next(iter(train_dataloader))
print(batch.keys())

In [None]:
tokenizer.decode(batch['input_ids'][0])

'<s>Summarize: def delta _ import ( handler, host = None, core _ name = None, options = None, extra = None ) : DCNL DCSP options = ( { } if ( options is None ) else options ) DCNL DCSP extra = ( &#91; &#93; if ( extra is None ) else extra ) DCNL DCSP if ( ( not _ is _ master ( ) ) and ( _ get _ none _ or _ value ( host ) is None ) ) : DCNL DCSP DCSP err = &#91; &apos; solr.@@ delta _ import DCSP can DCSP only DCSP be DCSP called DCSP on DCSP &quot; master &quot; DCSP minions &apos; &#93; DCNL DCSP DCSP return _ get _ return _ dict ( False, errors = err ) DCNL DCSP resp = _ pre _ index _ check ( handler, host = host, core _ name = core _ name ) DCNL DCSP if ( not resp &#91; &apos; success &apos; &#93; ) : DCNL DCSP DCSP return resp DCNL DCSP options = _ merge</s>'

In [None]:
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'<s>&apos;Submits an import command to the specified handler using specified options. DCNL This command can only be run if the minion is configured with DCNL solr.type = master DCNL handler : str DCNL The name of the data import handler. DCNL host : str ( None ) DCNL The solr host to query. _ _ opts _ _ &#91; \\ &apos; host \\ &apos; &#93; is default. DCNL core : str ( None ) DCNL The core the handler belongs to. DCNL options : dict (</s>'

### Fine-tune and pretrain

In [None]:
class CodeT5(pl.LightningModule):
    def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):     
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs
    
    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss
      
    def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     

        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}
        
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return valid_dataloader

    def test_dataloader(self):
        return test_dataloader

In [None]:
model = CodeT5()

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(gpus=1, 
                  default_root_dir="/content/drive/MyDrive/CodeT5/Notebooks/Checkpoints", 
                  callbacks=[early_stop_callback, lr_monitor])
trainer.fit(model)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
241.969   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Save model to local directory

In [None]:
save_directory = "." 
model.model.save_pretrained(save_directory)

In [None]:
model = T5ForConditionalGeneration.from_pretrained(save_directory)

In [None]:
test_example = test_data.iloc[3]
input_ids = tokenizer(test_example['code_words'], return_tensors='pt').input_ids

outputs = model.generate(input_ids)
print("Generated docstring:", tokenizer.decode(outputs[0], skip_special_tokens=True))



Generated docstring: &apos;Return the safe string for use in a shell. DCNL If


In [None]:
print("Ground truth:", test_example['comment'])

Ground truth: &apos;Return a taxon identifier according to N@@ EX@@ US standard . DCNL Wrap quotes around names with punctuation or whitespace , and double DCNL single quotes . DCNL mrbayes = True : write names without quotes , whitespace or punctuation DCNL for the mrbayes software package . &apos;


In [None]:
!sudo apt-get install git-lfs
#Replace "..." with your email address and github user name in the double quotes
!git config --global user.email "..."
!git config --global user.name "..."
!git config --global credential.helper store

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [None]:
# Use the link below to copy your token. Not useful on some notebook platforms like kaggle. 
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
    
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# In huggingface, create a new model first, and paste ***model card*** url below
repo_url = ""

In [None]:
#Replace "..." with your email address and github user name in the double quotes
repo = Repository(local_dir="checkpoint",
                  clone_from=repo_url,
                  git_user="...",
                  git_email="...",
                  use_auth_token=True,
)

Cloning https://huggingface.co/soap945/docstring into local empty directory.


In [None]:
model.save_pretrained("/content/checkpoint")
tokenizer.save_pretrained("/content/checkpoint")

('/content/checkpoint/tokenizer_config.json',
 '/content/checkpoint/special_tokens_map.json',
 '/content/checkpoint/vocab.json',
 '/content/checkpoint/merges.txt',
 '/content/checkpoint/added_tokens.json')

### Push to the huggingface repo created above

In [None]:
repo.push_to_hub(commit_message="Third commit")

Upload file pytorch_model.bin:   0%|          | 3.30k/231M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/soap945/docstring
   e5c5784..c56bbb9  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/soap945/docstring
   e5c5784..c56bbb9  main -> main



'https://huggingface.co/soap945/docstring/commit/c56bbb9e1c906b61c5ea0372edb60377566b546b'

## Generate the comments 
#### (Need to take a while processing depends on how many instances in test data)

In [None]:
def generate_comment(code):
    input_ids = tokenizer(code, return_tensors='pt').input_ids
    generated_ids_after = model.generate(input_ids)
    comment = (tokenizer.decode(generated_ids_after[0], skip_special_tokens=True))

    return comment

generated_comments = []
reference_comments = []
cnt = 0

for index, row in test_data.iterrows():
    if cnt % 200 == 0:
        print(cnt)
    generated_comments.append(generate_comment(row['code_words']))
    reference_comments.append(row['comment'])
    cnt += 1

0


Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors


200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000


## Define BLEU scores with preprocessing

In [None]:
def get_tokens_for_dataset_comment(comment: str) -> List[str]:
    toks = comment.split(" ")
    toks = [t.lower() for t in toks]
    return toks

def get_tokenized_str_for_dataset_comment(comment: str) -> str:
    """Space seperated string of comment tokens"""
    return " ".join(get_tokens_for_dataset_comment(comment))

def _prepare_strings(
    refs: List[str],
    hypotheses: List[str],
    pretokenized: bool = False
) -> Tuple[List[str], List[str]]:
    
    ref_strings = [r.lower() for r in refs]
    hypotheses = [h.lower() for h in hypotheses]

    if not pretokenized:
        ref_strings = [get_tokenized_str_for_dataset_comment(s) for s in ref_strings]
        hypotheses = [get_tokenized_str_for_dataset_comment(s) for s in hypotheses]
    return ref_strings, hypotheses

def eval_bleu_1(
    refs: List[str],
    hypotheses: List[str],
):
    ref_strings, hypotheses = _prepare_strings(refs, hypotheses)
    
    refs = [[r.split()] for r in ref_strings]
    preds = [h.split() for h in hypotheses]

    return nltk.translate.bleu_score.corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=SmoothingFunction().method4)*100

In [None]:
bleu1 = eval_bleu_1(reference_comments, generated_comments)
bleu1

0.3954575570752487

## Define the Meteor Score
#### Need to tokenize every instance in the test data. (Dont run this if you are not confident about the computing resources)

In [None]:
# Use meteor_score, remember to convert reference to nd array. Single_meteor_score do not need to convert.
# This is not mentioned in the official doc
from nltk.translate.meteor_score import single_meteor_score, meteor_score

def meteor_score1(ref:List[str], gen:List[str]):
  meteor_score_list = []
  for ref_sentence, gen_sentence in zip(ref_df, gen_df):
    for i in range(len(ref_df)):
      i = 0
      # Tokenize each row for ref_df and gen_df
      tok_ref_df = nltk.word_tokenize(ref_df[i])
      tok_gen_df = nltk.word_tokenize(gen_df[i])
      i += 1

  # Calculate the meteor score and store in the list
  # score = meteor_score(ref[i], gen[i])*100
  [meteor_score_list.append(meteor_score(ref, gen)) for ref_df, gen_df in zip(ref_df, gen_df)]

  # Calculate the mean value of meteor
  # avg = sum(meteor_score_list)/len(meteor_score_list)
  avg = np.mean(meteor_score_list)

  return avg

meteor_score1([ref_df], gen_df)

## Save the reference comments and generated comments

In [None]:
test_data.to_csv('docstring_test_data.csv', index=False)

In [None]:
generated_and_referenced_comments = pd.DataFrame(columns=['doc_id', 'generated_comment', 'reference_comment'])
for i in range(len(generated_comments)):
  generated_and_referenced_comments = generated_and_referenced_comments.append({'doc_id':test_data.iloc[i]['doc_id'], 'generated_comment':generated_comments[i], 'reference_comment':reference_comments[i]},ignore_index=True)
generated_and_referenced_comments

Unnamed: 0,doc_id,generated_comment,reference_comment
0,train19316,&apos;Return the number of CPUs currently runn...,&apos;Get the number of CPU processes on the c...
1,train31156,&apos;Display the info page. &apos;,&apos;In@@ fo page ( link from main header ) &...
2,train19086,&apos;Decorator to decorate a function that re...,&apos;A decorator that tests that the decorate...
3,train59453,&apos;Return the safe string for use in a shel...,&apos;Return a taxon identifier according to N...
4,train61981,&apos;Return True if the url is safe to be use...,&apos;Return ``True`` if the url is a safe red...
...,...,...,...
14167,train52113,&apos;Return the realm for the given entity DC...,&apos;Get the default realm ( = the immediate ...
14168,train5439,&apos;Remove a pidfile from the system. &apos;,&apos;Remove the named PID file if it exists ....
14169,train33081,&apos;Plot a matplotlib figure. DCNL Parameter...,&apos;Re@@ plot a matplotlib figure with plotl...
14170,train17305,&apos;Return a fresh instance of the hash obje...,&apos;Return a fresh instance of the hash obje...


In [None]:
generated_and_referenced_comments.to_csv('docstring_comparison_data.csv', index=False)