In [1]:
%pip install emoji



In [2]:
from google.colab import drive
drive.mount('/content/drive')

path_to_dataset_folder = '/content/drive/My Drive/SP24/Adv SE/SE Sentiment/Dataset/Raw Dataset'
path_to_processed_dataset_folder = '/content/drive/My Drive/SP24/Adv SE/SE Sentiment/Dataset/Processed Dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import requests
import json
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import words
import emoji
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os

In [4]:
if os.path.exists(path_to_dataset_folder):
  print("Raw Dataset Directory exists")
else:
  print("Raw Dataset Directory not exists")

if os.path.exists(path_to_processed_dataset_folder):
  print("Processed Dataset Directory exists")
else:
  print("Processed Dataset Directory not exists")

Raw Dataset Directory exists
Processed Dataset Directory exists


In [5]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:


# Initialize NLTK components
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
english_words = set(words.words())

def process_comments(comments):
    """
    Process the given comments by removing short comments, removing duplicate comments,
    and performing NLTK text processing steps. Also includes the source type for each comment.

    Args:
    comments (list of dicts): The list of comments, where each comment is represented as a dictionary
        with keys such as 'id', 'body', and 'pull_request'.

    Returns:
    list of dicts: The processed comments, where each comment dictionary includes the original 'id',
        the processed 'body', and the 'source' type.
    """

    processed_comments = []
    processed_texts = set()  # To keep track of processed texts for duplicate checking

    for comment in comments:

        if len(processed_comments) >= 10000:
            break

        body = comment["body"]
        comment_id = comment["id"]
        source = "pull_request" if comment.get("pull_request") else "issue"

         # Convert emojis to text representations
        body = emoji.demojize(body)

        # Remove URLs
        body = re.sub(r'http\S+', '', body)

        # Remove numbers
        body = re.sub(r'\d+', '', body)

        # Remove stop words
        tokens = word_tokenize(body)
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

        # Remove single characters and non-English words
        filtered_tokens = [word for word in filtered_tokens if len(word) > 1 and word.lower() in english_words]


        # Remove unnecessary characters (punctuation, special symbols)
        body = ' '.join(filtered_tokens)
        body = re.sub(r'[^\w\s]', '', body)

        # Lowercase all text
        body = body.lower()

        # Check if word count is less than 3, and if so, ignore this comment
        if len(body.split()) < 4:
            continue

        # Check for similarity with previously processed comments
        if body in processed_texts:
            continue

        # Add the body to processed_texts to track duplicates
        processed_texts.add(body)

        # Perform NLTK text processing steps
        tokens = word_tokenize(body)
        #filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        stemmed_tokens = [porter.stem(word) for word in tokens]
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.lower() not in stop_words]
        processed_body = " ".join(lemmatized_tokens)

        # Append processed comment to processed_comments list
        processed_comments.append({
            "id": comment_id,
            "original_comment": comment["body"],
            "comment": processed_body,
            "source": source, })

    return processed_comments



In [7]:
# Get sentiment from VADER

sentiVader = SentimentIntensityAnalyzer()
def get_sentiment_from_vader(comment):
  sentiment = sentiVader.polarity_scores(comment)
  compound_score = sentiment['compound']

  if (compound_score) >= 0.5:
    return 'positive'
  elif (compound_score <= 0.5):
    return 'negative'
  elif (compound_score > -0.5 and compound_score < 0.5):
    return 'neutral'


In [8]:
# Get sentiment from Textblob

from textblob import TextBlob

def get_sentiment_from_textblob(comment):
  # Perform sentiment analysis
  blob = TextBlob(comment)
  sentiment_score = blob.sentiment.polarity

  # Interpret sentiment score
  if sentiment_score > 0:
    return 'positive'
  elif sentiment_score < 0:
    return 'negative'
  else:
    return 'neutral'

In [9]:
%pip install pattern



In [10]:
# Get sentiment from Pattern

from pattern.en import sentiment

def get_sentiment_from_pattern(comment):

  polarity, subjectivity = sentiment(comment)

  # Interpret sentiment score
  if polarity > 0.1:
    return 'positive'
  elif polarity < -0.1:
    return 'negative'
  else:
    return 'neutral'

In [11]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import pipeline

# Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Create a pipeline for sentiment analysis
bert_nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

def get_sentiment_from_bert(comment):
  # Get sentiment
  result = bert_nlp(comment)
  return result['label']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
%pip install spacy
%pip install spacytextblob




In [13]:
# Get sentiment from Spacy
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
# Load the spaCy model
spacy_nlp = spacy.load('en_core_web_sm')

# Add SpacyTextBlob to the pipeline
spacy_nlp.add_pipe('spacytextblob')

def get_sentiment_from_spacy(comment):
  doc = spacy_nlp(comment)
  polarity = doc._.polarity
  if polarity > 0:
    return "positive"
  elif polarity < 0:
    return "negative"
  else:
    return "neutral"

In [14]:
from statistics import mode, StatisticsError

def apply_max_voting(df):
  sentiment_columns = ['sentiment_VADER', 'sentiment_textblob', 'sentiment_pattern', 'sentiment_bert', 'sentiment_spacy']

  def get_max_voted_sentiment(row):
    try:
      return mode(row[sentiment_columns])
    except StatisticsError:
      return 'tie'
  df['max_voted_sentiment'] = df.apply(get_max_voted_sentiment, axis = 1)

  return df


In [15]:


# Replace with the repository owner and name you're interested in dmlc/dgl
repository_owner = "dmlc"
repository_name = "dgl"

# API endpoint for listing commits
url = f"https://api.github.com/repos/{repository_owner}/{repository_name}/issues/comments?sort=created&direction=desc"

# Set your personal access token if you have rate limits or require private repository access
headers = {"Authorization": f"token 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"}

# Set the desired number of commits and adjust "per_page" to 100 (API limit)
num_comments = 10000
comments_per_page = 100

# Calculate the number of pages required
num_pages = (num_comments // comments_per_page) + 1

# Initialize an empty list to store all retrieved commits
all_comments = []

# Iterate through pages and fetch commits
for page in range(1, num_pages + 1):
    parameters = {
        "per_page": comments_per_page,
        "page": page
    }

    try:
        response = requests.get(url, headers=headers, params=parameters)
        response.raise_for_status()  # Raise an exception for non-2xx status codes

        commits = json.loads(response.content)

        # Add retrieved commits to the list
        all_comments.extend(commits)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")





In [16]:
comments_df = pd.DataFrame(columns=['id', 'original_comment', 'processed_comment', 'source', 'sentiment_VADER', 'sentiment_textblob', 'sentiment_pattern', 'sentiment_bert', 'sentiment_spacy'])
processed_comments = process_comments(all_comments)

# Extract and print the desired information from all commits
for index, comment in enumerate(processed_comments[:10000]):  # Truncate to get only the last 500
    id = comment["id"]
    original_comment = comment["original_comment"]
    processed_comment = comment["comment"]
    source = comment["source"]
    sentiment_vader = get_sentiment_from_vader(processed_comment)
    sentiment_textblob = get_sentiment_from_textblob(processed_comment)
    sentiment_pattern = get_sentiment_from_pattern(processed_comment)
    sentiment_bert = get_sentiment_from_pattern(processed_comment)
    sentiment_spacy = get_sentiment_from_spacy(processed_comment)
    comments_df = pd.concat([comments_df, pd.DataFrame({"id": [id],
                                                        "source": [source],
                                                        "original_comment": [original_comment],
                                                        "processed_comment": [processed_comment],
                                                        "sentiment_VADER": [sentiment_vader],
                                                        "sentiment_textblob": [sentiment_textblob],
                                                        "sentiment_pattern": [sentiment_pattern],
                                                        "sentiment_bert": [sentiment_bert],
                                                        "sentiment_spacy": [sentiment_spacy]})], ignore_index = True)

comments_df.head()
#comments_df.to_csv("raw_comments.csv", index = False)

Unnamed: 0,id,original_comment,processed_comment,source,sentiment_VADER,sentiment_textblob,sentiment_pattern,sentiment_bert,sentiment_spacy
0,2040965157,@nv-dlasalle GraphBolt will have full support ...,full support going even paper provide even cheap,issue,negative,positive,positive,positive,positive
1,2040964230,@frozenbugs is there anyone else who can take ...,anyone else take issue support homogenous hete...,issue,positive,positive,positive,positive,positive
2,2040963529,I believe the regression tests and optimized v...,believe regression version kernel feel free op...,issue,positive,positive,positive,positive,positive
3,2040962367,@frozenbugs is there more to be done for this ...,done issue already support feature copy overla...,issue,negative,neutral,neutral,neutral,neutral
4,2040962087,What new functionality will this class support...,new functionality class support taking account...,issue,positive,negative,neutral,neutral,negative


In [17]:
comments_max_vote_df = apply_max_voting(comments_df.copy())
comments_max_vote_df

Unnamed: 0,id,original_comment,processed_comment,source,sentiment_VADER,sentiment_textblob,sentiment_pattern,sentiment_bert,sentiment_spacy,max_voted_sentiment
0,2040965157,@nv-dlasalle GraphBolt will have full support ...,full support going even paper provide even cheap,issue,negative,positive,positive,positive,positive,positive
1,2040964230,@frozenbugs is there anyone else who can take ...,anyone else take issue support homogenous hete...,issue,positive,positive,positive,positive,positive,positive
2,2040963529,I believe the regression tests and optimized v...,believe regression version kernel feel free op...,issue,positive,positive,positive,positive,positive,positive
3,2040962367,@frozenbugs is there more to be done for this ...,done issue already support feature copy overla...,issue,negative,neutral,neutral,neutral,neutral,neutral
4,2040962087,What new functionality will this class support...,new functionality class support taking account...,issue,positive,negative,neutral,neutral,negative,negative
...,...,...,...,...,...,...,...,...,...,...
1465,1519393512,@chang-l could you help take a look at this? I...,could help take look blocking new release,issue,negative,positive,positive,positive,positive,positive
1466,1518932262,"Hi @cccusername , actually you don't need to u...",hi actually need understand follow create,issue,negative,neutral,neutral,neutral,neutral,neutral
1467,1518932017,"Hi @NitishOritro , it seems that you are assig...",hi tensor graph make device assignment,issue,negative,neutral,neutral,neutral,neutral,neutral
1468,1518931691,Hi @ZoroSunCT . Sorry that DGL currently does ...,hi sorry currently support loading large graph...,issue,positive,negative,negative,negative,negative,negative


In [18]:
tie_count = comments_max_vote_df['max_voted_sentiment'].value_counts().get('tie', 0)

print("Number of ties:", tie_count)

Number of ties: 0


In [19]:
comments_max_vote_df.to_csv(f"{path_to_processed_dataset_folder}/{repository_owner}_{repository_name}_sentiment.csv", index = False)

In [20]:
path_to_processed_final_folder = '/content/drive/My Drive/SP24/Adv SE/SE Sentiment/Dataset/Processed Dataset/Final'

In [21]:
if os.path.exists(path_to_processed_final_folder):
  print("Final Processed Dataset Exists")
else:
  print("Final processed dataset doesn't exist")


Final Processed Dataset Exists


In [22]:
comments_max_vote_df.columns

Index(['id', 'original_comment', 'processed_comment', 'source',
       'sentiment_VADER', 'sentiment_textblob', 'sentiment_pattern',
       'sentiment_bert', 'sentiment_spacy', 'max_voted_sentiment'],
      dtype='object')

In [23]:
columns_to_drop = ['id', 'original_comment', 'source', 'sentiment_VADER', 'sentiment_textblob', 'sentiment_pattern', 'sentiment_bert', 'sentiment_spacy']
final_dataset = comments_max_vote_df.drop(columns = columns_to_drop)
columns_to_rename = {
    'processed_comment' : 'comment',
    'max_voted_sentiment': 'sentiment'
}

final_dataset.rename(columns = columns_to_rename, inplace = True)

final_dataset

Unnamed: 0,comment,sentiment
0,full support going even paper provide even cheap,positive
1,anyone else take issue support homogenous hete...,positive
2,believe regression version kernel feel free op...,positive
3,done issue already support feature copy overla...,neutral
4,new functionality class support taking account...,negative
...,...,...
1465,could help take look blocking new release,positive
1466,hi actually need understand follow create,neutral
1467,hi tensor graph make device assignment,neutral
1468,hi sorry currently support loading large graph...,negative


In [24]:
final_dataset.to_csv(f'{path_to_processed_final_folder}/{repository_owner}_{repository_name}_sentiment.csv', index = False)