In [None]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

from azureml.core import Workspace, Dataset

# Name of the dataset to fetch from the workspace
DATASET_NAME = 'SentimentAnalysis'

# Connect to the Azure Machine Learning workspace via Workspace.from_config()
workspace = Workspace.from_config()

# Look the dataset up by name and materialise it as a Pandas DataFrame;
# the analysis cells that follow all operate on `df`
dataset = Dataset.get_by_name(workspace, name=DATASET_NAME)
df = dataset.to_pandas_dataframe()

In [None]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

In [None]:
# Summary statistics for the loaded frame
df.describe()

In [None]:
# Parse the commit timestamps and normalise them to UTC in one step.
# Without `utc=True`, values carrying different UTC offsets cannot share a single
# datetime64 dtype: depending on the pandas version the result is an object
# column (with a FutureWarning) or an error. Normalising here also matches the
# UTC conversion applied a few cells below.
df['date'] = pd.to_datetime(df['date'], utc=True)

In [None]:
# Inspect the last five rows after the date conversion
df.tail(n=5)

In [None]:
# Convert the 'date' column to timezone-aware UTC timestamps
df['date'] = df['date'].pipe(pd.to_datetime, utc=True)


In [None]:
# Check the column dtypes, e.g. to confirm the 'date' conversion above took effect
df.dtypes

In [None]:
# First five rows again, now with the converted 'date' column
df.head(n=5)

### Commit Frequency Over Time
This section analyzes how commit messages are distributed over time. We create a time-series visualization of the number of commits per day; the same approach can be applied at weekly or monthly granularity. This helps identify busy periods and trends in development activity.

In [None]:


column_name = 'date'

# Count the number of commits per calendar day.
# `on=` tells resample to bucket by the 'date' column directly, so `df` itself is
# left untouched. The previous `df.set_index(..., inplace=True)` consumed the
# column, which made this cell fail with a KeyError when re-run and broke any
# later `df['date']` lookup.
commit_counts = df.resample('D', on=column_name).size()

# Plot the daily commit counts on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
commit_counts.plot(ax=ax, title='Commit Frequency Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Commits')
plt.show()


### Author Contributions
We want to identify the top contributors by analyzing the number of commits made by each author. 

In [None]:
author_column = 'author'

# Step 1: tally commits per author (value_counts orders the most frequent first)
author_commit_counts = df[author_column].value_counts()

# Step 2: bar chart of the ten authors with the most commits
fig, ax = plt.subplots(figsize=(12, 6))
author_commit_counts.head(10).plot.bar(ax=ax, title='Top 10 Contributors')
ax.set_xlabel('Author')
ax.set_ylabel('Number of Commits')
plt.show()


In [None]:
# NOTE(review): textblob is not imported by any cell visible in this section (the
# sentiment scoring below uses NLTK's VADER) — confirm it is still needed, and
# consider pinning a version so the environment is reproducible.
%pip install textblob


In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer


commit_message_column = 'message'

# Fetch the VADER lexicon that SentimentIntensityAnalyzer relies on
nltk.download('vader_lexicon')

# One shared analyzer instance, reused for every message
sid = SentimentIntensityAnalyzer()


def _compound_score(message):
    """Return the VADER 'compound' polarity score for a single commit message.

    The value is passed through str() first, so non-string cells are scored
    on their string representation.
    """
    return sid.polarity_scores(str(message))['compound']


# Step 1: score every commit message and store the result in 'sentiment_score'
df['sentiment_score'] = df[commit_message_column].apply(_compound_score)

In [None]:
# Step 2: Group the data by year-month and calculate the average sentiment score.
# This cell used to be disabled by wrapping it in a string literal: `df['date']`
# raises a KeyError once the dates have been moved into the index. The month
# periods are therefore taken from the 'date' column when it exists and from the
# DatetimeIndex otherwise, so the cell runs in either state.
# Note: to_period() discards the timezone (pandas may warn about this for
# tz-aware timestamps).
if 'date' in df.columns:
    month_periods = df['date'].dt.to_period("M")
else:
    month_periods = df.index.to_period("M")

sentiment_over_time = df.groupby(month_periods)['sentiment_score'].mean()

# Step 3: Plot the sentiment trend over time
fig, ax = plt.subplots(figsize=(12, 6))
sentiment_over_time.plot(ax=ax, title='Average Sentiment Over Time', marker='o', linewidth=2)
ax.set_xlabel('Year-Month')
ax.set_ylabel('Average Sentiment Score')
plt.show()

In [None]:
# First five rows, now including the 'sentiment_score' column
df.head(n=5)

We'll visualize the sentiment distribution, explore how sentiment varies across different authors, and investigate sentiment trends specific to different repositories.

This code creates a histogram to show the distribution of sentiment scores in commit messages.

In [None]:
import seaborn as sns


# Apply seaborn's "whitegrid" theme (sns.set is an alias of set_theme)
sns.set_theme(style="whitegrid")

# Histogram of the sentiment scores (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['sentiment_score'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
plt.show()


This code uses a boxplot to visualize the distribution of sentiment scores for each author, providing insights into how sentiment varies across different contributors.

In [None]:
# One box per author showing the spread of that author's sentiment scores
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='author', y='sentiment_score', palette='viridis', ax=ax)
ax.set_title('Author-Specific Sentiment Analysis')
ax.set_xlabel('Author')
ax.set_ylabel('Sentiment Score')
# Slant the author names so neighbouring tick labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()


This code creates a boxplot to analyze how sentiment scores vary across different repositories, helping us understand the emotional tone associated with each project.

In [None]:
# One box per repository showing the spread of sentiment scores in its commits
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='repository_name', y='sentiment_score', palette='muted', ax=ax)
ax.set_title('Repository-Specific Sentiment Analysis')
ax.set_xlabel('Repository')
ax.set_ylabel('Sentiment Score')
# Slant the repository names so neighbouring tick labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()
