In [48]:
import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind

# RQ 3

In [34]:
# Load the sentiment records from the shared RQ 3/4 CSV export and preview the first rows
rq34 = pd.read_csv(filepath_or_buffer="./data/rq_3_4.csv")
rq34.head(n=5)

Unnamed: 0,text_sentiment,product,issue_id,comment_id,priority,issuetype
0,neutral,AAR,12963247.0,15972995.0,Major,bug
1,neutral,AAR,12963247.0,15972996.0,Major,bug
2,neutral,AAR,12963270.0,15261151.0,Major,bug
3,neutral,AAR,12963426.0,15261824.0,Major,bug
4,neutral,AAR,12963426.0,15262448.0,Major,bug


In [35]:
# Number of rows loaded from the CSV
rq34.shape[0]

3814162

In [36]:
# Count rows per (priority, text_sentiment) pair and pivot the sentiment classes into columns.
# fill_value=0 keeps the counts integer-typed when some priority has no rows for a
# sentiment class; a plain unstack() would put NaN there and silently convert the
# count columns to float, which breaks downstream code that treats them as integers.
rq4 = rq34.groupby(['priority', 'text_sentiment']).size()
rq4 = rq4.unstack(fill_value=0).reset_index()
rq4.columns.name = None  # drop the leftover 'text_sentiment' label on the column axis
rq4

Unnamed: 0,priority,negative,neutral,positive
0,Blocker,39716,102122,44046
1,Critical,49265,134894,55221
2,Major,439509,1531149,638294
3,Minor,117301,393634,177164
4,Trivial,13689,51742,26416


In [44]:
# Split the per-priority counts into the 'Major' row and the rows of every other priority
is_major = rq4['priority'].eq('Major')
major_issues = rq4.loc[is_major]
other_issues = rq4.loc[~is_major]

In [45]:
# Negative-comment count for the 'Major' group and for all other priorities combined
major_negative, other_negative = (
    counts['negative'].sum() for counts in (major_issues, other_issues)
)
# Total comment count (all three sentiment classes) for the same two groups
major_total, other_total = (
    counts[['neutral', 'negative', 'positive']].sum().sum()
    for counts in (major_issues, other_issues)
)

In [49]:
# 2x2 contingency table for the Chi-squared test:
#   rows    -> 'Major' priority, all other priorities
#   columns -> negative comments, non-negative (neutral + positive) comments
contingency_table = [
    [negative, total - negative]
    for negative, total in (
        (major_negative, major_total),
        (other_negative, other_total),
    )
]
contingency_table

[[439509, 2169443], [219971, 985239]]

In [50]:
# Chi-squared test of independence on the 2x2 table built above.
# correction=True is SciPy's default: Yates' continuity correction is applied
# because a 2x2 table has one degree of freedom.
chi2, p_chi2, dof, expected = chi2_contingency(observed=contingency_table, correction=True)

In [51]:
# Prepare data for T-test
# Expand the aggregated counts back into one sentiment label per counted comment.
def _expand_sentiments(counts):
    """Return a Series with one sentiment label per counted comment.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame with 'neutral', 'negative' and 'positive' count columns. The
        counts are summed over all rows of the frame before expansion.

    Returns
    -------
    pandas.Series
        Object-dtype Series of the strings 'neutral', 'negative' and 'positive',
        each repeated as many times as its summed count, with a fresh 0..n-1 index.

    Notes
    -----
    Series.repeat performs the expansion in vectorised form instead of building
    multi-million-element Python lists row by row and exploding them. Labels are
    grouped by class rather than by source row, and the index is a plain range;
    neither affects the order-independent t-test that consumes the result.
    """
    # Missing counts are skipped by sum(); the cast guards against float-typed columns.
    totals = counts[['neutral', 'negative', 'positive']].sum().astype('int64')
    labels = pd.Series(totals.index.to_numpy(), dtype=object)
    return labels.repeat(totals.to_numpy()).reset_index(drop=True)


major_sentiments = _expand_sentiments(major_issues)
other_sentiments = _expand_sentiments(other_issues)

In [52]:
# Map sentiment values to numerical values for performing the T-test
sentiment_map = {'negative': -1, 'neutral': 0, 'positive': 1}


def _to_sentiment_score(sentiments):
    """Convert a Series of sentiment labels to numeric scores via sentiment_map.

    The conversion overwrites major_sentiments / other_sentiments in place, so
    re-running this cell would otherwise map the already-numeric scores through
    sentiment_map again, turn every value into NaN and leave the t-test with
    empty input. A Series that is already numeric is therefore returned unchanged,
    which makes the cell safe to execute more than once.
    """
    if pd.api.types.is_numeric_dtype(sentiments):
        return sentiments
    return sentiments.map(sentiment_map)


major_sentiments = _to_sentiment_score(major_sentiments)
other_sentiments = _to_sentiment_score(other_sentiments)

In [53]:
# Two-sample t-test on the numeric sentiment scores ('Major' vs. all other priorities).
# equal_var=False selects Welch's variant, which does not assume equal group variances.
# Unmapped (NaN) scores are dropped before testing.
t_stat, p_ttest = ttest_ind(
    major_sentiments.dropna(),
    other_sentiments.dropna(),
    equal_var=False,
)

In [54]:
# Report the Chi-squared statistic and its p-value, one item per line
print(
    'Chi-squared test results:',
    f'Chi2 statistic: {chi2}',
    f'P-value: {p_chi2}',
    sep='\n',
)

Chi-squared test results:
Chi2 statistic: 1138.63704654292
P-value: 1.3227376708930362e-249


In [55]:
# Verdict at the 0.05 significance level
print(
    'Chi-squared test: There is a significant difference in negative sentiment for major priority issues.'
    if p_chi2 < 0.05 else
    'Chi-squared test: There is no significant difference in negative sentiment for major priority issues.'
)

Chi-squared test: There is a significant difference in negative sentiment for major priority issues.


In [56]:
# Report the t statistic and its p-value, one item per line
print(
    'T-test results:',
    f'T-statistic: {t_stat}',
    f'P-value: {p_ttest}',
    sep='\n',
)

T-test results:
T-statistic: 10.380499849999218
P-value: 3.045712223749261e-25


In [57]:
# Verdict at the 0.05 significance level
print(
    'T-test: There is a significant difference in sentiment for major priority issues.'
    if p_ttest < 0.05 else
    'T-test: There is no significant difference in sentiment for major priority issues.'
)

T-test: There is a significant difference in sentiment for major priority issues.
