## Use Wikipedia API to obtain featured and non-featured articles
To study the difference in sentiment between featured and non-featured articles, a set of 75 articles from each category was obtained. For featured articles, the Wikipedia API was used to obtain a list of article names within the featured articles category, and 75 random titles were then chosen from this list. For non-featured articles, the random function of the Wikipedia API was used to obtain an additional 75 articles. For each of the 150 total articles, the full content of the article was obtained as well as the page summary.

In [None]:
import wikipedia
import random
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt 
import seaborn as sns
import nltk
nltk.download("stopwords")

#set seed so same results will be displayed each time it's run
random.seed(1)
#get links of pages contained in featured articles category
all_featured_articles = wikipedia.WikipediaPage(title = "Wikipedia:Featured_articles").links
#get random sample of 75 featured articles
featured_article_titles = random.sample(all_featured_articles, 75)
featured_art_dict = dict.fromkeys(featured_article_titles)
non_featured_article_titles = []

#get 75 random wikipedia articles that aren't featured articles
while len(non_featured_article_titles) < 75:
    random_page = wikipedia.random()
    
    # make sure not already in list and no in featured articles
    if random_page not in non_featured_article_titles and random_page not in all_featured_articles:
        non_featured_article_titles.append(random_page)

non_featured_art_dict = dict.fromkeys(non_featured_article_titles)

In [None]:
def get_content(article_titles, art_dict):
    '''
    For each article title, get content and summary from Wikipedia page and return dictionary with that information

    article_titles: iterable of page titles to look up
    art_dict: dictionary keyed by title; it is updated in place and also returned

    Each successfully fetched title maps to {"content": ..., "summary": ...}.
    Titles that raise a DisambiguationError are removed from art_dict, so the
    result never holds a placeholder or another article's text under that title.
    '''
    for p in article_titles:
        try:
            page = wikipedia.WikipediaPage(p)
            content = page.content
            summary = page.summary

        #occurs when multiple wikipedia pages have same beginning of page title
        except wikipedia.DisambiguationError:
            # Nothing was fetched for this title. Falling through would store
            # the previous article's text (or fail on unbound names), so drop
            # the title and move on.
            print(f"Skipping ambiguous title: {p}")
            art_dict.pop(p, None)
            continue

        art_dict[p] = {"content": content,
                      "summary":summary}

    return art_dict

## Create Dataframes of Featured and Non-featured Posts

In [None]:
# Fill both title-keyed dictionaries (created with dict.fromkeys, so every value starts as None)
# with the {"content", "summary"} records that get_content fetches for each title
featured_art_dict = get_content(featured_article_titles, featured_art_dict)
non_featured_art_dict = get_content(non_featured_article_titles, non_featured_art_dict)

In [None]:
#create dataframe with page content and summary information
#orient='index' turns each dictionary key (the article title) into a row label;
#the "content" and "summary" keys of the inner dictionaries become the columns
featured_df = pd.DataFrame.from_dict(featured_art_dict, orient='index')
non_featured_df = pd.DataFrame.from_dict(non_featured_art_dict, orient='index')

In [None]:
#build the 'content_cleaned' column for both frames: cut off the references
#and bibliography sections, then strip unnecessary characters
#NOTE(review): the split is on the bare word "References", so the text is cut
#at the first place that word occurs anywhere in the article, not only at a
#section heading -- confirm this is intended
#NOTE(review): in this raw pattern, \\n matches a literal backslash followed
#by "n", not a newline character -- confirm that is what the content contains
for frame in (featured_df, non_featured_df):
    before_references = frame['content'].apply(lambda text: text.split("References")[0])
    frame['content_cleaned'] = before_references.replace(r'(\\n)|=|(\\)', '', regex=True)

In [None]:
from nltk.corpus import stopwords
# English stopword list from NLTK, converted to a set once so it can be reused for membership tests
stopWords = set(stopwords.words("english"))

def remove_stopwords(text, stopWords):
    '''
    Removes stopwords from given line of text and returns cleaned text

    text: string to clean; it is split on whitespace
    stopWords: collection of words to drop (a set gives the fastest lookups)

    Returns the remaining words joined by single spaces. Matching is
    case-sensitive, so "The" is kept when only "the" is in stopWords.
    '''
    # Use the collection supplied by the caller instead of reloading the NLTK
    # stopword list for every single word.
    return ' '.join(word for word in text.split() if word not in stopWords)

# Strip stopwords from the cleaned text of both frames
for frame in (featured_df, non_featured_df):
    frame['content_cleaned'] = frame['content_cleaned'].apply(remove_stopwords, args=(stopWords,))

## Calculating Sentiment and Making Comparison

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create the two score columns (full text and summary) on both frames with a
# placeholder of 0; the real scores are assigned further down in this cell
for frame in (featured_df, non_featured_df):
    for score_column in ('sentiment_score_all', 'sentiment_score_sum'):
        frame[score_column] = 0

# Shared analyzer, created on first use so it is built once rather than once per scored text
_SENTIMENT_ANALYZER = None

def sentiment_scores(sentence):
    '''
    Use Vader Sentiment model to return neutral sentiment score for a given sentence

    sentence: text to score
    Returns the 'neu' entry of the dictionary produced by polarity_scores.
    '''
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        # Create a SentimentIntensityAnalyzer object.
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()

    sentiment_dict = _SENTIMENT_ANALYZER.polarity_scores(sentence)

    return sentiment_dict['neu']
        
#score the cleaned full text and the raw summary of every page in both frames
for frame in (featured_df, non_featured_df):
    frame['sentiment_score_all'] = frame['content_cleaned'].apply(sentiment_scores)
    frame['sentiment_score_sum'] = frame['summary'].apply(sentiment_scores)

In [None]:
def conduct_t_test(group_1, group_2):
    '''
    Conduct a t test comparing two arrays and print statement about significance depending on resulting p-value

    group_1: sequence of numbers, reported as "Featured pages"
    group_2: sequence of numbers, reported as "Non Featured pages"

    The test is run with equal_var=False (Welch's t test) at alpha = 0.05.
    Prints both means, their difference, the degrees of freedom, the t
    statistic, the p-value and the decision. Returns None.
    '''
    alpha = 0.05
    t_stat, p_val = stats.ttest_ind(group_1, group_2, equal_var = False)

    # Report the Welch-Satterthwaite degrees of freedom, which is what an
    # unequal-variance test uses; min(n) - 1 does not correspond to the
    # p-value printed below.
    n_1, n_2 = len(group_1), len(group_2)
    var_of_mean_1 = np.var(group_1, ddof=1) / n_1
    var_of_mean_2 = np.var(group_2, ddof=1) / n_2
    denominator = var_of_mean_1 ** 2 / (n_1 - 1) + var_of_mean_2 ** 2 / (n_2 - 1)
    # Both groups constant (or too small) leaves the formula undefined
    dof = (var_of_mean_1 + var_of_mean_2) ** 2 / denominator if denominator > 0 else float("nan")

    group1_avg = np.average(group_1)
    group2_avg = np.average(group_2)
    print(f"Featured pages avg: {group1_avg:.3f}")
    print(f"Non Featured pages avg: {group2_avg:.3f}")
    print(f"Difference in Means: {group1_avg - group2_avg:.2f}")
    print(f"DF: {dof:.2f}")
    print(f"T-stat: {t_stat:.2f}")
    # Four decimals so that small p-values are not displayed as 0.00
    print(f"P-value: {p_val:.4f}")
    if p_val < alpha:
        print("Reject the null hypothesis; there is a significant difference.")
    else:
        print("Fail to reject the null hypothesis; there is no significant difference.")

In [None]:
#compare full-text neutral scores of featured vs. non-featured pages with a t-test
featured_values = featured_df['sentiment_score_all'].to_list()
non_featured_values = non_featured_df['sentiment_score_all'].to_list()
conduct_t_test(group_1=featured_values, group_2=non_featured_values)

In [None]:
#compare summary neutral scores of featured vs. non-featured pages with a t-test
featured_values_sum = featured_df['sentiment_score_sum'].to_list()
non_featured_values_sum = non_featured_df['sentiment_score_sum'].to_list()
conduct_t_test(group_1=featured_values_sum, group_2=non_featured_values_sum)

In [None]:
# Label every row with its page type, then stack the two frames into one
for frame, page_type in ((featured_df, "Featured Pages"), (non_featured_df, "Non-Featured Pages")):
    frame['type'] = page_type
total_df = pd.concat([featured_df, non_featured_df])

In [None]:
#create two histograms, one for featured and one for non-featured pages, of number of pages by neutral sentiment score
def hist_plot_1():
    '''
    Draw one histogram per page type of the summary neutral sentiment scores

    Reads the module-level total_df (columns 'sentiment_score_sum' and 'type')
    and styles every subplot identically. Returns None.
    '''
    from matplotlib.ticker import StrMethodFormatter
    axes = total_df.hist(column='sentiment_score_sum', by='type', bins=10, grid=False, figsize=(8,10), layout=(3,1), sharex=True, color='#2d76ba', zorder=2, rwidth=0.9)

    # np.ravel lets the loop handle whatever shape of Axes container is returned
    for x in np.ravel(axes):

        # Despine
        x.spines['right'].set_visible(False)
        x.spines['top'].set_visible(False)
        x.spines['left'].set_visible(False)

        # Switch off ticks but keep the tick labels. tick_params expects
        # booleans here; the strings "off"/"on" are both truthy.
        x.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)

        # Draw horizontal axis lines
        vals = x.get_yticks()
        for tick in vals:
            x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

        # Set x-axis label
        x.set_xlabel("Neutral Sentiment Score of Summary", labelpad=20, weight='bold', size=12)

        # Set y-axis label
        x.set_ylabel("Number of Pages", labelpad=50, weight='bold', size=12)

        # Format y-axis label
        x.yaxis.set_major_formatter(StrMethodFormatter('{x:,g}'))

        x.tick_params(axis='x', rotation=0)

## Comparisons by Length of Page

In [None]:
#store the word count of the cleaned text in a 'length' column on both frames
for frame in (featured_df, non_featured_df):
    frame['length'] = frame['content_cleaned'].map(lambda text: len(text.split()))

featured_len_values = featured_df['length'].to_list()
non_featured_len_values = non_featured_df['length'].to_list()

In [None]:
#print the mean word count, featured pages first and non-featured pages second
import statistics
for length_values in (featured_len_values, non_featured_len_values):
    print(statistics.mean(length_values))

In [None]:
#store the word count of each summary in a 'length_sum' column on both frames
for frame in (featured_df, non_featured_df):
    frame['length_sum'] = frame['summary'].map(lambda text: len(text.split()))

featured_len_sum = featured_df['length_sum'].to_list()
non_featured_len_sum = non_featured_df['length_sum'].to_list()

In [None]:
%config InlineBackend.figure_format = 'svg'

def plot_fig(feature_sen_values, non_feature_sen_values, feature_len, non_feature_len, title): 
    '''
    Create scatterplot of length of content vs. neutral sentiment score

    feature_sen_values / non_feature_sen_values: y values for the featured and non-featured series
    feature_len / non_feature_len: x values (word counts) for the same two series
    title: text shown above the plot

    Draws both series on the current figure and shows it. Returns None.
    '''
    plt.scatter(feature_len, feature_sen_values, label = "Featured Pages")
    plt.scatter(non_feature_len, non_feature_sen_values, label = "Non Featured Pages")

    plt.xlabel("Number of Words in Page Content")
    # The scores produced in this notebook are VADER's neutral ('neu') scores,
    # not the compound score, so label the axis accordingly.
    plt.ylabel("Neutral Sentiment Score")

    plt.legend()
    plt.title(title)
    plt.show()

## Getting Non-featured Posts with Length Similar to Featured Posts

In [None]:
# Smallest word count among the featured pages' cleaned text (featured_len_values);
# used below as the minimum length a non-featured page must reach
min_length = min(featured_len_values)
print(min_length)

In [None]:
non_featured_article_long = []
# NOTE(review): wikipedia.random() appears to obtain its titles from the
# Wikipedia API rather than from Python's RNG, so this seed likely has no
# effect on which pages are drawn -- confirm.
random.seed(4)

# Featured titles plus titles already accepted; a set keeps lookups O(1)
titles_to_skip = set(all_featured_articles)

#gets only non-featured pages whose length is at least equal to the minimum length of featured pages' content
# NOTE(review): page_length counts words of the raw page content, while
# min_length comes from cleaned text (references and stopwords removed), so an
# accepted page can still be shorter than min_length after cleaning -- confirm
# whether that is acceptable.
while len(non_featured_article_long) < 75:
    random_page = wikipedia.random()

    # Reject duplicates and featured titles before paying for a page download
    if random_page in titles_to_skip:
        continue

    try:
        content = wikipedia.WikipediaPage(random_page).content
    except wikipedia.DisambiguationError:
        # No content was fetched for this title. Falling through would measure
        # the previous page's content (or fail on an unbound name), so draw again.
        continue

    page_length = len(content.split())

    #checks if greater than or equal to min length of featured pages
    if page_length >= min_length:
        titles_to_skip.add(random_page)
        non_featured_article_long.append(random_page)

non_featured_len_dict = dict.fromkeys(non_featured_article_long)

In [None]:
# Fetch content and summary for every length-matched non-featured title selected above
non_featured_len_dict = get_content(non_featured_article_long, non_featured_len_dict)

In [None]:
#various text cleaning functions as used above
#same three steps as for the first two frames: cut at "References", strip
#unnecessary characters, remove stopwords
non_featured_len_df = pd.DataFrame.from_dict(non_featured_len_dict, orient='index')
non_featured_len_df['content_cleaned'] = non_featured_len_df['content'].apply(lambda x: x.split("References")[0])
non_featured_len_df['content_cleaned'] = non_featured_len_df['content_cleaned'].replace(r'(\\n)|=|(\\)', '', regex=True)
#the closing parentheses were missing on this call, which made the cell a syntax error
non_featured_len_df['content_cleaned'] = non_featured_len_df['content_cleaned'].apply(lambda x: remove_stopwords(x, stopWords))

## Calculating Sentiment and Making Comparisons

In [None]:
# Placeholder column first, then the neutral score of each page's cleaned text
non_featured_len_df['sentiment_score_all'] = 0
cleaned_text = non_featured_len_df['content_cleaned']
non_featured_len_df['sentiment_score_all'] = cleaned_text.apply(sentiment_scores)

In [None]:
# t-test of full-text neutral scores: featured pages vs. the length-matched non-featured pages
# NOTE(review): this rebinds non_featured_len_values, which earlier held the word counts of
# non_featured_df; from here on it holds sentiment scores of non_featured_len_df instead
non_featured_len_values = non_featured_len_df['sentiment_score_all'].tolist()
conduct_t_test(featured_values, non_featured_len_values)

In [None]:
# Label every row with its page type, then stack the featured frame on the length-matched one
for frame, page_type in ((featured_df, "Featured Pages"), (non_featured_len_df, "Non-Featured Pages")):
    frame['type'] = page_type
total_len_df = pd.concat([featured_df, non_featured_len_df])

In [None]:
def hist_plot_2():
    '''
    Draw one histogram per page type of the full-text neutral sentiment scores

    Reads the module-level total_len_df (columns 'sentiment_score_all' and
    'type') and styles every subplot identically. Returns None.
    '''
    from matplotlib.ticker import StrMethodFormatter
    axes = total_len_df.hist(column='sentiment_score_all', by='type', bins=10, grid=False, figsize=(8,10), layout=(3,1), sharex=True, color='#2d76ba', zorder=2, rwidth=0.9)

    # np.ravel lets the loop handle whatever shape of Axes container is returned
    for x in np.ravel(axes):

        # Despine
        x.spines['right'].set_visible(False)
        x.spines['top'].set_visible(False)
        x.spines['left'].set_visible(False)

        # Switch off ticks but keep the tick labels. tick_params expects
        # booleans here; the strings "off"/"on" are both truthy.
        x.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)

        # Draw horizontal axis lines
        vals = x.get_yticks()
        for tick in vals:
            x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

        # Set x-axis label
        x.set_xlabel("Neutral Sentiment Score of Text of Pages with Similar Lengths", labelpad=20, weight='bold', size=12)

        # Set y-axis label
        x.set_ylabel("Number of Pages", labelpad=50, weight='bold', size=12)

        # Format y-axis label
        x.yaxis.set_major_formatter(StrMethodFormatter('{x:,g}'))

        x.tick_params(axis='x', rotation=0)

In [None]:
# Word count of the cleaned text for each length-matched non-featured page
cleaned_text = non_featured_len_df['content_cleaned']
non_featured_len_df['length'] = cleaned_text.map(lambda text: len(text.split()))
non_featured_lengths = non_featured_len_df['length'].to_list()