# Step 4 - Conducting Analysis

This notebook loads the refined dataset that was produced in Step 3 and analyzes it by building several summary tables. These tables surface several meaningful insights, which are discussed in the Readme.MD file.

In [1]:
# Imports and constants used by the rest of the notebook.
import os, json
import pandas as pd

# Relative path to the CSV of scored city articles that Step 3 wrote out;
# it is resolved against the directory the notebook is run from.
refined_data_path = "refined_data/wp_scored_city_articles_by_state.csv"

In [2]:
# Load the dataset written out by Step 3 and display it as a quick sanity
# check that the file was read and the columns look as expected.

wp_scored_city_articles_by_state = pd.read_csv(refined_data_path)
wp_scored_city_articles_by_state

Unnamed: 0,state,regional_division,population,article_title,revision_id,article_quality
0,Indiana,East North Central,6833037.0,"Princes Lakes, Indiana",1019560872,Stub
1,Michigan,East North Central,10034113.0,"Montrose Charter Township, Michigan",101989287,Stub
2,New York,Middle Atlantic,19677151.0,"Holcomb, New York",1024625267,Stub
3,Idaho,Mountain,1939033.0,"Moyie Springs, Idaho",1032509578,C
4,Hawaii,Pacific,1440196.0,"Kilauea, Hawaii",1036838051,Stub
...,...,...,...,...,...,...
21509,Hawaii,Pacific,1440196.0,"Omao, Hawaii",965363482,Stub
21510,Illinois,East North Central,12582032.0,"Chicago, Illinois",967239270,Stub
21511,Hawaii,Pacific,1440196.0,"Waikane, Hawaii",982615222,Stub
21512,Hawaii,Pacific,1440196.0,"Kailua, Honolulu County, Hawaii",985095782,Stub


In [3]:
def is_high_quality(article_category):
    """Tell whether an article quality label counts as high quality.

    Parameters
    ----------
    article_category
        The quality label of one article (e.g. "FA", "GA", "C", "Stub").

    Returns
    -------
    bool
        True when the label is exactly "FA" or "GA"; False for every other
        value, including missing values.
    """
    return article_category in ("FA", "GA")
# Flag every article whose quality label counts as high quality, then keep
# the flagged rows as their own frame (the division-level cell reuses it).
wp_scored_city_articles_by_state['is_high_quality_article'] = (
    wp_scored_city_articles_by_state['article_quality']
    .map(is_high_quality)
    .astype(bool)
)
high_quality_subset = wp_scored_city_articles_by_state[
    wp_scored_city_articles_by_state['is_high_quality_article']
]

# Number of articles per state. Population is part of the group key only so
# that it is carried through to the result for the per-capita ratio.
state_keys = ['state', 'population']
articles_per_state = (
    wp_scored_city_articles_by_state
    .groupby(state_keys)['article_title']
    .agg('count')
)
states_articles_agg = articles_per_state.reset_index(name="article_count")

# Number of high-quality articles per state. Grouping the filtered subset
# alone would silently drop any state that has no high-quality article, so
# the counts are re-indexed onto the full list of states and missing states
# get an explicit 0. That way they can show up in the "lowest" rankings.
hq_articles_per_state = (
    high_quality_subset
    .groupby(state_keys)['article_title']
    .agg('count')
)
high_quality_subset_agg = (
    hq_articles_per_state
    .reindex(articles_per_state.index, fill_value=0)
    .reset_index(name="hq_article_count")
)

# Per-capita ratios used by the ranking cells below.
states_articles_agg['article_count_over_population'] = (
    states_articles_agg['article_count'] / states_articles_agg['population']
)
high_quality_subset_agg['hq_article_count_over_population'] = (
    high_quality_subset_agg['hq_article_count'] / high_quality_subset_agg['population']
)

In [44]:
# Ten states with the highest number of articles per capita.
(
    states_articles_agg
    .sort_values('article_count_over_population', ascending=False)
    .iloc[:10]
)

Unnamed: 0,state,population,article_count,article_count_over_population
42,Vermont,647064.0,329,0.000508
31,North Dakota,779261.0,356,0.000457
17,Maine,1385340.0,483,0.000349
38,South Dakota,909824.0,311,0.000342
13,Iowa,3200517.0,1042,0.000326
1,Alaska,733583.0,149,0.000203
35,Pennsylvania,12972008.0,2556,0.000197
20,Michigan,10034113.0,1773,0.000177
47,Wyoming,581381.0,99,0.00017
26,New Hampshire,1395231.0,234,0.000168


In [45]:
# Ten states with the lowest number of articles per capita.
(
    states_articles_agg
    .sort_values('article_count_over_population', ascending=True)
    .iloc[:10]
)

Unnamed: 0,state,population,article_count,article_count_over_population
30,North Carolina,10698973.0,50,5e-06
25,Nevada,3177772.0,19,6e-06
4,California,39029342.0,482,1.2e-05
2,Arizona,7359197.0,91,1.2e-05
43,Virginia,8683619.0,133,1.5e-05
7,Florida,22244823.0,411,1.8e-05
33,Oklahoma,4019800.0,75,1.9e-05
14,Kansas,2937150.0,63,2.1e-05
18,Maryland,6164660.0,157,2.5e-05
46,Wisconsin,5892539.0,191,3.2e-05


In [46]:
# Ten states with the highest number of high-quality articles per capita.
(
    high_quality_subset_agg
    .sort_values('hq_article_count_over_population', ascending=False)
    .iloc[:10]
)

Unnamed: 0,state,population,hq_article_count,hq_article_count_over_population
42,Vermont,647064.0,45,7e-05
47,Wyoming,581381.0,39,6.7e-05
38,South Dakota,909824.0,56,6.2e-05
45,West Virginia,1775156.0,106,6e-05
24,Montana,1122867.0,55,4.9e-05
26,New Hampshire,1395231.0,63,4.5e-05
35,Pennsylvania,12972008.0,566,4.4e-05
23,Missouri,6177957.0,263,4.3e-05
1,Alaska,733583.0,31,4.2e-05
27,New Jersey,9261699.0,379,4.1e-05


In [49]:
# Ten states with the lowest number of high-quality articles per capita.
(
    high_quality_subset_agg
    .sort_values('hq_article_count_over_population', ascending=True)
    .iloc[:10]
)

Unnamed: 0,state,population,hq_article_count,hq_article_count_over_population
30,North Carolina,10698973.0,21,2e-06
43,Virginia,8683619.0,18,2e-06
25,Nevada,3177772.0,8,3e-06
2,Arizona,7359197.0,24,3e-06
4,California,39029342.0,174,4e-06
7,Florida,22244823.0,118,5e-06
29,New York,19677151.0,110,6e-06
18,Maryland,6164660.0,42,7e-06
14,Kansas,2937150.0,22,7e-06
33,Oklahoma,4019800.0,31,8e-06


In [9]:
# Build one row per state so that each state's population is added exactly
# once when the populations are summed up to the division level (the article
# table repeats the state's population on every article row).
state_populations = wp_scored_city_articles_by_state[
    ['state', 'regional_division', 'population']
].drop_duplicates()
assert state_populations['state'].is_unique, (
    "a state maps to more than one (regional_division, population) pair; "
    "division populations would be double-counted"
)
division_population_agg = (
    state_populations
    .groupby(['regional_division'])['population']
    .agg('sum')
    .reset_index(name="population")
)

# Article counts per division, over all articles.
division_aticles_agg = (
    wp_scored_city_articles_by_state
    .groupby(['regional_division'])['article_title']
    .agg('count')
    .reset_index(name="article_count")
)

division_agg = division_population_agg.join(
    division_aticles_agg.set_index('regional_division'),
    on='regional_division',
)
division_agg['article_count_over_population'] = (
    division_agg['article_count'] / division_agg['population']
)

# Article counts per division, restricted to high-quality articles.
# NOTE: the count column keeps the name "article_count" here even though it
# holds high-quality counts only.
hq_division_aticles_agg = (
    high_quality_subset
    .groupby(['regional_division'])['article_title']
    .agg('count')
    .reset_index(name="article_count")
)

# The join is a left join on the division table, so a division without any
# high-quality article would come back with a missing count. Record it as an
# explicit 0 so its ratio is 0 (and it ranks lowest) rather than NaN.
hq_division_agg = division_population_agg.join(
    hq_division_aticles_agg.set_index('regional_division'),
    on='regional_division',
)
hq_division_agg['article_count'] = (
    hq_division_agg['article_count'].fillna(0).astype('int64')
)
hq_division_agg['hq_article_count_over_population'] = (
    hq_division_agg['article_count'] / hq_division_agg['population']
)
hq_division_agg

Unnamed: 0,regional_division,population,article_count,hq_article_count_over_population
0,East North Central,47097779.0,718,1.5e-05
1,East South Central,19578002.0,316,1.6e-05
2,Middle Atlantic,41910858.0,1055,2.5e-05
3,Mountain,25514320.0,334,1.3e-05
4,New England,11503343.0,225,2e-05
5,Pacific,53229044.0,491,9e-06
6,South Atlantic,66781137.0,526,8e-06
7,West North Central,19721893.0,639,3.2e-05
8,West South Central,41685250.0,633,1.5e-05


In [10]:
# Divisions ordered from the fewest to the most articles per capita.
(
    division_agg
    .sort_values('article_count_over_population', ascending=True)
)

Unnamed: 0,regional_division,population,article_count,article_count_over_population
5,Pacific,53229044.0,1304,2.4e-05
6,South Atlantic,66781137.0,1849,2.8e-05
3,Mountain,25514320.0,1187,4.7e-05
8,West South Central,41685250.0,2100,5e-05
1,East South Central,19578002.0,1528,7.8e-05
2,Middle Atlantic,41910858.0,3779,9e-05
0,East North Central,47097779.0,4753,0.000101
4,New England,11503343.0,1437,0.000125
7,West North Central,19721893.0,3577,0.000181


In [11]:
# Divisions ordered from the fewest to the most high-quality articles per capita.
(
    hq_division_agg
    .sort_values('hq_article_count_over_population', ascending=True)
)

Unnamed: 0,regional_division,population,article_count,hq_article_count_over_population
6,South Atlantic,66781137.0,526,8e-06
5,Pacific,53229044.0,491,9e-06
3,Mountain,25514320.0,334,1.3e-05
8,West South Central,41685250.0,633,1.5e-05
0,East North Central,47097779.0,718,1.5e-05
1,East South Central,19578002.0,316,1.6e-05
4,New England,11503343.0,225,2e-05
2,Middle Atlantic,41910858.0,1055,2.5e-05
7,West North Central,19721893.0,639,3.2e-05
