# Analysis of demographic patterns of partisan websites

In [1]:
import json
import os
import glob


# Directory that holds the Alexa JSON exports (presumably one file per site).
dir_name = 'data/Alexa_rank_charts'
json_pattern = os.path.join(dir_name,'*.json')
# Paths of every JSON file matching the pattern.
file_list = glob.glob(json_pattern)
# Categories stored in demographics_dict; gender is collected separately.
demographics_categories = ['age', 'education', 'ethnicity', 'income']
# Filled further down: url -> category -> bucket -> score.
demographics_dict = {}

In [2]:
# Populating age, education, ethnicity, income and gender categories from the data.
# The bucket names are taken from the first site of the first JSON file; all
# other files are assumed to use the same buckets -- TODO confirm.
with open(file_list[0]) as first_file:  # closes the handle instead of leaking it
    first_json = json.load(first_file)
# list(...) keeps these indexable on Python 3, where keys() returns a view.
first_url = list(first_json['metrics'].keys())
first_demographics = first_json['metrics'][first_url[0]]['demographics']
age_ranges = list(first_demographics['age'].keys())
education_types = list(first_demographics['education'].keys())
ethnicity_types = list(first_demographics['ethnicity'].keys())
income_ranges = list(first_demographics['income'].keys())
# Labels used in gender_dict (the JSON itself uses 'm' and 'f').
gender_types = ['male', 'female']

In [3]:
from partisan_sites_parser import *
from referer_parser import *
partisan_websites = sites
left_sites = left
right_sites = right

# gender dict contains Alexa's representation based on gender:
# url -> {'male': score, 'female': score}; a gender with no score is left out.
gender_dict = {}
# JSON key of each gender -> label used in gender_dict.
gender_labels = (('m', 'male'), ('f', 'female'))
for each_file in file_list:
    # The with-block closes every file; the original left the handles open.
    with open(each_file) as json_file:
        try:
            data = json.load(json_file)
        except ValueError as e:
            # Report files that are not valid JSON and carry on with the rest.
            print('%s %s' % (e, each_file))
            continue
    url = list(data['metrics'].keys())[0]
    gender_data = data['metrics'][url]['demographics']['gender']
    # Sites without gender data carry an empty list instead of a dict.
    if gender_data != []:
        gender_dict[url] = {}
        for source_key, label in gender_labels:
            score = gender_data[source_key]['score']
            # The original tested male against {} and female against '', so a
            # blank male score was stored; both markers now count as missing
            # for both genders.
            if score != '' and score != {}:
                gender_dict[url][label] = score

In [4]:
# Keep only the sites that appear in partisan_websites.
# Iterate over a snapshot of the keys: popping from a dict while looping over
# its live keys() view raises RuntimeError on Python 3.
for key in list(gender_dict):
    if key not in partisan_websites:
        gender_dict.pop(key)

In [63]:
# Partisan sites that have no entry in gender_dict.
# Membership is tested on the dict itself (hash lookup) instead of on keys(),
# which is a list scan per site on Python 2.
no_demo_data = [x for x in partisan_websites if x not in gender_dict]
        

In [65]:
# Number of partisan sites without gender data.
len(no_demo_data)

247

In [6]:
# Split the gender scores into the sites listed in `left` and in `right`;
# a site present in both lists lands in both dicts.
gender_left = {site: scores for site, scores in gender_dict.items() if site in left}
gender_right = {site: scores for site, scores in gender_dict.items() if site in right}

In [66]:
# Number of sites from `right` that have gender data.
len(gender_right)

290

In [7]:

def get_cumulative_gender_dict(gender_type_dict):
    """Gather the per-site gender scores into one list per gender.

    gender_type_dict maps a site to a dict that may hold a 'female' and/or a
    'male' score. The result has a 'female' and/or 'male' key (only when at
    least one site supplied that score) whose value is the list of those
    scores converted with int(), in the iteration order of the input.
    """
    scores_by_gender = {}
    for site_scores in gender_type_dict.values():
        for gender in ('female', 'male'):
            if gender in site_scores:
                scores_by_gender.setdefault(gender, []).append(int(site_scores[gender]))
    return scores_by_gender

In [8]:
# Per-gender lists of integer scores for the left and the right sites.
gender_left_dict = get_cumulative_gender_dict(gender_left)
gender_right_dict = get_cumulative_gender_dict(gender_right)

In [9]:
# Population demographics for each category out of age, education, ethnicity and income.
# demographics_dict: url -> category -> bucket -> score as found in the JSON.

for each_file in file_list:
    # The with-block closes every file; the original left the handles open.
    with open(each_file) as json_file:
        try:
            data = json.load(json_file)
        except ValueError as e:
            # Report files that are not valid JSON and carry on with the rest.
            print('%s %s' % (e, each_file))
            continue
    url = list(data['metrics'].keys())[0]
    site_data = data['metrics'][url]['demographics']
    site_scores = demographics_dict[url] = {}
    for category in demographics_categories:
        site_scores[category] = {}
    # A category without data holds an empty list instead of a dict.
    if site_data['age'] != []:
        for age_range in age_ranges:
            site_scores['age'][age_range] = site_data['age'][age_range]['score']
    if site_data['education'] != []:
        for education_type in education_types:
            # NOTE(review): only education tolerates a missing 'score' key
            # (stored as None); kept exactly as in the original.
            site_scores['education'][education_type] = site_data['education'][education_type].get('score')
    if site_data['ethnicity'] != []:
        for ethnicity_type in ethnicity_types:
            site_scores['ethnicity'][ethnicity_type] = site_data['ethnicity'][ethnicity_type]['score']
    if site_data['income'] != []:
        for income_range in income_ranges:
            site_scores['income'][income_range] = site_data['income'][income_range]['score']

# Drop sites with no data in any category. Loop over a snapshot of the items:
# popping while iterating the live view raises RuntimeError on Python 3.
for key, val in list(demographics_dict.items()):
    if val['age'] == {} and val['education'] == {} and val['ethnicity'] == {} and val['income'] == {}:
        demographics_dict.pop(key)
        

In [10]:
from partisan_sites_parser import *
from referer_parser import *

# Finding demographics for hubs and authorities

hubs_list = list(set(hubs))  # de-duplicated hubs
hubs_left = left_hubs
hubs_right = right_hubs
partisan_websites = sites
left_sites = left
right_sites = right

# Keep only the partisan sites. Loop over a snapshot of the keys: popping while
# iterating the live keys() view raises RuntimeError on Python 3.
for key in list(demographics_dict):
    if key not in partisan_websites:
        demographics_dict.pop(key)

# Split the demographics by the `left` / `right` site lists.
demographics_left = {}
demographics_right = {}
for key, val in demographics_dict.items():
    if key in left:
        demographics_left[key] = val
    if key in right:
        demographics_right[key] = val

# Demographics of the sites that are also hubs on their side.
demo_hubs_left = {}
for key, val in demographics_left.items():
    if key in hubs_left:
        demo_hubs_left[key] = val

demo_hubs_right = {}
for key, val in demographics_right.items():
    if key in hubs_right:
        demo_hubs_right[key] = val


In [11]:
# Record the sites of demographics_left that have a blank ('') score.
# A site is appended once per blank score, so the list can contain repeats.
missing_data = []
for site, demographics in demographics_left.items():
    for category in ('income', 'age', 'education', 'ethnicity'):
        for score in demographics[category].values():
            if score == '':
                missing_data.append(site)

In [12]:
# Same scan for demographics_right: append a site to missing_data once for
# every blank ('') score it has.
for site, demographics in demographics_right.items():
    for category in ('income', 'age', 'education', 'ethnicity'):
        for score in demographics[category].values():
            if score == '':
                missing_data.append(site)

In [13]:
# Number of distinct sites with at least one blank score.
len(set(missing_data))

278

In [14]:
# getting each_category dict with different category types as bins
def get_demo_category_dict(parent_dict, category, category_types):
    """Regroup one demographic category by bucket instead of by site.

    parent_dict maps site -> {category: {bucket: score}}. The result maps
    every bucket in category_types to a {site: score} dict; blank ('') scores
    are skipped. A bucket found in the data but absent from category_types
    raises KeyError.
    """
    binned = {bucket: {} for bucket in category_types}
    for site, demographics in parent_dict.items():
        for bucket, score in demographics[category].items():
            if score != '':
                binned[bucket][site] = score
    return binned

In [15]:
# Regroup every category by bucket (bucket -> site -> score) for each side.
# These are still nested dicts; they are turned into DataFrames further down.

age_left_data = get_demo_category_dict(demographics_left, 'age', age_ranges)
age_right_data =  get_demo_category_dict(demographics_right, 'age', age_ranges)
education_left_data = get_demo_category_dict(demographics_left, 'education', education_types)
education_right_data = get_demo_category_dict(demographics_right, 'education', education_types)
ethnicity_left_data = get_demo_category_dict(demographics_left, 'ethnicity', ethnicity_types)
ethnicity_right_data = get_demo_category_dict(demographics_right, 'ethnicity', ethnicity_types)
income_left_data = get_demo_category_dict(demographics_left, 'income', income_ranges)
income_right_data =  get_demo_category_dict(demographics_right, 'income', income_ranges)

In [16]:
import pandas as pd
# One DataFrame per category and side, built from the bucket -> site -> score
# dicts: buckets become columns and sites become the index.
age_left_df = pd.DataFrame(age_left_data)
age_right_df = pd.DataFrame(age_right_data)

In [17]:
education_left_df = pd.DataFrame(education_left_data)
education_right_df = pd.DataFrame(education_right_data)

In [18]:
ethnicity_left_df = pd.DataFrame(ethnicity_left_data)
ethnicity_right_df = pd.DataFrame(ethnicity_right_data)

In [19]:
income_left_df = pd.DataFrame(income_left_data)
income_right_df = pd.DataFrame(income_right_data)

In [20]:
import numpy as np
from numpy import nan

def remove_missing_data(dataframe, category_types):
    """Return, per column, the non-missing values of dataframe as ints.

    For every name in category_types the matching column is read, its NaN
    entries are dropped and the remaining values are converted with int().
    The result maps each column name to that list of ints.
    """
    return {
        column: [int(score) for score in dataframe[column].dropna()]
        for column in category_types
    }


In [21]:
# Per bucket, the list of integer scores with the missing (NaN) entries dropped.
age_left_dict = remove_missing_data(age_left_df, age_ranges)
age_right_dict = remove_missing_data(age_right_df, age_ranges)

In [22]:
education_left_dict = remove_missing_data(education_left_df, education_types)
education_right_dict = remove_missing_data(education_right_df, education_types)

In [23]:
ethnicity_left_dict = remove_missing_data(ethnicity_left_df, ethnicity_types)
ethnicity_right_dict = remove_missing_data(ethnicity_right_df, ethnicity_types)

In [24]:
income_left_dict = remove_missing_data(income_left_df, income_ranges)
income_right_dict = remove_missing_data(income_right_df, income_ranges)

In [25]:
%matplotlib inline

In [26]:
# Normalizing the datasets by using Random Sampling
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
import math

In [27]:
def get_point_estimates(distribution, n_samples=500, sample_size=10, seed=10):
    """Return the means of repeated random samples drawn from distribution.

    Parameters:
        distribution: 1-D sequence of numbers to sample from (with
            replacement, the default of numpy.random.choice).
        n_samples: how many samples to draw (default 500).
        sample_size: how many values each sample holds (default 10).
        seed: passed to np.random.seed before sampling so that repeated calls
            return the same estimates; None leaves the global RNG untouched.

    Returns:
        A list with n_samples sample means.

    Raises:
        ValueError: raised by numpy.random.choice when distribution is empty.
    """
    if seed is not None:
        # Reseeds NumPy's global generator on every call, as the original did.
        np.random.seed(seed)
    point_estimates = []
    for _ in range(n_samples):
        sample = np.random.choice(a=distribution, size=sample_size)
        point_estimates.append(sample.mean())
    return point_estimates

In [56]:
def get_sample_dict(population_dict, category_types):
    """Summarise every bucket of population_dict by resampling.

    For each name in category_types the result holds a dict with
      'mean': the mean of the point estimates from get_point_estimates, and
      'std':  z * np.std(population) / sqrt(5000). Despite the key name this
              is a margin of error, not a standard deviation.
    """
    # Standard-normal quantile at 0.975 (about 1.96).
    critical_value = stats.norm.ppf(q = 0.975)
    summary = {}
    for bucket in category_types:
        population = population_dict[bucket]
        estimates = get_point_estimates(population)
        summary[bucket] = {
            'mean': np.mean(estimates),
            # 5000 presumably stands for 500 samples x 10 draws -- TODO confirm.
            'std': critical_value * np.std(population)/math.sqrt(5000),
        }
    return summary

In [54]:
# Resampled mean and margin of error of the gender scores of the left sites.
gender_left_sample_dict = get_sample_dict(gender_left_dict, gender_types)
# Number of male scores available for the left sites.
len(gender_left_dict['male'])

1.95996398454


131

In [55]:
# Resampled mean and margin of error of the gender scores of the right sites.
gender_right_sample_dict = get_sample_dict(gender_right_dict, gender_types)

1.95996398454


In [57]:
# Gender summaries as DataFrames: one column per gender, rows 'mean' and 'std'.
gender_left_sample_df = pd.DataFrame(gender_left_sample_dict)
gender_right_sample_df = pd.DataFrame(gender_right_sample_dict)

In [58]:
# Same resampling summary for the age buckets of each side.
age_left_sample_dict = get_sample_dict(age_left_dict, age_ranges)
age_right_sample_dict =  get_sample_dict(age_right_dict, age_ranges)

In [59]:
age_left_sample_df = pd.DataFrame(age_left_sample_dict)
age_right_sample_df = pd.DataFrame(age_right_sample_dict)

In [60]:
# Display the age summary of the left sites.
age_left_sample_df

Unnamed: 0,18-24,25-34,35-44,45-54,55-64,65+
mean,65.123,71.0928,85.5948,138.2202,270.9718,541.5928
std,0.95508,0.847464,0.856576,1.505123,3.471611,15.535024


In [46]:
# Resampling summary for the education buckets of each side.
edu_left_sample_dict = get_sample_dict(education_left_dict, education_types)
edu_right_sample_dict = get_sample_dict(education_right_dict, education_types)

In [47]:
# Explicit column order for the education DataFrames; assumes these names
# match the education bucket keys in the data -- TODO confirm.
education_columns = ['NoCollege', 'SomeCollege', 'College', 'GraduateSchool']

In [48]:
edu_left_sample_df = pd.DataFrame(edu_left_sample_dict, columns=education_columns)
edu_right_sample_df = pd.DataFrame(edu_right_sample_dict, columns=education_columns)

In [49]:
# Resampling summary for the ethnicity buckets of each side.
eth_left_sample_dict = get_sample_dict(ethnicity_left_dict, ethnicity_types)
eth_right_sample_dict = get_sample_dict(ethnicity_right_dict, ethnicity_types)

In [50]:
# Explicit column order for the ethnicity DataFrames; assumes these names
# match the ethnicity bucket keys in the data -- TODO confirm.
ethnicity_columns = ['African','AfricanAmerican','Asian','Caucasian', 'Hispanic', 'MiddleEastern', 'Other' ]

In [183]:
eth_left_sample_df = pd.DataFrame(eth_left_sample_dict, columns=ethnicity_columns)
eth_right_sample_df = pd.DataFrame(eth_right_sample_dict, columns=ethnicity_columns)

In [188]:
# Resampling summary for the income buckets of each side.
inc_left_sample_dict = get_sample_dict(income_left_dict, income_ranges)
inc_right_sample_dict =  get_sample_dict(income_right_dict, income_ranges)

In [189]:
# Explicit column order for the income DataFrames; assumes these names match
# the income bucket keys in the data -- TODO confirm.
income_columns = ['0-30k', '30-60k','60-100k', '100k+']

In [190]:
inc_left_sample_df = pd.DataFrame(inc_left_sample_dict, columns=income_columns)
inc_right_sample_df = pd.DataFrame(inc_right_sample_dict, columns=income_columns)