## 2016 Election

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) prints the names
# of the entries found in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
filename = '../input/presidential_polls.csv'

# Read only the header line to learn the column names. Strip the trailing
# newline explicitly rather than slicing off the last character, so a header
# that is not newline-terminated does not lose the end of its last column name.
with open(filename, "r") as f:
    col_names_str = f.readline().rstrip('\n')  # column names
col_name_list = col_names_str.split(',')

# Columns used by the analysis and their positions in the CSV header
use_col_name_list = ['state', 'enddate', 'rawpoll_clinton', 'rawpoll_trump', 'adjpoll_clinton', 'adjpoll_trump']
use_col_index_list = [col_name_list.index(use_col_name) for use_col_name in use_col_name_list]

data_array = np.loadtxt(filename,       # file name
                        delimiter=',',  # field separator
                        skiprows=1,     # skip the first row, i.e. the column names
                        dtype=str,      # keep every field as text
                        usecols=use_col_index_list)  # indices of the columns to read

print(data_array, data_array.shape)

> #### Handle Datetime

In [None]:
# Parse the poll end dates
enddate_idx = use_col_name_list.index('enddate')

# Normalise the separators so every date string reads month/day/year
enddate_list = list(map(lambda raw: raw.replace('-', '/'),
                        data_array[:, enddate_idx].tolist()))

# Turn the date strings into datetime objects
date_list = list(map(lambda text: datetime.datetime.strptime(text, '%m/%d/%Y'),
                     enddate_list))

# Build a 'YYYY-MM' label for every poll, then collect the distinct months
month_list = list(map(lambda d: '%d-%02d' % (d.year, d.month), date_list))
month_array = np.array(month_list)
months = np.unique(month_array)

In [None]:
# Distinct states that appear in the poll data
state_idx = use_col_name_list.index('state')
state_array = data_array[:, state_idx]
states = [state_name for state_name in np.unique(state_array)]

In [None]:
# Tally the poll figures for each candidate.
# First locate the four poll columns inside data_array: the raw (rawpoll) and
# adjusted (adjpoll) figures for Clinton and for Trump.
rawpoll_clinton_idx, adjpoll_clinton_idx, rawpoll_trump_idx, adjpoll_trump_idx = [
    use_col_name_list.index(col_name)
    for col_name in ('rawpoll_clinton', 'adjpoll_clinton', 'rawpoll_trump', 'adjpoll_trump')
]

# Clinton: raw and adjusted columns
rawpoll_clinton_data = data_array[:, rawpoll_clinton_idx]
adjpoll_clinton_data = data_array[:, adjpoll_clinton_idx]

# Trump: raw and adjusted columns
rawpoll_trump_data = data_array[:, rawpoll_trump_idx]
adjpoll_trump_data = data_array[:, adjpoll_trump_idx]

# Container for the aggregated results
results = []

def is_convert_float(s):
    """
    Report whether a value can be converted to a float.

    Args:
        s: The value to test, typically a string.

    Returns:
        True if ``float(s)`` succeeds, False if the conversion fails.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float". Anything else (for
        # example KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

def get_sum(str_array):
    """
    Return the sum of the numeric entries in a sequence of strings.

    Entries that ``is_convert_float`` rejects are ignored.

    Args:
        str_array: Iterable of strings (for example a NumPy string array).

    Returns:
        The sum of the convertible entries as a NumPy float; 0.0 when no
        entry is convertible.
    """
    # Drop the entries that cannot be converted to a number
    cleaned_data = filter(is_convert_float, str_array)
    # Convert with float(), the same parser the filter used. The np.float
    # alias was removed in NumPy 1.24, so the builtin float is the dtype.
    float_array = np.array([float(item) for item in cleaned_data], dtype=float)
    return np.sum(float_array)

# Aggregate the four poll series month by month
for month in months:
    # Rows whose end date falls in the current month
    in_month = month_array == month

    # This month's polls: raw (rawpoll) and adjusted (adjpoll) per candidate
    rawpoll_clinton_month_data = rawpoll_clinton_data[in_month]
    adjpoll_clinton_month_data = adjpoll_clinton_data[in_month]
    rawpoll_trump_month_data = rawpoll_trump_data[in_month]
    adjpoll_trump_month_data = adjpoll_trump_data[in_month]

    # Sum of each series over the month
    rawpoll_clinton_month_sum = get_sum(rawpoll_clinton_month_data)
    adjpoll_clinton_month_sum = get_sum(adjpoll_clinton_month_data)
    rawpoll_trump_month_sum = get_sum(rawpoll_trump_month_data)
    adjpoll_trump_month_sum = get_sum(adjpoll_trump_month_data)

    results.append((
        month,
        rawpoll_clinton_month_sum,
        adjpoll_clinton_month_sum,
        rawpoll_trump_month_sum,
        adjpoll_trump_month_sum,
    ))

print(results)

# Transpose the per-month tuples into one sequence per series
months, raw_cliton_sum, adj_cliton_sum, raw_trump_sum, adj_trump_sum = zip(*results)

**What are the trends of the polls by month (`enddate`)?**

In [None]:
fig, subplot_arr = plt.subplots(2,2, figsize=(15,10))

# Bar geometry shared by both bar charts
width = 0.25
x = np.arange(len(months))

# Top row shows the raw poll sums, bottom row the adjusted poll sums
panels = ((raw_cliton_sum, raw_trump_sum), (adj_cliton_sum, adj_trump_sum))
for row, (clinton_sum, trump_sum) in enumerate(panels):
    # Left column: one line per candidate (Clinton red, Trump green)
    line_ax = subplot_arr[row, 0]
    line_ax.plot(clinton_sum, color='r')
    line_ax.plot(trump_sum, color='g')

    # Right column: side-by-side bars per month, labelled with the month
    bar_ax = subplot_arr[row, 1]
    bar_ax.bar(x, clinton_sum, width, color='r')
    bar_ax.bar(x + width, trump_sum, width, color='g')
    bar_ax.set_xticks(x + width)
    bar_ax.set_xticklabels(months, rotation='vertical')

plt.subplots_adjust(wspace=0.2)

plt.show()

**How do the trends vary by state?**

In [None]:
# Consider the adjusted polls (adjpoll) only.
# result[i, j] holds the sum of Clinton's adjusted poll values for states[i]
# in months[j]; state/month combinations without any poll stay at 0.
result = np.zeros((len(states), len(months)))
for i, state in enumerate(states):
    # The state mask does not depend on the month, so build it once per state
    in_state = state_array == state
    for j, month in enumerate(months):
        in_state_and_month = (month_array == month) & in_state
        # clinton
        adjpoll_clinton_month_state_data = adjpoll_clinton_data[in_state_and_month]
        if adjpoll_clinton_month_state_data.shape[0] != 0:
            result[i, j] = get_sum(adjpoll_clinton_month_state_data)
# NOTE: Trump's adjusted polls are not aggregated in this cell.
    
    

In [None]:
# Draw the result matrix as a heat map with a colour scale next to it
heatmap = plt.imshow(result, interpolation='nearest', cmap=plt.cm.ocean)
plt.colorbar(heatmap)
plt.show()