In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import math

In [2]:
# Read the per-state counts, pinning the population/count columns to int32
# and the normalised total to float64.
normalized_counts = pd.read_csv(
    "normalized_counts.csv",
    dtype={
        "Total Population": np.int32,
        "total_norm": np.float64,
        "total_count": np.int32,
    },
)
# Remove the "Unnamed: 0" column, then the row whose index label is 8.
normalized_counts = normalized_counts.drop("Unnamed: 0", axis=1).drop(8)
normalized_counts.head()

Unnamed: 0,State,Total Population,State Abbr,black,jews,muslim,norm_black,norm_jewish,norm_muslim,total_count,total_norm
0,Alabama,4863300,AL,1.0,1.0,4.0,2.1e-05,2.1e-05,8.2e-05,6,0.000123
1,Alaska,741894,AK,1.0,0.0,1.0,0.000135,0.0,0.000135,2,0.00027
2,Arizona,6931071,AZ,1.0,0.0,4.0,1.4e-05,0.0,5.8e-05,5,7.2e-05
3,Arkansas,2988248,AR,3.0,1.0,3.0,0.0001,3.3e-05,0.0001,7,0.000234
4,California,39250017,CA,30.0,31.0,17.0,7.6e-05,7.9e-05,4.3e-05,78,0.000199


In [3]:
# Unweighted mean of the per-state `total_norm` values (every remaining row
# counts equally, regardless of its population).
norm_mean = normalized_counts['total_norm'].mean()
# Bare name as the last expression so the notebook displays the value.
norm_mean

0.00018901990647451276

In [4]:
# Sum of `Total Population` over the rows still in the frame; later cells use
# it as the denominator for each state's population share and as the size of
# the pooled 0/1 sample.
total_pop = normalized_counts['Total Population'].sum()
# Bare name as the last expression so the notebook displays the value.
total_pop

314640326

In [5]:
# Reserve the column that will hold each state's share of the total population.
# NaN (rather than an empty string) keeps the column numeric (float64), so the
# values written into it later are not stored in an object-dtype column.
normalized_counts["Normalized Population"] = np.nan
normalized_counts.head()

Unnamed: 0,State,Total Population,State Abbr,black,jews,muslim,norm_black,norm_jewish,norm_muslim,total_count,total_norm,Normalized Population
0,Alabama,4863300,AL,1.0,1.0,4.0,2.1e-05,2.1e-05,8.2e-05,6,0.000123,
1,Alaska,741894,AK,1.0,0.0,1.0,0.000135,0.0,0.000135,2,0.00027,
2,Arizona,6931071,AZ,1.0,0.0,4.0,1.4e-05,0.0,5.8e-05,5,7.2e-05,
3,Arkansas,2988248,AR,3.0,1.0,3.0,0.0001,3.3e-05,0.0001,7,0.000234,
4,California,39250017,CA,30.0,31.0,17.0,7.6e-05,7.9e-05,4.3e-05,78,0.000199,


In [6]:
# Each state's share of the summed population, computed for the whole column
# in one vectorised division.
# This replaces a row-by-row loop built on DataFrame.set_value, which was
# deprecated in pandas 0.21 and removed in pandas 1.0, and it produces a
# float64 column instead of an object-dtype one.
normalized_counts['Normalized Population'] = (
    normalized_counts['Total Population'] / total_pop
)

In [7]:
# Preview the first rows to check the `Normalized Population` column filled in
# by the previous cell.
normalized_counts.head()

Unnamed: 0,State,Total Population,State Abbr,black,jews,muslim,norm_black,norm_jewish,norm_muslim,total_count,total_norm,Normalized Population
0,Alabama,4863300,AL,1.0,1.0,4.0,2.1e-05,2.1e-05,8.2e-05,6,0.000123,0.0154567
1,Alaska,741894,AK,1.0,0.0,1.0,0.000135,0.0,0.000135,2,0.00027,0.00235791
2,Arizona,6931071,AZ,1.0,0.0,4.0,1.4e-05,0.0,5.8e-05,5,7.2e-05,0.0220286
3,Arkansas,2988248,AR,3.0,1.0,3.0,0.0001,3.3e-05,0.0001,7,0.000234,0.00949735
4,California,39250017,CA,30.0,31.0,17.0,7.6e-05,7.9e-05,4.3e-05,78,0.000199,0.124746


In [8]:
# Pooled 0/1 sample over the whole population: one entry per person, 1 for each
# counted tweet and 0 for everyone else. Only its mean is used downstream
# (as `popmean` for the per-state t-tests).
total_tweets = int(normalized_counts['total_count'].sum())

# An int8 NumPy array holds the same values in roughly total_pop bytes. Building
# the equivalent Python list with one append() per person costs several
# gigabytes and hundreds of millions of interpreter-level calls.
# max() mirrors the original loops, which still emitted `total_tweets` ones
# even if that exceeded `total_pop` (range() of a negative count is empty).
pop = np.zeros(max(int(total_pop), total_tweets), dtype=np.int8)
pop[:total_tweets] = 1

In [10]:
# Reserve the column for the per-state p-values.
# NaN (rather than an empty string) keeps the column numeric, so a comparison
# such as `< 0.05` is well defined even for rows that have not been filled yet.
normalized_counts['P-Value'] = np.nan
normalized_counts.head()

Unnamed: 0,State,Total Population,State Abbr,black,jews,muslim,norm_black,norm_jewish,norm_muslim,total_count,total_norm,Normalized Population,P-Value
0,Alabama,4863300,AL,1.0,1.0,4.0,2.1e-05,2.1e-05,8.2e-05,6,0.000123,0.0154567,
1,Alaska,741894,AK,1.0,0.0,1.0,0.000135,0.0,0.000135,2,0.00027,0.00235791,
2,Arizona,6931071,AZ,1.0,0.0,4.0,1.4e-05,0.0,5.8e-05,5,7.2e-05,0.0220286,
3,Arkansas,2988248,AR,3.0,1.0,3.0,0.0001,3.3e-05,0.0001,7,0.000234,0.00949735,
4,California,39250017,CA,30.0,31.0,17.0,7.6e-05,7.9e-05,4.3e-05,78,0.000199,0.124746,


In [None]:
# One-sample t-test per state: the state's 0/1 sample (1 per counted tweet,
# 0 for every other resident) is compared against the mean of the pooled
# sample `pop`.

# The pooled mean does not change between states, so compute it once instead
# of re-reducing the full-population sample on every iteration.
pop_mean = np.mean(pop)

p_values = []
for tweet_count, state_pop in zip(normalized_counts['total_count'],
                                  normalized_counts['Total Population']):
    tweet_count = int(tweet_count)
    state_pop = int(state_pop)

    # Same sample the original built with append(): `tweet_count` ones followed
    # by zeros up to the state population, stored as a compact int8 array.
    # max() mirrors range() being empty for a negative count.
    data = np.zeros(max(state_pop, tweet_count), dtype=np.int8)
    data[:tweet_count] = 1

    # ttest_1samp returns (statistic, p-value); only the p-value is kept.
    _, p_value = stats.ttest_1samp(a=data, popmean=pop_mean)
    p_values.append(float(p_value))

# Assign the whole column at once. DataFrame.set_value (used previously) was
# removed in pandas 1.0, and a list assignment yields a float64 column aligned
# positionally with the frame's rows.
normalized_counts['P-Value'] = p_values

In [None]:


# Display the whole frame (every row, not just head()) to inspect the columns
# added by the cells above.
normalized_counts

In [None]:
# Write the frame to norm_with_p_values.csv, omitting the index column.
# NOTE(review): a later cell writes the same frame to the same path again, so
# this earlier write looks redundant; confirm before removing either one.
normalized_counts.to_csv("norm_with_p_values.csv", index=False)

In [None]:
# Line plot of the per-state p-values, one x position per row of the frame.

# Derive the number of positions from the frame instead of hard-coding 45, so
# the plot keeps working when the number of rows changes.
x_axis = np.arange(len(normalized_counts))
# Coerce to float so the values plot numerically even if the column is stored
# with object dtype.
y_axis = normalized_counts['P-Value'].astype(float).values
x_labels = normalized_counts['State']

fig, ax = plt.subplots(figsize=(20, 2))

ax.plot(x_axis, y_axis, "--o", color='r')

ax.set_title("P-Values for Hypothesis Testing")
ax.set_xlabel("State")
ax.set_ylabel("p-value")

# One tick per state, labelled with the state name and rotated to stay legible.
ax.set_xticks(x_axis)
ax.set_xticklabels(x_labels, rotation=90)

# NOTE: the "Images" directory must already exist; savefig does not create it.
fig.savefig("Images/p-value.png", bbox_inches='tight')

plt.show()

In [None]:
# Show only the rows whose p-value is below the 0.05 cutoff.
normalized_counts[normalized_counts['P-Value'] < 0.05]

In [None]:
# Write the final frame to norm_with_p_values.csv, omitting the index column.
# NOTE: an earlier cell writes to the same path; this call overwrites that file.
normalized_counts.to_csv("norm_with_p_values.csv", index=False)