In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from datetime import datetime

In [2]:
demgop = pd.read_csv('116th_congress.csv', parse_dates=True)

In [3]:
demgop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530 entries, 0 to 529
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Office        530 non-null    object
 1   Name          530 non-null    object
 2   Party         530 non-null    object
 3   Birth Date    530 non-null    object
 4   Date started  530 non-null    object
dtypes: object(5)
memory usage: 20.8+ KB


In [4]:
demgop.head()

Unnamed: 0,Office,Name,Party,Birth Date,Date started
0,Senate,Richard Shelby,Republican,05-06-1934,01-03-1987
1,Senate,Doug Jones,Democratic,05-04-1954,01-03-2018
2,Senate,Lisa Murkowski,Republican,05-22-1957,12-20-2002
3,Senate,Dan Sullivan,Republican,11-13-1964,01-03-2015
4,Senate,Kyrsten Sinema,Democratic,07-12-1976,01-03-2019


In [5]:
#Convert birth date and date started to datetime
demgop['Birth Date'] = demgop['Birth Date'].apply(pd.to_datetime)
demgop['Date started'] = demgop['Date started'].apply(pd.to_datetime)

In [6]:
#Calculate the difference between birth date and today, and the difference between date started and today
demgop['Age'] = pd.Timestamp.today().normalize() - demgop['Birth Date']
demgop['Time Served'] = pd.Timestamp.today().normalize() - demgop['Date started']
demgop.head()

Unnamed: 0,Office,Name,Party,Birth Date,Date started,Age,Time Served
0,Senate,Richard Shelby,Republican,1934-05-06,1987-01-03,31629 days,12394 days
1,Senate,Doug Jones,Democratic,1954-05-04,2018-01-03,24326 days,1071 days
2,Senate,Lisa Murkowski,Republican,1957-05-22,2002-12-20,23212 days,6564 days
3,Senate,Dan Sullivan,Republican,1964-11-13,2015-01-03,20480 days,2167 days
4,Senate,Kyrsten Sinema,Democratic,1976-07-12,2019-01-03,16221 days,706 days


In [7]:
dem_gop_time = demgop[['Party', 'Age', 'Time Served']]
dem_gop_time.head()

Unnamed: 0,Party,Age,Time Served
0,Republican,31629 days,12394 days
1,Democratic,24326 days,1071 days
2,Republican,23212 days,6564 days
3,Republican,20480 days,2167 days
4,Democratic,16221 days,706 days


In [8]:
#Subset Republicans and Democrats (counting the two Independents as Democrats)
gop = dem_gop_time[(dem_gop_time["Party"] == 'Republican')]
gop.head()

Unnamed: 0,Party,Age,Time Served
0,Republican,31629 days,12394 days
2,Republican,23212 days,6564 days
3,Republican,20480 days,2167 days
5,Republican,19986 days,706 days
6,Republican,25567 days,3628 days


In [10]:
dem_ind = dem_gop_time[(dem_gop_time["Party"] == 'Democratic') | (dem_gop_time["Party"] == 'Independent')]
dem_ind.head()

Unnamed: 0,Party,Age,Time Served
1,Democratic,24326 days,1071 days
4,Democratic,16221 days,706 days
8,Democratic,31947 days,10256 days
9,Democratic,20504 days,1436 days
10,Democratic,20465 days,4340 days


In [23]:
#Normalize age and time served for Republicans
norm_gop = preprocessing.normalize(gop[['Age', 'Time Served']], axis=0)

#Convert array to DataFrame
norm_gop = pd.DataFrame(norm_gop)

#Rename column heads
norm_gop.rename(columns={0:'Age (R)', 1:'Time Served (R)'}, inplace=True)
norm_gop.head()

Unnamed: 0,Age (R),Time Served (R)
0,0.172233,0.040355
1,0.126396,0.021369
2,0.111518,0.00705
3,0.108828,0.002293
4,0.13922,0.011808


In [24]:
#Normalize age and time served for Democrats
norm_dem = preprocessing.normalize(dem_ind[['Age', 'Time Served']], axis=0)

#Convert array to DataFrame
norm_dem = pd.DataFrame(norm_dem)

#Rename column heads
norm_dem.rename(columns={0:'Age (D)', 1:'Time Served (D)'}, inplace=True)
norm_dem.head()

Unnamed: 0,Age (D),Time Served (D)
0,0.132007,0.003102
1,0.088021,0.002043
2,0.173366,0.02975
3,0.111265,0.00416
4,0.111053,0.012586


Null hypothesis: There is no statistically significant difference between the mean ages of Republicans and Democrats.<br><br>
Alternate hypothesis: There is a statistically significant difference between the mean ages of Republicans and Democrats.

In [26]:
res_age = stats.ttest_ind(norm_gop['Age (R)'], norm_dem['Age (D)'])
display(res_age)

Ttest_indResult(statistic=0.3206021804515866, pvalue=0.7486389703899009)

Null hypothesis: There is no statistically significant difference between the mean time served of Republicans and Democrats.<br><br>
Alternate hypothesis: There is a statistically significant difference between the mean time served of Republicans and Democrats.

In [27]:
res_term = stats.ttest_ind(norm_gop['Time Served (R)'], norm_dem['Time Served (D)'])
display(res_term)

Ttest_indResult(statistic=1.12037329707839, pvalue=0.26306502934505566)

The p-values from comparing the means between the parties of time served and age are high enough that the null hypotheses cannot be rejected. There is no statistically significant difference between the means of the ages and time served of the parties' members in the 116th Congress. The null hypothesis of time served is closer to being rejected than the null hypothesis of ages.