In [74]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

anes96 = sm.datasets.anes96
print(anes96.NOTE)

dataset_anes96 = anes96.load_pandas()
df = dataset_anes96.data

::

    Number of observations - 944
    Number of variables - 10

    Variables name definitions::

            popul - Census place population in 1000s
            TVnews - Number of times per week that respondent watches TV news.
            PID - Party identification of respondent.
                0 - Strong Democrat
                1 - Weak Democrat
                2 - Independent-Democrat
                3 - Independent-Indpendent
                4 - Independent-Republican
                5 - Weak Republican
                6 - Strong Republican
            age : Age of respondent.
            educ - Education level of respondent
                1 - 1-8 grades
                2 - Some high school
                3 - High school graduate
                4 - Some college
                5 - College degree
                6 - Master's degree
                7 - PhD
            income - Income of household
                1  - None or less than $2,999
                2  - $3,000-$4,9

Q1DataFrame Basic Properties Exercise
Our DataFrame (df) contains data on registered voters in the United States, including demographic information and political preference. Using pandas, print the first 5 rows of the DataFrame to get a sense of what the data looks like. Next, answer the following questions:

How many observations are in the DataFrame?
How many variables are measured (how many columns)?
What is the age of the youngest person in the data? The oldest?
How many days a week does the average respondent watch TV news (round to the nearest tenth)?
Check for missing values. Are there any?

In [75]:
print(df.head(5))
print(df.shape)
print(len(df.columns))
print(len(df))

   popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  vote  \
0    0.0     7.0     7.0     1.0     6.0  6.0  36.0   3.0     1.0   1.0   
1  190.0     1.0     3.0     3.0     5.0  1.0  20.0   4.0     1.0   0.0   
2   31.0     7.0     2.0     2.0     6.0  1.0  24.0   6.0     1.0   0.0   
3   83.0     4.0     3.0     4.0     5.0  1.0  28.0   6.0     1.0   0.0   
4  640.0     7.0     5.0     6.0     4.0  0.0  68.0   6.0     1.0   0.0   

   logpopul  
0 -2.302585  
1  5.247550  
2  3.437208  
3  4.420045  
4  6.461624  
(944, 11)
11
944


In [76]:
print("youngest person in dataset is ",min(df.age) , "oldest person in dataset is ",max(df.age))

youngest person in dataset is  19.0 oldest person in dataset is  91.0


In [77]:
print(df.TVnews.mean())

3.7277542372881354


In [78]:
null_values=df.isnull().sum().sum()
if null_values==0:
    print("there is no null value in dataset")
else:
    print("there are null vallues",null_values)


there is no null value in dataset


Q2Data Processing Exercise
We want to adjust the dataset for our use. Do the following:

Rename the educ column education.
Create a new column called party based on each respondent's answer to PID. party should equal Democrat if the respondent selected either Strong Democrat or Weak Democrat. party will equal Republican if the respondent selected Strong or Weak Republican for PID and Independent if they selected anything else.
Create a new column called age_group that buckets respondents into the following categories based on their age: 18-24, 25-34, 35-44, 45-54, 55-64, and 65 and over.


In [89]:

df = df.rename(columns={'educ': 'education'})
def get_party(df):
  if df['PID'] < 2:
    return "Democrat"
  elif df['PID'] > 4:
    return "Republican"
  else:
    return "Independent"
df['party'] = df.apply(get_party, axis = 1)
def get_agegroup(df):
  if df['age'] < 25:
    return "18-24"
  elif df['age'] < 35:
    return "25-34"
  elif df['age'] < 45:
    return "35-44"
  elif df['age'] < 55:
    return "45-54"
  elif df['age'] < 65:
    return "55-64"
  else:
    return "65 and over"
df['age_group'] = df.apply(get_agegroup, axis = 1)





''

Q3. Filtering Data Exercise
Use the filtering method to find all the respondents who have the impression that Bill Clinton is moderate or conservative (ClinLR equals 4 or higher). How many respondents are in this subset?

Among these respondents, how many have a household income less than $50,000 and attended at least some college?


In [90]:
filtered_df=df[df['ClinLR']>=4]
filtered_df
print("number of respndedent with impression that bill clinton is moderate orconservative ", len(filtered_df))
filter_respondent=filtered_df[(filtered_df['income']<20) & (filtered_df.education>3)]
print(f"Among these respondents, {len(filter_respondent)} have a household income less than $50,000 and attended at least some college")

number of respndedent with impression that bill clinton is moderate orconservative  282
Among these respondents, 98 have a household income less than $50,000 and attended at least some college


4. Calculating From Data Exercise
For each of the below match-ups, choose the group that is more likely to vote for Bill Clinton. You can calculate this using the percentage of each group that intends to vote for Clinton (vote).

Another way to think about this: Given that a respondent is a Democrat, there is a ____ percent chance they will vote for Clinton. How does this value change if the respondent is a Republican?

Which match-up was the closest? Which had the biggest difference?

Democrats or Republicans
People younger than 44 or People 44 and older
People who watch TV news at least 6 days a week or People who watch TV news less than 3 days a week
People who live somewhere with a population greater than the average respondent or People who live in a place with a population equal to or less than the average respondent

In [91]:
df['younger44'] = np.where(df['age'] < 44, True, False)
def get_TVnews(df):
  if df['TVnews'] > 5:
    return ">5"
  if df['TVnews'] < 3:
    return "<3"
  else:
    return "3-5"
df['TVnews_category'] = df.apply(get_TVnews, axis = 1)
avg_population = df.popul.mean()
df['popul_greater'] = np.where(df['popul'] > avg_population, True, False)

In [92]:
def match_ups(Column, ConditionA, ConditionB):
  all_A = df[(df[Column] == ConditionA)]
  clinton_A = all_A[(all_A['vote'] == 0)]
  percent_A = (len(clinton_A) / len(all_A)) * 100

  all_B = df[(df[Column] == ConditionB)]
  clinton_B = all_B[(all_B['vote'] == 0)]
  percent_B = (len(clinton_B) / len(all_B)) * 100

  print(f"{ConditionA} = {percent_A:.2f}%")
  print(f"{ConditionB} = {percent_B:.2f}%")
  print(f"Difference ({ConditionA} minus {ConditionB}) = {percent_A - percent_B:.2f} percent points.")

In [93]:
print("Democrats vs Republicans")
match_ups("party", "Democrat", "Republican")

print("\nYounger than 44 (True) vs 44 and Older (False)")
match_ups("younger44", True, False)

print("\nWatch TV news 6+ days a week (>5) vs Watch TV news less than 3 days a week (<3)")
match_ups("TVnews_category", ">5", "<3")

print("\nLive somewhere more populous than the average respondent (True) vs Live somewhere less populous (False")
match_ups("popul_greater", True, False)

Democrats vs Republicans
Democrat = 96.32%
Republican = 10.46%
Difference (Democrat minus Republican) = 85.85 percent points.

Younger than 44 (True) vs 44 and Older (False)
True = 59.48%
False = 57.29%
Difference (True minus False) = 2.19 percent points.

Watch TV news 6+ days a week (>5) vs Watch TV news less than 3 days a week (<3)
>5 = 57.81%
<3 = 55.50%
Difference (>5 minus <3) = 2.32 percent points.

Live somewhere more populous than the average respondent (True) vs Live somewhere less populous (False
True = 72.34%
False = 55.92%
Difference (True minus False) = 16.43 percent points.


5. Grouping Data Exercise
Use the groupby() method to bucket respondents by age_group. Which age group is the most conservative? Which watches TV news the least?

Next, calculate 5 percentile groups based on income. Group the dataset by these percentiles. Which income bracket is the most liberal? Which is the most conservative? The oldest? Highest educated?

In [99]:
avg_by_age=df.groupby(['age_group'])
print(avg_by_age['selfLR'].mean())
print(avg_by_age['TVnews'].mean())


age_group
18-24          4.000000
25-34          4.217391
35-44          4.257143
45-54          4.285714
55-64          4.532258
65 and over    4.529412
Name: selfLR, dtype: float64
age_group
18-24          2.358491
25-34          2.570652
35-44          3.118367
45-54          3.916667
55-64          4.516129
65 and over    5.523529
Name: TVnews, dtype: float64


In [105]:
income_quants = list(df['income'].quantile(q=[0.2, 0.4, 0.6, 0.8, 1]))
def get_income_quant(df):
  if df['income'] <= income_quants[0]:
    return "Q1"
  elif df['income'] <= income_quants[1]:
    return "Q2"
  elif df['income'] <= income_quants[2]:
    return "Q3"
  elif df["income"] <= income_quants[3]:
    return "Q4"
  else:
    return "Q5"
df['income_quant'] = df.apply(get_income_quant, axis=1)

avg_by_income=df.groupby(['income_quant'])
avg_by_income['selfLR'].mean()


income_quant
Q1    4.282297
Q2    4.295567
Q3    4.099379
Q4    4.497537
Q5    4.422619
Name: selfLR, dtype: float64

In [108]:
avg_by_income['education'].mean()


income_quant
Q1    3.722488
Q2    4.246305
Q3    4.571429
Q4    4.862069
Q5    5.636905
Name: education, dtype: float64

In [109]:
avg_by_income['age'].mean()

income_quant
Q1    50.009569
Q2    48.266010
Q3    45.583851
Q4    45.128079
Q5    45.589286
Name: age, dtype: float64

6. Voting Across the Aisle
We are interested in learning more about respondents who's political views differ strongly from the candidate they expect to vote for. Using selfLR, vote, ClinLR, and DoleLR, work through the following questions. Your interpretation may differ from the answer key.

What is the largest recorded difference between a respondent's political leaning and their impression of their intended candidate's political leaning?
How many respondents exhibit a difference of that magnitude?
Make a separate DataFrame called sway that only includes voters who exhibit a difference greater than |3|.
Among those in sway, are respondents more likely to be voting for a candidate more conservative or more liberal than their own political leaning?
In sway, which candidate is the more popular choice?

In [110]:

def make_sway(df):
  if df['vote'] == 0:
    return df['selfLR'] - df['ClinLR']
  else:
    return df['selfLR'] - df['DoleLR']
df['sway_diff'] = df.apply(make_sway, axis=1)
df.sort_values(by='sway_diff', ascending=False, key=abs)


Unnamed: 0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,education,income,vote,logpopul,party,age_group,younger44,TVnews_category,popul_greater,income_quant,sway_diff
166,2.0,7.0,7.0,1.0,2.0,0.0,62.0,2.0,11.0,0.0,0.741937,Democrat,55-64,False,>5,False,Q1,6.0
638,9.0,3.0,7.0,2.0,5.0,4.0,46.0,3.0,20.0,0.0,2.208274,Independent,45-54,False,3-5,False,Q4,5.0
77,32.0,5.0,3.0,7.0,4.0,1.0,65.0,1.0,5.0,0.0,3.468856,Democrat,65 and over,False,3-5,False,Q1,-4.0
147,720.0,5.0,1.0,5.0,6.0,1.0,64.0,6.0,10.0,0.0,6.579390,Democrat,55-64,False,3-5,True,Q1,-4.0
920,3500.0,7.0,7.0,3.0,5.0,4.0,34.0,7.0,24.0,0.0,8.160547,Independent,25-34,True,>5,True,Q5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,290.0,6.0,5.0,1.0,5.0,5.0,49.0,6.0,20.0,1.0,5.670226,Republican,45-54,False,>5,False,Q4,0.0
258,31.0,4.0,6.0,2.0,6.0,4.0,66.0,4.0,14.0,1.0,3.437208,Independent,65 and over,False,3-5,False,Q2,0.0
644,310.0,7.0,5.0,2.0,5.0,1.0,56.0,6.0,20.0,1.0,5.736895,Democrat,55-64,False,>5,True,Q4,0.0
643,33.0,7.0,4.0,3.0,4.0,2.0,40.0,6.0,20.0,1.0,3.499533,Independent,35-44,True,>5,False,Q4,0.0


In [111]:
sway = df[df['sway_diff'].abs() >= 3]
print(sway.describe())
print("\nAmong the people with the largest `sway_diff`:")
print(f"{len(sway[sway['sway_diff'] > 0])} respondents perceive their candidate as more conservative than them.")
print(f"{len(sway[sway['sway_diff'] < 0])} respondents perceive their candidate as more liberal than them.")
print(f"{len(sway[sway['vote'] == 0])} respondents are voting for Clinton.")
print(f"{len(sway[sway['vote'] == 1])} respondents are voting for Dole.")

             popul     TVnews     selfLR     ClinLR     DoleLR        PID  \
count    56.000000  56.000000  56.000000  56.000000  56.000000  56.000000   
mean    396.178571   3.553571   4.660714   3.089286   4.642857   2.160714   
std    1131.227895   2.916534   1.528835   1.831808   1.823369   2.016088   
min       0.000000   0.000000   1.000000   1.000000   1.000000   0.000000   
25%       1.750000   0.000000   4.000000   2.000000   4.000000   0.000000   
50%      47.500000   3.500000   5.000000   2.000000   5.000000   2.000000   
75%     217.500000   7.000000   6.000000   4.000000   6.000000   4.000000   
max    7300.000000   7.000000   7.000000   7.000000   7.000000   6.000000   

             age  education     income       vote   logpopul  sway_diff  
count  56.000000  56.000000  56.000000  56.000000  56.000000  56.000000  
mean   45.964286   4.071429  15.000000   0.142857   3.037540   1.392857  
std    16.590210   1.704082   5.746145   0.353094   3.332730   2.933683  
min    20.