# Reading in the FiveThirtyEight Dataset

In [77]:
import pandas as pd
data=pd.read_csv("thanksgiving-2015-poll-data.csv", encoding="Latin-1")
data.head() #Printing the first 5 rows to verify

Unnamed: 0,RespondentID,Do you celebrate Thanksgiving?,What is typically the main dish at your Thanksgiving dinner?,What is typically the main dish at your Thanksgiving dinner? - Other (please specify),How is the main dish typically cooked?,How is the main dish typically cooked? - Other (please specify),What kind of stuffing/dressing do you typically have?,What kind of stuffing/dressing do you typically have? - Other (please specify),What type of cranberry saucedo you typically have?,What type of cranberry saucedo you typically have? - Other (please specify),...,Have you ever tried to meet up with hometown friends on Thanksgiving night?,"Have you ever attended a ""Friendsgiving?""",Will you shop any Black Friday sales on Thanksgiving Day?,Do you work in retail?,Will you employer make you work on Black Friday?,How would you describe where you live?,Age,What is your gender?,How much total combined money did all members of your HOUSEHOLD earn last year?,US Region
0,4337954960,Yes,Turkey,,Baked,,Bread-based,,,,...,Yes,No,No,No,,Suburban,18 - 29,Male,"$75,000 to $99,999",Middle Atlantic
1,4337951949,Yes,Turkey,,Baked,,Bread-based,,Other (please specify),Homemade cranberry gelatin ring,...,No,No,Yes,No,,Rural,18 - 29,Female,"$50,000 to $74,999",East South Central
2,4337935621,Yes,Turkey,,Roasted,,Rice-based,,Homemade,,...,Yes,Yes,Yes,No,,Suburban,18 - 29,Male,"$0 to $9,999",Mountain
3,4337933040,Yes,Turkey,,Baked,,Bread-based,,Homemade,,...,Yes,No,No,No,,Urban,30 - 44,Male,"$200,000 and up",Pacific
4,4337931983,Yes,Tofurkey,,Baked,,Bread-based,,Canned,,...,Yes,No,No,No,,Urban,30 - 44,Male,"$100,000 to $124,999",Pacific


In [9]:
data.columns #Printing a list of the columns

Index(['RespondentID', 'Do you celebrate Thanksgiving?',
       'What is typically the main dish at your Thanksgiving dinner?',
       'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       'How is the main dish typically cooked?',
       'How is the main dish typically cooked? - Other (please specify)',
       'What kind of stuffing/dressing do you typically have?',
       'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       'What type of cranberry saucedo you typically have?',
       'What type of cranberry saucedo you typically have? - Other (please specify)',
       'Do you typically have gravy?',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       'Which of these side dishes aretypically served

# Computing the number of people who celebrate Thanksgiving

In [10]:
data["Do you celebrate Thanksgiving?"].value_counts() 

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64

# Removing rows of people who don't celebrate Thanksgiving, from the Dataset

In [12]:
data=data[data["Do you celebrate Thanksgiving?"]=="Yes"]

# Finding the distribution of the main dish at Thanksgiving dinner

In [13]:
data["What is typically the main dish at your Thanksgiving dinner?"].value_counts()

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

# Checking whether people who eat Tofurkey as the main dish, have gravy or not

In [78]:
data["Do you typically have gravy?"][data["What is typically the main dish at your Thanksgiving dinner?"]=="Tofurkey"].value_counts()

Yes    12
No      8
Name: Do you typically have gravy?, dtype: int64

# Finding the number of people who have (False) and don't have (True) any pies at all

In [20]:
apple_isnull=pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"])
pumpkin_isnull=pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"])
pecan_isnull=pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"])
no_pies=apple_isnull & pumpkin_isnull & pecan_isnull
no_pies.value_counts()

False    876
True     104
dtype: int64

# Computing statistics on the ages of people who celebrate Thanksgiving
-----
**The "Age" column had values "18-29", "30-44", "45-59", "60+", and so the lower bound of each value, i.e. 18,30,45,60 is to be representative of the whole range**

In [26]:
def age_to_int(ele):
    if pd.isnull(ele):
        return None
    elif ele.endswith("+"):
        age=int(ele.split("+")[0])
    else:
        age=int(ele.split(" ")[0])
    return age
data["int_age"]=data["Age"].apply(age_to_int)
data["int_age"].describe()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

# Computing statistics on the annual incomes of people who celebrate Thanksgiving
-----
**The column had values**
- "$0 to $9,999" 
- "$10,000 to $24,999"
- "$25,000 to $49,999"
- "$50,000 to $74,999"
- "$75,000 to $99,999"
- "$100,000 to $124,999"
- "$125,000 to $149,999"
- "$150,000 to $174,999"
- "$175,000 to $199,999"
- "$200,000 and up",
and so the lower bound of each value, i.e. 18,30,45,60 is be representative of the whole range**

In [35]:
def inc_to_int(ele):
    if pd.isnull(ele):
        return None
    elif ele.split(" ")[0] == "Prefer":
        return None
    else:
        inc=ele.split(" ")[0]
        inc=inc[1:]
        inc=inc.replace(",","")
    return int(inc)
data["int_income"]=data["How much total combined money did all members of your HOUSEHOLD earn last year?"].apply(inc_to_int)
data["int_income"].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

# Computing how far people with income < 150,000 will travel for Thanksgiving

In [36]:
data[data["int_income"]<150000]["How far will you travel for Thanksgiving?"].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

# Computing how far people with income > 150,000 will travel for Thanksgiving

In [37]:
data[data["int_income"]>150000]["How far will you travel for Thanksgiving?"].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64

# Finding the link between age and celebrating Thanksgiving with friends

In [39]:
import numpy as np
data.pivot_table(index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"', values="int_age", aggfunc=np.mean)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


# Finding the link between income and celebrating Thanksgiving with friends

In [40]:
data.pivot_table(index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"', values="int_income", aggfunc=np.mean)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


# Finding the most commonly eaten dessert

In [66]:
desserts=["Apple cobbler", "Blondies", "Brownies", "Carrot cake", "Cheesecake", "Cookies", "Fudge", "Ice cream", "Peach cobbler", "None", "Other (please specify).1"]
dessert_count={}
for ele in desserts:
    string="Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - "+ele
    ser=data[string].value_counts()
    for idx,val in enumerate(ser.index):
        dessert_count[val]=ser[idx]
dessert_count
    

{'Apple cobbler': 110,
 'Banana Pudding': 1,
 'Banana bread': 1,
 'Berry cobbler': 1,
 'Blondies': 16,
 'Blueberry Cream Cheese Cake': 1,
 'Brownies': 128,
 'Cake (not carrot)': 1,
 'Carrot cake': 72,
 'Cheesecake': 191,
 'Choc cake': 1,
 'Chocolate Cake': 1,
 'Chocolate Cake with Marshmellows, Caremal Cake': 1,
 'Chocolate Oreo pudding cake': 1,
 'Chocolate trifle, bread pudding ': 1,
 'Cookies': 204,
 'Date balls': 1,
 'Flan': 2,
 'Fruit salad': 1,
 'Fudge': 43,
 'German Chocolate cake': 1,
 'Green cake': 1,
 'Homemade Pumpkin Pie  and Pecan Pie': 1,
 "I don't know": 1,
 'Ice cream': 266,
 'Jelly roll, sweet cheeseball, chocolate dipped berries': 1,
 'Just pie': 1,
 'Lefse': 1,
 'Lefse or Krumkakke': 1,
 'Lemon Pound, Sour Cream Pound, Rum Cake': 1,
 'None': 295,
 'Other bars': 1,
 'PIES': 1,
 'PUMPKIN PIE, OF COURSE': 1,
 'Peach cobbler': 103,
 'Pecan and pumpkin pie': 1,
 'Pecan pie': 1,
 'Pie': 12,
 'Pie is dessert': 1,
 'Pie only': 1,
 'Pie, pumpkin': 1,
 'Pies': 3,
 'Pies and dr

In [73]:
print(dessert_count.get(max(dessert_count.values())))

None


# Computing the number of people who will be working on Black Friday

In [76]:
data["Will you employer make you work on Black Friday?"].value_counts()

Yes              43
No               20
Doesn't apply     7
Name: Will you employer make you work on Black Friday?, dtype: int64