In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import scikit_posthocs as sp
import matplotlib.pyplot as plt

In [None]:
# Data downloaded from https://www.kaggle.com/datasets/sujaykapadnis/lets-do-some-coffee-tasting

# Load the survey results; the CSV is read from the current working directory.
df = pd.read_csv('GACTT_RESULTS_ANONYMIZED_v2.csv')

# Ocena danych i generowanie pytań 

In [None]:
# Print every column name on its own line (the survey questions are the headers).
for column_name in df.columns:
    print(column_name)

In [None]:
# Change the display option so that all columns are shown (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Preview the last three rows of the survey data.
df.tail(3)

In [None]:
# Distinct answers in the 'Gender' column.
df['Gender'].unique()

In [None]:
# Distinct answers in the 'Employment Status' column.
df['Employment Status'].unique()

In [None]:
# Distinct answers to the favorite-coffee-drink question.
df['What is your favorite coffee drink?'].unique()

In [None]:
# Distinct answers to the equipment-spending question.
df['Approximately how much have you spent on coffee equipment in the past 5 years?'].unique()

In [None]:
# Distinct answers to the self-rated coffee expertise question.
df['Lastly, how would you rate your own coffee expertise?'].unique()

In [None]:
# Distinct answers to the pre-tasting coffee preference question.
df["Before today's tasting, which of the following best described what kind of coffee you like?"].unique()

In [None]:
# Distinct answers in the 'Education Level' column.
df['Education Level'].unique()

In [None]:
# Number of respondents per 'Ethnicity/Race' answer.
df['Ethnicity/Race'].value_counts()

In [None]:
# Number of respondents per 'Political Affiliation' answer.
df['Political Affiliation'].value_counts()

In [None]:
# Number of respondents per answer to the maximum-price question.
df["What is the most you'd ever be willing to pay for a cup of coffee?"].value_counts()

In [None]:
# 1. Czy wraz z wiekiem rośnie ilość pitej kawy? 
# 2. Czy osoby o różnej etniczności piją różną ilość kawy? 
# 3. Czy osoby, które posiadają droższy ekspres, sądzą, że mają większą wiedzę na temat kawy?
# 4. Czy republikanie bardziej niż demokraci wolą gorzką kawę? 
# 5. Czy kaukaskie kobiety są w stanie zapłacić za kawę więcej niż azjatyckie kobiety?

In [None]:
# 1. analiza korelacji - zmienne ilościowe? : "What is your age?", 
#                                             "How many cups of coffee do you typically drink per day?"
# pearson / spearman
#
# 2. porównanie grup - zmienna nominalna: "Ethnicity/Race" (3 największe grupy w zbiorze danych) , 
#                      zmienna ilościowa: "How many cups of coffee do you typically drink per day?"
# anova / kruskal-wallis
#
# 3. analiza korelacji - zmienna porządkowa: 'Lastly, how would you rate your own coffee expertise?'
#                        zmienna porządkowa: "Approximately how much have you spent on coffee equipment in the past 5 years?"
# spearman
#
# 4. porównanie grup - zmienna nominalna: "Political Affiliation",
#                     zmienna nominalna: "Do you usually add anything to your coffee? (Sugar or sweetener)"
# chi kwadrat
#
# 5. porównanie grup - zmienne jakościowe: "Gender", "Ethnicity/Race"
#                      zmienna porządkowa: "What is the most you'd ever be willing to pay for a cup of coffee?"
# t-test / mann-whitney
                        

# 1. Czy wraz z wiekiem rośnie ilość pitej kawy? 

In [None]:
# Distinct age brackets present in the data (needed to build the ordinal coding).
df["What is your age?"].unique()

In [None]:
# Distinct answers to the cups-per-day question (needed to build the ordinal coding).
df["How many cups of coffee do you typically drink per day?"].unique()

In [None]:
# Encode the ordinal age brackets as consecutive integers (1 = youngest, 7 = oldest).
# Series.map is used instead of Series.replace: it builds the numeric column directly,
# and any answer missing from the dictionary (including NaN) becomes NaN instead of
# being left behind as a string that would break the numeric statistics and plots.
age_codes = {
    '<18 years old': 1,
    '18-24 years old': 2,
    '25-34 years old': 3,
    '35-44 years old': 4,
    '45-54 years old': 5,
    '55-64 years old': 6,
    '>65 years old': 7,
}
df['age_recoded'] = df["What is your age?"].map(age_codes)

In [None]:
# Encode the ordinal cups-per-day answers as consecutive integers (1 = 'Less than 1', 6 = 'More than 4').
# Series.map is used instead of Series.replace: it builds the numeric column directly,
# and any answer missing from the dictionary (including NaN) becomes NaN instead of
# being left behind as a string that would break the numeric statistics and plots.
cups_codes = {
    'Less than 1': 1,
    '1': 2,
    '2': 3,
    '3': 4,
    '4': 5,
    'More than 4': 6,
}
df['number_of_cups_recoded'] = df["How many cups of coffee do you typically drink per day?"].map(cups_codes)

In [None]:
# Spearman rank correlation between coded age and coded cups per day;
# nan_policy='omit' leaves out respondents with a missing value.
stats.spearmanr(df['age_recoded'], df['number_of_cups_recoded'], nan_policy='omit')

In [None]:
# Plot coded cups per day against coded age. Both variables are integer codes,
# so without jitter many points fall on the same grid positions.
sns.regplot(data=df, x='age_recoded', y='number_of_cups_recoded')

In [None]:
# Tick labels shown in place of the integer codes (same order as the codes).
opisy_wieku = ['<18','18-24','25-34','35-44','45-54','55-64','>65']
opisy_liczby_kubkow = ['Mniej niż 1', '1', '2', '3', '4', 'Więcej niż 4']

# Scatter only (fit_reg=False); jitter spreads the points around each integer code.
ax = sns.regplot(
    data=df,
    x='age_recoded',
    y='number_of_cups_recoded',
    x_jitter=0.25,
    y_jitter=0.25,
    fit_reg=False,
)

# Replace the axis ticks: (list of positions, list of labels).
ax.set_xticks(range(1, 8), opisy_wieku)
ax.set_yticks(range(1, 7), opisy_liczby_kubkow)

ax.set_xlabel("Wiek")
ax.set_ylabel("Liczba kubków kawy")

# 2. Czy osoby o różnej etniczności piją różną ilość kawy? 


In [None]:
# Group sizes per ethnicity; the three largest groups are compared below.
df["Ethnicity/Race"].value_counts()

In [None]:
# 'Ethnicity/Race' values of the rows matching 'White/Caucasian', as a NumPy array.
df.loc[df["Ethnicity/Race"] == 'White/Caucasian', "Ethnicity/Race"].to_numpy()

In [None]:
# Kruskal-Wallis test: does the coded number of cups per day differ between the
# three compared ethnic groups?
# Missing answers are dropped from each sample first; otherwise NaN values
# propagate through the test and the result is nan.
cups_by_ethnicity = [
    df.loc[df["Ethnicity/Race"] == ethnicity, 'number_of_cups_recoded'].dropna()
    for ethnicity in ('White/Caucasian', 'Asian/Pacific Islander', 'Hispanic/Latino')
]
stats.kruskal(*cups_by_ethnicity)

In [None]:
# Build a smaller DataFrame that contains only the three compared ethnicities.
df_posthocs = df.loc[
    df["Ethnicity/Race"].isin(['White/Caucasian', 'Asian/Pacific Islander', 'Hispanic/Latino'])
]

# The isin(...) mask above is equivalent to chaining equality tests with "|":
# (df["Ethnicity/Race"] == 'White/Caucasian') | (df["Ethnicity/Race"] == 'Asian/Pacific Islander') | (df["Ethnicity/Race"] == 'Hispanic/Latino')

In [None]:
# Confirm that "Ethnicity/Race" now contains only the 3 selected values.
df_posthocs["Ethnicity/Race"].unique()

In [None]:
# Post hoc tests (Dunn, Bonferroni-adjusted) showing which pairs of groups differ significantly.
# Rows with a missing number of cups are dropped first so that the test's sample
# size and ranks are based on observed answers only.
sp.posthoc_dunn(
    df_posthocs.dropna(subset=['number_of_cups_recoded']),
    val_col='number_of_cups_recoded',
    group_col='Ethnicity/Race',
    p_adjust='bonferroni',
)

In [None]:
# Box plot of coded cups per day for each of the three ethnic groups.
sns.boxplot(
    data=df_posthocs,
    x="Ethnicity/Race",
    y="number_of_cups_recoded",
)

# Optional styling: pass medianprops={"color": "r", "linewidth": 2} to highlight the median.

In [None]:
# Violin plot of coded cups per day for each of the three ethnic groups, with Polish labels.
# The labels used to sit in a triple-quoted string, which is not a comment: as the
# last expression of the cell it was echoed as the cell's output.

# Data value -> label shown on the x axis. The dictionary order is also passed as
# `order`, so each tick label is guaranteed to sit under the matching group.
opisy_etnicznosci = {
    'White/Caucasian': 'Biały/Kaukaski',
    'Asian/Pacific Islander': 'Azjatycki/Z wysp \nPacyfiku',
    'Hispanic/Latino': 'Hiszpański/Latynoski',
}
opisy_liczby_kubkow = ['Mniej niż 1', '1', '2', '3', '4', 'Więcej niż 4']

ax = sns.violinplot(
    data=df_posthocs,
    x="Ethnicity/Race",
    y="number_of_cups_recoded",
    order=list(opisy_etnicznosci),
)

# Replace the axis ticks: (list of positions, list of labels).
ax.set_xticks(range(len(opisy_etnicznosci)), list(opisy_etnicznosci.values()))
ax.set_yticks(range(1, 7), opisy_liczby_kubkow)

ax.set_xlabel("Etniczność", fontweight='bold')
ax.set_ylabel("Liczba kubków kawy", fontweight='bold')

# 4. Czy republikanie bardziej niż demokraci wolą gorzką kawę?

In [None]:
# Distinct answers in the "Political Affiliation" column.
df["Political Affiliation"].unique()

In [None]:
# Distinct values in the "adds sugar or sweetener" column.
df["Do you usually add anything to your coffee? (Sugar or sweetener)"].unique()

In [None]:
# Keep only respondents who identify as Democrat or Republican.
df_for_4 = df[df["Political Affiliation"].isin(['Democrat', 'Republican'])]

In [None]:
# Contingency table: political affiliation (rows) x adds sugar or sweetener (columns).
crosstab = pd.crosstab(
    index=df_for_4["Political Affiliation"],
    columns=df_for_4["Do you usually add anything to your coffee? (Sugar or sweetener)"],
)
crosstab

In [None]:
# Chi-square test of independence on the contingency table. The result holds, in order:
#     the chi^2 test statistic,
#     the p-value,
#     the number of degrees of freedom,
#     the expected frequencies.

stats.chi2_contingency(crosstab)

In [None]:
# sns.countplot(data= df_for_4, x="Political Affiliation", hue="Do you usually add anything to your coffee? (Sugar or sweetener)")

# 5. Czy kaukaskie kobiety są w stanie zapłacić za kawę więcej niż azjatyckie kobiety?

In [None]:
# Distinct answers in the "Gender" column (to find the exact label for women).
df["Gender"].unique()

In [None]:
# Distinct answers in the "Ethnicity/Race" column (to find the exact group labels).
df["Ethnicity/Race"].unique()

In [None]:
# Distinct price brackets present in the data (needed to build the ordinal coding).
df["What is the most you'd ever be willing to pay for a cup of coffee?"].unique()

In [None]:
# Encode the ordinal price brackets as consecutive integers (0 = cheapest, 7 = most expensive).
# Series.map is used instead of Series.replace: it builds the numeric column directly,
# and any answer missing from the dictionary (including NaN) becomes NaN instead of
# being left behind as a string that would break the numeric statistics and plots.
price_codes = {
    'Less than $2': 0,
    '$2-$4': 1,
    '$4-$6': 2,
    '$6-$8': 3,
    '$8-$10': 4,
    '$10-$15': 5,
    '$15-$20': 6,
    'More than $20': 7,
}
df['najwiecej_za_kawe'] = df["What is the most you'd ever be willing to pay for a cup of coffee?"].map(price_codes)

In [None]:
# Coded maximum price for women in the 'Asian/Pacific Islander' group.
df.loc[
    (df["Gender"] == 'Female') & (df["Ethnicity/Race"] == 'Asian/Pacific Islander'),
    'najwiecej_za_kawe',
]

In [None]:
# Mann-Whitney U test comparing the coded maximum price between Asian/Pacific Islander
# women (first sample) and White/Caucasian women (second sample); missing answers are omitted.
stats.mannwhitneyu(
    *(
        df.loc[(df["Gender"] == 'Female') & (df["Ethnicity/Race"] == ethnicity), 'najwiecej_za_kawe']
        for ethnicity in ('Asian/Pacific Islander', 'White/Caucasian')
    ),
    nan_policy='omit',
)

In [None]:
# Women from the two compared ethnic groups only (used for the plot below).
df_for_5 = df.loc[
    df["Gender"].eq('Female')
    & df["Ethnicity/Race"].isin(['White/Caucasian','Asian/Pacific Islander'])
]

In [None]:
# Polish tick label -> integer code of each price bracket.
slownik_replace = {
    'Mniej niż $2': 0,
    '$2-$4': 1,
    '$4-$6': 2,
    '$6-$8': 3,
    '$8-$10': 4,
    '$10-$15': 5,
    '$15-$20': 6,
    'Więcej niż $20': 7,
}
etykiety = [etykieta for etykieta in slownik_replace]
wartosci = [slownik_replace[etykieta] for etykieta in etykiety]

# Box plot of the coded maximum price for the two groups of women.
ax = sns.boxplot(data=df_for_5, x="Ethnicity/Race", y='najwiecej_za_kawe')

# Replace the y-axis ticks: (list of positions, list of labels).
ax.set_yticks(wartosci, etykiety)

ax.set_ylabel("Max. cena za kawę", fontweight='bold')