# Coffee Choice prediction

## Preprocessing the data

In [65]:
# Import the libraries
import pandas as pd 
import numpy as np
import math
import matplotlib.pyplot as plt


In [66]:
# Loading the data
data= pd.read_csv("GACTT_RESULTS_ANONYMIZED_v2.csv")
print(data.columns)
print(data.ndim)
print(data.head(4))

Index(['Submission ID', 'What is your age?',
       'How many cups of coffee do you typically drink per day?',
       'Where do you typically drink coffee?',
       'Where do you typically drink coffee? (At home)',
       'Where do you typically drink coffee? (At the office)',
       'Where do you typically drink coffee? (On the go)',
       'Where do you typically drink coffee? (At a cafe)',
       'Where do you typically drink coffee? (None of these)',
       'How do you brew coffee at home?',
       ...
       'Approximately how much have you spent on coffee equipment in the past 5 years?',
       'Do you feel like you’re getting good value for your money with regards to your coffee equipment?',
       'Gender', 'Gender (please specify)', 'Education Level',
       'Ethnicity/Race', 'Ethnicity/Race (please specify)',
       'Employment Status', 'Number of Children', 'Political Affiliation'],
      dtype='object', length=113)
2
  Submission ID What is your age?  \
0        gMR29l   18

**Filtering the data based on the coloumn "Lastly, what was your favorite overall coffee?"**

In [67]:
names=data.columns.to_list()
print(names)
print(data.dtypes)


['Submission ID', 'What is your age?', 'How many cups of coffee do you typically drink per day?', 'Where do you typically drink coffee?', 'Where do you typically drink coffee? (At home)', 'Where do you typically drink coffee? (At the office)', 'Where do you typically drink coffee? (On the go)', 'Where do you typically drink coffee? (At a cafe)', 'Where do you typically drink coffee? (None of these)', 'How do you brew coffee at home?', 'How do you brew coffee at home? (Pour over)', 'How do you brew coffee at home? (French press)', 'How do you brew coffee at home? (Espresso)', 'How do you brew coffee at home? (Coffee brewing machine (e.g. Mr. Coffee))', 'How do you brew coffee at home? (Pod/capsule machine (e.g. Keurig/Nespresso))', 'How do you brew coffee at home? (Instant coffee)', 'How do you brew coffee at home? (Bean-to-cup machine)', 'How do you brew coffee at home? (Cold brew)', 'How do you brew coffee at home? (Coffee extract (e.g. Cometeer))', 'How do you brew coffee at home? (O

In [68]:

missing_value=data.isnull().sum()['Lastly, what was your favorite overall coffee?']
print("Missing Value:",missing_value)
data=data.dropna(subset=["Lastly, what was your favorite overall coffee?"])
print("Data shape:",data.shape)

Missing Value: 272
Data shape: (3770, 113)


In [69]:
flavouring_missing_count=data.isnull().sum()['What kind of flavorings do you add?']
print(flavouring_missing_count)
data=data.loc[:,~data.columns.str.contains('What kind of flavorings do you add?', case=True)] 
print("Data Shape:",data.shape)

3770
Data Shape: (3770, 106)


In [70]:
# Calculate the percentage of missing values for each column
missing_values = data.isnull().sum() / len(data)
missing_values=missing_values[missing_values > 0.6]
# Create a DataFrame with column names and missing values percentage
missing_values_table = pd.DataFrame({'Column Name': missing_values.index, 'Percentage of Missing Values': missing_values.values})

# Sort the DataFrame by missing values percentage in descending order
missing_values_table = missing_values_table.sort_values(by='Percentage of Missing Values', ascending=False)
Missing_value=pd.DataFrame(missing_values_table)
# Print the DataFrame
print(Missing_value)
Missing_value.to_csv("missing_percentage.csv")

                                          Column Name  \
20                   What other flavoring do you use?   
22                            Gender (please specify)   
8                  Where else do you purchase coffee?   
10               What else do you add to your coffee?   
23                    Ethnicity/Race (please specify)   
9   Please specify what your favorite coffee drink is   
21                   Other reason for drinking coffee   
11        What kind of sugar or sweetener do you add?   
12  What kind of sugar or sweetener do you add? (G...   
16  What kind of sugar or sweetener do you add? (S...   
19  What kind of sugar or sweetener do you add? (R...   
18  What kind of sugar or sweetener do you add? (B...   
17  What kind of sugar or sweetener do you add? (A...   
13  What kind of sugar or sweetener do you add? (A...   
14  What kind of sugar or sweetener do you add? (H...   
15  What kind of sugar or sweetener do you add? (M...   
0                How else do yo

In [71]:
# Calculate the percentage of missing values for each column
missing_values = data.isnull().sum() / len(data)

# Drop columns with missing values greater than 60%
data = data.dropna(thresh=0.4)

print(data)

     Submission ID What is your age?  \
15          Zd694B     <18 years old   
16          QAeYZY     >65 years old   
17          QA5JYA   25-34 years old   
19          jyDqva   18-24 years old   
34          ylqbBg   45-54 years old   
...            ...               ...   
4037        PA44VP     >65 years old   
4038        vNgpPD     >65 years old   
4039        g5ggRM   18-24 years old   
4040        rlgbDN   25-34 years old   
4041        0EGYe9   25-34 years old   

     How many cups of coffee do you typically drink per day?  \
15                                                    3        
16                                                    3        
17                                                    1        
19                                                    2        
34                                                    2        
...                                                 ...        
4037                                                  2        
4038   

In [72]:
# Create a list of columns to drop
columns_to_drop = ['Coffee D - Notes', 'Coffee A - Notes', 'Coffee B - Notes', 'Coffee C - Notes',"Where do you typically drink coffee?","How do you brew coffee at home?","On the go, where do you typically purchase coffee?","Do you usually add anything to your coffee?","What kind of dairy do you add?","What kind of sugar or sweetener do you add?","Why do you drink coffee?","How else do you brew coffee at home?","Please specify what your favorite coffee drink is","What else do you add to your coffee?","What other flavoring do you use?","Gender (please specify)","Where else do you purchase coffee?","Other reason for drinking coffee","Ethnicity/Race (please specify)","Number of Children"]

# Drop the specified columns
data = data.drop(columns=columns_to_drop)
print("Data Shape:",data.shape)

Data Shape: (3770, 86)


## Feature Engineering

In [73]:
# Columns with binary responses ( 'True' and 'False' are the values)
binary_columns = ['Where do you typically drink coffee? (At home)',
       'Where do you typically drink coffee? (At the office)',
       'Where do you typically drink coffee? (On the go)',
       'Where do you typically drink coffee? (At a cafe)','Do you like the taste of coffee?', 'Do you know where your coffee comes from?', 'Do you feel like you’re getting good value for your money when you buy coffee at a cafe?', 'Do you feel like you’re getting good value for your money with regards to your coffee equipment?',
       'Where do you typically drink coffee? (None of these)','How do you brew coffee at home? (Pour over)', 'How do you brew coffee at home? (French press)', 'How do you brew coffee at home? (Espresso)', 'How do you brew coffee at home? (Coffee brewing machine (e.g. Mr. Coffee))', 'How do you brew coffee at home? (Pod/capsule machine (e.g. Keurig/Nespresso))', 'How do you brew coffee at home? (Instant coffee)', 'How do you brew coffee at home? (Bean-to-cup machine)', 'How do you brew coffee at home? (Cold brew)', 'How do you brew coffee at home? (Coffee extract (e.g. Cometeer))', 'How do you brew coffee at home? (Other)','On the go, where do you typically purchase coffee? (National chain (e.g. Starbucks, Dunkin))', 'On the go, where do you typically purchase coffee? (Local cafe)', 'On the go, where do you typically purchase coffee? (Drive-thru)', 'On the go, where do you typically purchase coffee? (Specialty coffee shop)', 'On the go, where do you typically purchase coffee? (Deli or supermarket)', 'On the go, where do you typically purchase coffee? (Other)','Do you usually add anything to your coffee? (No - just black)', 'Do you usually add anything to your coffee? (Milk, dairy alternative, or coffee creamer)', 'Do you usually add anything to your coffee? (Sugar or sweetener)', 'Do you usually add anything to your coffee? (Flavor syrup)', 'Do you usually add anything to your coffee? (Other)','What kind of dairy do you add? (Whole milk)', 'What kind of dairy do you add? (Skim milk)', 'What kind of dairy do you add? (Half and half)', 'What kind of dairy do you add? (Coffee creamer)', 'What kind of dairy do you add? (Flavored coffee creamer)', 'What kind of dairy do you add? (Oat milk)', 'What kind of dairy do you add? (Almond milk)', 'What kind of dairy do you add? (Soy milk)', 'What kind of dairy do you add? (Other)','What kind of sugar or sweetener do you add? (Granulated Sugar)', 'What kind of sugar or sweetener do you add? (Artificial Sweeteners (e.g., Splenda))', 'What kind of sugar or sweetener do you add? (Honey)', 'What kind of sugar or sweetener do you add? (Maple Syrup)', 'What kind of sugar or sweetener do you add? (Stevia)', 'What kind of sugar or sweetener do you add? (Agave Nectar)', 'What kind of sugar or sweetener do you add? (Brown Sugar)', 'What kind of sugar or sweetener do you add? (Raw Sugar (Turbinado))','Why do you drink coffee? (It tastes good)', 'Why do you drink coffee? (I need the caffeine)', 'Why do you drink coffee? (I need the ritual)', 'Why do you drink coffee? (It makes me go to the bathroom)', 'Why do you drink coffee? (Other)'
       ]

# Replace 'True' and 'False' with 1 and 0, respectively
data[binary_columns] = data[binary_columns].astype(bool)



print(data.head(2))
print(data.dtypes)

   Submission ID What is your age?  \
15        Zd694B     <18 years old   
16        QAeYZY     >65 years old   

   How many cups of coffee do you typically drink per day?  \
15                                                  3        
16                                                  3        

    Where do you typically drink coffee? (At home)  \
15                                            True   
16                                           False   

    Where do you typically drink coffee? (At the office)  \
15                                               True      
16                                               True      

    Where do you typically drink coffee? (On the go)  \
15                                             False   
16                                             False   

    Where do you typically drink coffee? (At a cafe)  \
15                                              True   
16                                              True   

    Where do you

In [74]:
category_coloumns=["What is your age?","What is your favorite coffee drink?","Before today's tasting, which of the following best described what kind of coffee you like?","How much caffeine do you like in your coffee?","What is the most you've ever paid for a cup of coffee?","Do you work from home or in person?","Lastly, how would you rate your own coffee expertise?","Lastly, what was your favorite overall coffee?","In total, much money do you typically spend on coffee in a month?","What is the most you'd ever be willing to pay for a cup of coffee?","Approximately how much have you spent on coffee equipment in the past 5 years?","Gender","Education Level","Ethnicity/Race","Employment Status","Political Affiliation","How many cups of coffee do you typically drink per day?","How strong do you like your coffee?","What roast level of coffee do you prefer?"
                   ,"Between Coffee A, Coffee B, and Coffee C which did you prefer?","Between Coffee A and Coffee D, which did you prefer?",
                   "Coffee A - Bitterness","Coffee A - Acidity","Coffee A - Personal Preference","Coffee B - Bitterness","Coffee B - Acidity","Coffee B - Personal Preference","Coffee C - Bitterness","Coffee C - Acidity","Coffee C - Personal Preference","Coffee D - Bitterness","Coffee D - Acidity","Coffee D - Personal Preference"]
data[category_coloumns]=data[category_coloumns].astype("category")
data["What is the most you've ever paid for a cup of coffee?"]=data["What is the most you've ever paid for a cup of coffee?"].astype("category")
# Print the modified DataFrame
print(data.dtypes)

Submission ID                                                object
What is your age?                                          category
How many cups of coffee do you typically drink per day?    category
Where do you typically drink coffee? (At home)                 bool
Where do you typically drink coffee? (At the office)           bool
                                                             ...   
Gender                                                     category
Education Level                                            category
Ethnicity/Race                                             category
Employment Status                                          category
Political Affiliation                                      category
Length: 86, dtype: object


## Filtered Dataset

In [75]:
# Filtered dataset
data.to_csv("filtered.csv",index=False)

In [76]:
basic_stats_filtered=data.describe()
basic_filtered=pd.DataFrame(basic_stats_filtered)
basic_filtered.to_csv("filtered_stats.csv",index=True)

### Missing value imputation

In [77]:
missing_values = data.isnull().sum() / len(data)
#missing_values=missing_values[missing_values > 0.6]
# Create a DataFrame with column names and missing values percentage
missing_values_table = pd.DataFrame({'Column Name': missing_values.index, 'Percentage of Missing Values': missing_values.values})

# Sort the DataFrame by missing values percentage in descending order
missing_values_table = missing_values_table.sort_values(by='Percentage of Missing Values', ascending=False)
Missing_value=pd.DataFrame(missing_values_table)
# Print the DataFrame
print(Missing_value)


                                          Column Name  \
85                              Political Affiliation   
83                                     Ethnicity/Race   
84                                  Employment Status   
82                                    Education Level   
79  Approximately how much have you spent on coffe...   
..                                                ...   
28  Do you usually add anything to your coffee? (F...   
27  Do you usually add anything to your coffee? (S...   
26  Do you usually add anything to your coffee? (M...   
25  Do you usually add anything to your coffee? (N...   
43  What kind of sugar or sweetener do you add? (S...   

    Percentage of Missing Values  
85                      0.127586  
83                      0.093369  
84                      0.093103  
82                      0.088064  
79                      0.070292  
..                           ...  
28                      0.000000  
27                      0.000000  


In [78]:
object_columns = data.select_dtypes(include='object').columns
num_object_columns = len(object_columns)
print(f"Number of object columns: {num_object_columns}")
print("Object columns:", object_columns.tolist())

Number of object columns: 1
Object columns: ['Submission ID']


In [80]:
categorical_columns = data.select_dtypes(include=['category']).columns
boolean_columns = data.select_dtypes(include=['bool']).columns
integer_columns = data.select_dtypes(include=['int', 'float']).columns


# Impute missing values with mode for categorical and boolean columns
for col in categorical_columns.union(boolean_columns):
    data[col].fillna(data[col].mode()[0], inplace=True)

# Impute missing values with median for integer columns
for col in integer_columns:
    data[col].fillna(data[col].median(), inplace=True)

print(data.head())

   Submission ID What is your age?  \
15        Zd694B     <18 years old   
16        QAeYZY     >65 years old   
17        QA5JYA   25-34 years old   
19        jyDqva   18-24 years old   
34        ylqbBg   45-54 years old   

   How many cups of coffee do you typically drink per day?  \
15                                                  3        
16                                                  3        
17                                                  1        
19                                                  2        
34                                                  2        

    Where do you typically drink coffee? (At home)  \
15                                            True   
16                                           False   
17                                            True   
19                                           False   
34                                            True   

    Where do you typically drink coffee? (At the office)  \
15           

In [81]:
# Processed Dataset after feature engineering and missing value imputation
data.to_csv('processed_data.csv',index=False)

In [82]:
processed_stats= pd.DataFrame(data.describe())
processed_stats.to_csv("processed_stats.csv",index=True)

In [83]:
missing_values = data.isnull().sum() / len(data)
#missing_values=missing_values[missing_values > 0.6]
# Create a DataFrame with column names and missing values percentage
missing_values_table = pd.DataFrame({'Column Name': missing_values.index, 'Percentage of Missing Values': missing_values.values})

# Sort the DataFrame by missing values percentage in descending order
missing_values_table = missing_values_table.sort_values(by='Percentage of Missing Values', ascending=False)
Missing_value=pd.DataFrame(missing_values_table)
# Print the DataFrame
print(Missing_value)


                                          Column Name  \
0                                       Submission ID   
54                     Coffee A - Personal Preference   
62                                 Coffee D - Acidity   
61                              Coffee D - Bitterness   
60                     Coffee C - Personal Preference   
..                                                ...   
27  Do you usually add anything to your coffee? (S...   
26  Do you usually add anything to your coffee? (M...   
25  Do you usually add anything to your coffee? (N...   
24                What is your favorite coffee drink?   
85                              Political Affiliation   

    Percentage of Missing Values  
0                            0.0  
54                           0.0  
62                           0.0  
61                           0.0  
60                           0.0  
..                           ...  
27                           0.0  
26                           0.0  


## Dimensionality Reduction

In [84]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3770 entries, 15 to 4041
Data columns (total 86 columns):
 #   Column                                                                                            Non-Null Count  Dtype   
---  ------                                                                                            --------------  -----   
 0   Submission ID                                                                                     3770 non-null   object  
 1   What is your age?                                                                                 3770 non-null   category
 2   How many cups of coffee do you typically drink per day?                                           3770 non-null   category
 3   Where do you typically drink coffee? (At home)                                                    3770 non-null   bool    
 4   Where do you typically drink coffee? (At the office)                                              3770 non-null   bool    
 

In [None]:
# Step 2: Encode True/False to 1/0 for binary columns
data[binary_columns] = data[binary_columns].astype(int)