# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the food-preference dataset from CSV into a DataFrame.
data = pd.read_csv('../input/food-preferences/Food_Preference.csv')

In [None]:
# Display the table as loaded.
data

In [None]:
# Discard the Timestamp and Participant_ID columns before modelling.
data = data.drop(columns=['Timestamp', 'Participant_ID'])

In [None]:
# Display the table after the two columns were dropped.
data

In [None]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

# Preprocessing

## Missing Values

In [None]:
# Remove every row that has at least one missing value, then renumber the
# index from zero so it has no gaps.
data = data.dropna(axis=0).reset_index(drop=True)

In [None]:
# Inspect the raw Age column before it is binned.
data['Age']

In [None]:
# Split Age into two quantile-based bins (q=2, i.e. a cut at the median),
# labelled 0 for the lower bin and 1 for the upper bin.
age_bins = pd.qcut(data['Age'], q=2, labels=[0, 1])

In [None]:
# Show the original ages next to their assigned bin labels.
pd.concat([data['Age'], age_bins], axis=1)

In [None]:
# Overwrite Age with its 0/1 bin label.
data['Age'] = age_bins

## Encoding

In [None]:
# Display the table before the categorical columns are encoded.
data

In [None]:
# Columns whose distinct values are inspected below before choosing an encoding.
categorical_features = ['Gender', 'Nationality', 'Food', 'Juice', 'Dessert']

In [None]:
def get_uniques(df, columns):
    """Map each requested column name to the list of distinct values it holds.

    Values are listed in the order `Series.unique()` returns them.
    """
    distinct_by_column = {}
    for name in columns:
        distinct_by_column[name] = list(df[name].unique())
    return distinct_by_column

In [None]:
# List the distinct values of every categorical column.
get_uniques(data, categorical_features)

In [None]:
# NOTE: these three lists record which encoding each column gets; the
# encoding calls further down name the columns directly.

# Columns encoded as 0/1 with binary_encode.
binary_features = ['Gender', 'Food', 'Juice']

# Columns encoded as integer ranks with ordinal_encode.
ordinal_features = ['Dessert']

# Columns expanded into indicator columns with onehot_encode.
nominal_features = ['Nationality']

In [None]:
def binary_encode(df, column, positive_label):
    """Return a copy of `df` with `column` replaced by a 0/1 indicator.

    Args:
        df: DataFrame containing `column`.
        column: Name of the column to encode.
        positive_label: Value that is encoded as 1; every other value is
            encoded as 0.

    Returns:
        A new DataFrame; `df` itself is left untouched.
    """
    df = df.copy()
    # Vectorised comparison instead of a per-row Python lambda; the cast
    # turns the boolean mask into integer 0/1 values.
    df[column] = (df[column] == positive_label).astype('int64')
    return df

In [None]:
def ordinal_encode(df, column, ordering):
    """Return a copy of `df` with `column` replaced by each value's rank.

    Args:
        df: DataFrame containing `column`.
        column: Name of the column to encode.
        ordering: Sequence of the column's categories, lowest first; a value
            is encoded as its position in this sequence.

    Returns:
        A new DataFrame; `df` itself is left untouched.

    Raises:
        ValueError: If `column` holds a value that is not listed in
            `ordering`.
    """
    df = df.copy()

    # Build the value -> position lookup once instead of scanning `ordering`
    # for every row. setdefault keeps the first position of a repeated
    # entry, matching list.index().
    positions = {}
    for position, value in enumerate(ordering):
        positions.setdefault(value, position)

    encoded = df[column].map(positions)

    # Values absent from the lookup come back as NaN; report them by name
    # rather than letting them slip through as missing data.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column {column!r} contains values missing from ordering: {unknown}"
        )

    df[column] = encoded.astype('int64')
    return df

In [None]:
def onehot_encode(df, column):
    """Return a new DataFrame where `column` is replaced by indicator columns.

    One indicator column is appended per distinct value of `column`, named
    after that value; the original `column` is removed. `df` is not modified.
    """
    indicator_columns = pd.get_dummies(df[column])
    widened = pd.concat([df, indicator_columns], axis=1)
    return widened.drop(columns=column)

In [None]:
# Binary columns: the listed label becomes 1, any other value becomes 0.
positive_labels = {
    'Gender': 'Male',
    'Food': 'Traditional food',
    'Juice': 'Fresh Juice',
}
for feature, label in positive_labels.items():
    data = binary_encode(data, feature, label)

# Dessert answers are ranked No < Maybe < Yes.
dessert_ordering = ['No', 'Maybe', 'Yes']
data = ordinal_encode(data, 'Dessert', dessert_ordering)

# Nationality is expanded into one indicator column per distinct value.
data = onehot_encode(data, 'Nationality')

In [None]:
# Display the fully encoded table.
data

## Scaling and Splitting

In [None]:
# The binned Age column is the target; every other column is a feature.
target_column = 'Age'
X = data.drop(columns=target_column)
y = data[target_column]

In [None]:
scaler = MinMaxScaler()

X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

# Training

In [None]:
# Fit a logistic-regression classifier with default settings on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Evaluate the fitted classifier on the test split.
model.score(X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/r0eaUpurifA