In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
This is meant to be a very brief introduction to the Machine Learning workflow for building models.
The models built here are in no way state-of-the-art or very useful, but they demonstrate the workflow of data processing, feature engineering, model building and verification.

# Data preprocessing and feature engineering
This will probably take 90% of the ML time...

In [None]:
# Read the raw beer data; `dataset` stays untouched, all further work uses `data`
dataset = pd.read_csv('/kaggle/input/beer-efficiency/beer_efficiency.csv')

# Work on a copy and drop every row that has a NaN in any column used later on
data = dataset.copy().dropna(subset=['name', 'calories', 'abv', 'efficiency'])
data

In [None]:
# Look at the data
def plot_hist(df, col, bins=75):
    """Show a histogram of one column of a frame.

    Parameters
    ----------
    df : mapping of column name -> values (e.g. a pandas DataFrame)
        Frame holding the column to plot.
    col : str
        Name of the column to plot; also used as the x-axis label.
    bins : int, optional
        Number of histogram bins (default 75).
    """
    # Read the column from the `df` argument, not from the notebook-level
    # `data` frame, so the function plots whatever frame the caller passes in.
    plt.hist(df[col], bins=bins)
    plt.xlabel(col)
    plt.ylabel('count')
    plt.show()
    
# One histogram per numeric column of interest, in this order
for numeric_col in ('calories', 'abv', 'efficiency'):
    plot_hist(data, numeric_col)


In [None]:
# Count how often each space-separated token occurs across all beer names
words = {}
for beer_name in data['name']:
    for token in beer_name.split(' '):
        words[token] = words.get(token, 0) + 1

# One row per word, most frequent first, with the count in a 'count' column
word_count = (
    pd.DataFrame.from_dict(words, orient='index')
    .sort_values(by=0, ascending=False)
    .rename(columns={0: 'count'})
)

# Show only the words that appear at least five times
word_count[word_count['count'] >= 5]

In [None]:
# Words from the beer names that denote a beer type and occur often enough
# in the sample to be usable as features
beer_types = ['Stout', 'Wheat', 'Porter', 'Pale', 'Light', 'Lager', 'IPA', 'Beer', 'Ale', 'Amber']

# Add one 0/1 indicator column per beer type: 1 when the (case-sensitive)
# type word occurs anywhere in the name, 0 otherwise.
for type_name in beer_types:
    name_has_type = data['name'].str.contains(type_name)
    data[type_name] = name_has_type * 1

data

# Prepare dataset for modelling
Separate the dataset into training and testing parts, then scale/normalize the features.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features: the 0/1 beer-type indicator columns plus the two numeric columns
feature_cols = beer_types + ['calories', 'abv']
label_col = 'efficiency'

X = data[feature_cols]
y = data[label_col]


# Divide data into training and test sets
test_fraction = 0.3
# Fixed seed so the split, and every score reported further down, is
# reproducible across "Restart & Run All". Change it to draw a different split.
split_seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_fraction,
                                                    shuffle=True,
                                                    random_state=split_seed)

# Data normalization/scaling - This helps the model fit converge and can give better models
# Scaling ONLY based on training data, such that test data remains hidden from model
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Short aliases used by the modelling cells below
Xtrain = X_train_scaled
Xtest = X_test_scaled

n_features = X_train.shape[1]
n_train = X_train.shape[0]
n_test = X_test.shape[0]

print(f'{n_features} training features:', feature_cols)
print('Training data examples:', n_train)
print('Testing data examples:', n_test)


# Linear regression

Let's try the most basic linear regression.

In [None]:
from sklearn.linear_model import LinearRegression 

# Least-squares linear fit (with intercept) on the scaled training features
lr_model = LinearRegression(fit_intercept=True).fit(Xtrain, y_train)

# Keep the learned parameters under short names
w, b = lr_model.coef_, lr_model.intercept_
linear_coefficients = w

print('Linear feature weights w:\n', w)
print('Bias/intercept term:\n', b)

# One bar per feature showing its learned weight
plt.bar(feature_cols, w)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Model score on training and test data.
# RMSE = sqrt(mean(residual**2)): the mean has to be taken BEFORE the square
# root. Dividing sqrt(sum of squares) by n instead understates the error by a
# factor of sqrt(n).
prediction_train = lr_model.predict(Xtrain)
residual_train = y_train - prediction_train
RMSE_train = np.round(np.sqrt(np.mean(residual_train**2)), 2)
R2_train = np.round(lr_model.score(Xtrain, y_train), 2)

prediction_test = lr_model.predict(Xtest)
residual_test = y_test - prediction_test
RMSE_test = np.round(np.sqrt(np.mean(residual_test**2)), 2)
R2_test = np.round(lr_model.score(Xtest, y_test), 2)

print(f'Training R2 and RMSE: ({R2_train}, {RMSE_train})')
print(f'Test R2 and RMSE: ({R2_test}, {RMSE_test})')

# Signed error relative to the true value, per example
error_rel_train = residual_train/y_train
error_rel_test = residual_test/y_test
plt.plot(error_rel_train, 'b.', label='Training')
plt.plot(error_rel_test, 'r*', label='Test')
plt.axhline(0)
plt.ylabel('Relative prediction error')
plt.legend()
plt.show()

# Use a dedicated name here: reusing `y` would overwrite the label Series
# defined in the data-preparation cell.
relative_errors = [error_rel_train, error_rel_test]
plt.boxplot(relative_errors, labels=['Training', 'Test'])
plt.axhline(0)
plt.ylabel('Relative prediction error')
plt.show()

# Logistic classification
Let's try to just classify the beers as efficient or non-efficient. If the efficiency is 70% or higher the beer is deemed efficient; if it is below 70% it is deemed non-efficient.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

# Turn the continuous efficiency label into two discrete classes:
# 1 ("efficient") when efficiency >= 70, otherwise 0
efficiency_threshold_mask_train = y_train >= 70
efficiency_threshold_mask_test = y_test >= 70
y_train_class = efficiency_threshold_mask_train * 1
y_test_class = efficiency_threshold_mask_test * 1

# Fit a logistic-regression classifier with default settings apart from max_iter
log_model = LogisticRegression(max_iter=1000).fit(Xtrain, y_train_class)

# coef_ has one row per decision function; take the first (only) row
w = log_model.coef_[0]
print('Linear feature weights w:\n', w)
print('Bias/intercept term:\n', log_model.intercept_)

# One bar per feature showing its learned weight
plt.bar(feature_cols, w)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Score the classifier on the training split ...
y_pred_train = log_model.predict(Xtrain)
accuracy_train = balanced_accuracy_score(y_train_class, y_pred_train)
cm_train = confusion_matrix(y_train_class, y_pred_train, labels=[1, 0])

# ... and on the held-out test split
y_pred_test = log_model.predict(Xtest)
accuracy_test = balanced_accuracy_score(y_test_class, y_pred_test)
cm_test = confusion_matrix(y_test_class, y_pred_test, labels=[1, 0])

# labels=[1, 0] puts class 1 in the first row/column of each confusion matrix
print('Training accuracy:', accuracy_train)
print('Confusion Matrix\n', cm_train)
print('')
print('Testing accuracy:', accuracy_test)
print('Confusion Matrix\n', cm_test)

In [None]:
# Hyper-parameter sweep: refit the classifier for several values of C and
# record the balanced accuracy on both splits.
# NOTE(review): C is compared on the test split here, so the test score of
# whichever C is picked is no longer an unbiased estimate - consider a
# separate validation split or cross-validation.
c_test = [0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
acc_train, acc_test = [], []

for c in c_test:
    # Identical settings for every candidate; only C varies
    log_model = LogisticRegression(C=c,
                                   penalty='l2',
                                   class_weight='balanced',
                                   fit_intercept=True,
                                   tol=0.00001,
                                   max_iter=1000,
                                   random_state=None)
    log_model.fit(Xtrain, y_train_class)

    y_pred_train = log_model.predict(Xtrain)
    y_pred_test = log_model.predict(Xtest)

    acc_train.append(balanced_accuracy_score(y_train_class, y_pred_train))
    acc_test.append(balanced_accuracy_score(y_test_class, y_pred_test))

# Accuracy versus C for both splits, with reference lines at accuracy 1 and C = 1
plt.plot(c_test, acc_train, 'b-o', label='Train')
plt.plot(c_test, acc_test, 'r-*', label='Test')
plt.axhline(1)
plt.axvline(1)
plt.xlabel('C - Regularization parameter')
plt.ylabel('Model accuracy')
plt.legend()
plt.show()

In [None]:
# Best model: refit with the chosen regularization parameter C
# NOTE(review): the sweep above used class_weight='balanced' while this model
# uses class_weight=None - confirm that the difference is intended.
best_model = LogisticRegression(penalty='l2',
                                tol=0.00001,
                                C=2,
                                class_weight=None,
                                fit_intercept=True,
                                random_state=None,
                                max_iter=1000)

best_model.fit(Xtrain, y_train_class)

# Model score on training and test data.
# Predict with `best_model`: `log_model` is whatever model an earlier cell
# fitted last, so scoring it would not describe the model fitted above.
y_pred_train = best_model.predict(Xtrain)
y_pred_test = best_model.predict(Xtest)
accuracy_train = balanced_accuracy_score(y_train_class, y_pred_train)
accuracy_test = balanced_accuracy_score(y_test_class, y_pred_test)
# labels=[1, 0] puts class 1 in the first row/column of each confusion matrix
cm_train = confusion_matrix(y_train_class, y_pred_train, labels=[1, 0])
cm_test = confusion_matrix(y_test_class, y_pred_test, labels=[1, 0])

print('Training accuracy:', accuracy_train)
print('Confusion Matrix\n', cm_train)
print('')
print('Testing accuracy:', accuracy_test)
print('Confusion Matrix\n', cm_test)
