# Task for Today  

***

## Restaurant Tip Prediction  

Given *data about transactions at a restaurant*, let's try to predict the **tip** a given customer will leave.

We will use a linear regression model to make our predictions, evaluating it with 5-fold cross-validation and reporting the R^2 score of each fold.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

In [None]:
# Read the tips CSV into a DataFrame. The relative path assumes the dataset is
# available under ../input/tipping/ relative to the notebook's working directory.
data = pd.read_csv('../input/tipping/tips.csv')

In [None]:
# Show the raw DataFrame as the cell output
data

In [None]:
# Print the per-column summary (dtypes and non-null counts) of the raw data
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Return a copy of `df` with its categorical columns encoded as integers.

    Encodings applied:
      * sex:    Female -> 0, Male -> 1
      * smoker: No -> 0, Yes -> 1
      * time:   Lunch -> 0, Dinner -> 1
      * day:    Thur -> 0, Fri -> 1, Sat -> 2, Sun -> 3 (ordinal)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the 'sex', 'smoker', 'time' and 'day' columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the four columns above re-encoded; every other
        column is left as-is. Values that are not listed in an encoding are
        passed through unchanged.

    Raises
    ------
    KeyError
        If one of the four encoded columns is missing from `df`.
    """
    df = df.copy()

    # Binary encoding
    binary_encodings = {
        'sex': {'Female': 0, 'Male': 1},
        'smoker': {'No': 0, 'Yes': 1},
        'time': {'Lunch': 0, 'Dinner': 1},
    }

    # Ordinal encoding
    ordinal_encodings = {
        'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
    }

    for column, encoding in {**binary_encodings, **ordinal_encodings}.items():
        # An element-wise lookup is used instead of `.replace`: swapping strings
        # for numbers with `.replace` depends on implicit object -> int
        # downcasting, which pandas 2.2 deprecated. `.map` infers the integer
        # dtype from the results directly, and `encoding.get(value, value)`
        # keeps unknown labels untouched, exactly as `.replace` did.
        df[column] = df[column].map(lambda value: encoding.get(value, value))

    return df

In [None]:
# Encode the categorical columns. `data` stays untouched because
# preprocess_inputs works on a copy; X still contains the 'tip' column here.
X = preprocess_inputs(data)

In [None]:
# Show the encoded DataFrame as the cell output
X

# Exploratory Data Analysis

In [None]:
# Histogram of every column of X, one panel per column. The grid grows with the
# number of columns (4 panels per row, 5 inches of height per row) instead of
# assuming everything fits in a fixed 2 x 4 layout.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(X.columns) / n_grid_cols)))

plt.figure(figsize=(20, 5 * n_grid_rows))

for i, column in enumerate(X.columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    # Overlay a KDE curve only when the column has more than two distinct
    # values; two-valued columns get plain bars.
    sns.histplot(X[column], kde=len(X[column].unique()) > 2)

# Title the figure as a whole rather than a single panel
plt.suptitle("Column Distributions")
plt.show()

In [None]:
# Boxplot of every column of X, one panel per column. The grid grows with the
# number of columns (4 panels per row, 5 inches of height per row) instead of
# assuming everything fits in a fixed 2 x 4 layout.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(X.columns) / n_grid_cols)))

plt.figure(figsize=(20, 5 * n_grid_rows))

for i, column in enumerate(X.columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(x=X[column])

# Title the figure as a whole rather than a single panel
plt.suptitle("Column Boxplots")
plt.show()

In [None]:
# Pairwise scatter/histogram grid for the bill amount and the tip
sns.pairplot(X[['total_bill', 'tip']])

In [None]:
# Pairwise correlations between all columns of X, annotated cell by cell
correlations = X.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlations, annot=True, vmin=-1.0, cmap='mako')
plt.title("Correlation Matrix")
plt.show()

# Training

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from 0
shuffled_rows = X.sample(frac=1.0, random_state=1)
X = shuffled_rows.reset_index(drop=True)

In [None]:
# Separate the target (tip) from the feature columns
y = X.loc[:, 'tip']
X = X.drop(columns=['tip'])

In [None]:
# 5-fold cross-validation: one score per fold is collected in `results`
results = []
kf = KFold(n_splits=5)

for train_idx, test_idx in kf.split(X):

    # Rows belonging to this fold's training and held-out partitions
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # The scaler is fitted on the training rows only and then applied to both
    # partitions, so no statistics from the held-out rows reach the model.
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(
        scaler.transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test),
        index=X_test.index,
        columns=X_test.columns,
    )

    model = LinearRegression().fit(X_train, y_train)

    # Score of the fitted model on the held-out fold
    fold_score = model.score(X_test, y_test)
    results.append(fold_score)

# Results

In [None]:
# Report the score of every fold (numbered from 1), then the mean across folds
for fold_number, fold_score in enumerate(results, start=1):
    print("Fold {}: {:.5f}".format(fold_number, fold_score))

mean_score = np.mean(results)
print("\nAverage R^2 Score: {:.5f}".format(mean_score))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/qCk8nKt2sS0