# Task for Today  

***

## Soft Drink Sales Prediction  
  
Given *data about soft drinks*, let's try to predict the **quantity sold** of a given drink.  
  
We will use a variety of regression models to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the training and test CSVs from the `predict-demand` input directory.
train_df, test_df = (
    pd.read_csv('../input/predict-demand/train.csv'),
    pd.read_csv('../input/predict-demand/test.csv'),
)

In [None]:
# Display the raw training frame as the cell output.
train_df

In [None]:
# Print column dtypes and non-null counts for the training frame.
train_df.info()

In [None]:
# Display the raw test frame as the cell output.
test_df

In [None]:
# Print column dtypes and non-null counts for the test frame.
test_df.info()

# Preprocessing

In [None]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column]``,
    named ``<column>_<value>``, and appended after the remaining columns.
    The input frame is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

def encode_dates(df, column):
    """Replace a date column with numeric year, month and day columns.

    ``df[column]`` is parsed with ``pd.to_datetime`` and expanded into
    ``<column>_year``, ``<column>_month`` and ``<column>_day``, which are
    appended after the existing columns. The original column is removed.

    Parameters:
        df: frame containing ``column``; it is not modified.
        column: name of the column holding date values.

    Returns:
        A new DataFrame with the three date-part columns in place of
        ``column``.
    """
    df = df.copy()

    # Parse once and read the parts through the vectorised ``.dt`` accessor
    # rather than calling a Python lambda on every element.
    dates = pd.to_datetime(df[column])
    df[column + '_year'] = dates.dt.year
    df[column + '_month'] = dates.dt.month
    df[column + '_day'] = dates.dt.day

    return df.drop(column, axis=1)

In [None]:
def preprocess_inputs(df):
    """Convert a raw sales frame into a feature matrix and a target series.

    Working on a copy of ``df``, this:
      1. drops the ``id`` column;
      2. discards rows in which every remaining field is missing;
      3. fills missing ``lat``/``long`` with the column mean and missing
         ``capacity`` with its most frequent value;
      4. one-hot encodes ``city``, ``shop``, ``brand`` and ``container``;
      5. replaces ``capacity`` by its position in the size ordering;
      6. expands ``date`` into year, month and day columns.

    Returns:
        ``(X, y)`` where ``y`` is the ``quantity`` column and ``X`` holds
        every other resulting column.
    """
    data = df.copy()

    # Discard the id column.
    data = data.drop('id', axis=1)

    # Only rows that are missing in *every* column are removed; partially
    # missing rows are kept and imputed below.
    fully_empty = data.index[data.isna().all(axis=1)]
    data = data.drop(index=fully_empty).reset_index(drop=True)

    # Impute the numeric coordinates with their column mean.
    for coord in ('lat', 'long'):
        data[coord] = data[coord].fillna(data[coord].mean())

    # Impute the ordinal capacity with its most frequent value.
    most_common_capacity = data['capacity'].mode()[0]
    data['capacity'] = data['capacity'].fillna(most_common_capacity)

    # Nominal features become indicator columns.
    for nominal in ('city', 'shop', 'brand', 'container'):
        data = onehot_encode(data, column=nominal)

    # Capacity is ordinal: encode each label as its rank in this ordering.
    size_order = ['330ml', '500ml', '1.5lt']
    data['capacity'] = data['capacity'].map(size_order.index)

    # Break the date into separate numeric parts.
    data = encode_dates(data, column='date')

    # Separate the target from the features.
    target = data['quantity']
    features = data.drop('quantity', axis=1)

    return features, target

In [None]:
# Preprocess each split into features and target.
X_train, y_train = preprocess_inputs(train_df)
X_test, y_test = preprocess_inputs(test_df)

# Each split is one-hot encoded on its own, so a category that occurs in only
# one of them produces a different set of columns. Align the test features to
# the training layout: indicators missing from the test split are filled with
# 0 and indicators never seen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Display the scaled training features as the cell output.
X_train

In [None]:
# Display the training target (the `quantity` column) as the cell output.
y_train

In [None]:
# Display the scaled test features as the cell output.
X_test

In [None]:
# Display the test target (the `quantity` column) as the cell output.
y_test

# Training

In [None]:
# Candidate regressors, all with default hyperparameters. The labels are
# left-padded to a common width so the printed report lines up.
models = {
    # Instance-based and linear models
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                     Linear Regression": LinearRegression(),
    "                 Ridge (L2) Regression": Ridge(),
    # Support vector machines
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    # Tree, neural network and ensemble models
    "                         Decision Tree": DecisionTreeRegressor(),
    "                        Neural Network": MLPRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
}

# Fit every candidate on the scaled training split, reporting progress.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f"{label} trained.")

# Results

In [None]:
# Report each fitted model's R^2 on the test split, to five decimals.
for label, estimator in models.items():
    r_squared = estimator.score(X_test, y_test)
    print(f"{label} R^2: {r_squared:.5f}")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/E1aJk9Z4usM