# Performing a Multilabel classification
## Aim: predict the beer style (`style_name`) from abv, country, cat_name and descript (if available)

Steps:  
1. Generate dummies  
2. Standardize abv  
3. Pre-split the data   
4. Test-train-split    
5. Build Preprocessing pipeline  
6. Build Model  
7. Build Pipeline  
8. Assess Model   




In [1]:
#import modules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer




In [2]:
# Define the multilabel sampling and train/test split helpers
def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """Return row identifiers for a random sample of `y` that covers every label.

    Parameters
    ----------
    y : pandas.DataFrame or numpy.ndarray
        Two-dimensional binary indicator matrix (one row per sample, one
        column per label). For a DataFrame the returned values are index
        labels; otherwise they are positional row numbers.
    size : int or float
        Number of rows to sample when ``size > 1``; otherwise the fraction
        of ``len(y)`` to sample (rounded down).
    min_count : int
        Number of positive rows drawn up front for each label, so the
        sample contains at least `min_count` rows of every label.
    seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    numpy.ndarray
        The sampled row identifiers: first the (sorted, de-duplicated)
        per-label picks, then the rows drawn at random from the remainder.

    Raises
    ------
    ValueError
        If `y` contains values other than 0 and 1, or if some label has
        fewer than `min_count` positive rows.
    """
    # Local import so the block works even where `warnings` is not imported
    # at the top of the file.
    import warnings

    # Every distinct value has to be 0 or 1 (booleans compare equal to them).
    try:
        unique_values = np.unique(np.asarray(y))
        is_binary = bool(np.all((unique_values == 0) | (unique_values == 1)))
    except (TypeError, ValueError):
        is_binary = False
    if not is_binary:
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    # a size of at most 1 is interpreted as a fraction of the rows
    if size <= 1:
        size = np.floor(y.shape[0] * size)

    # the per-label guarantee needs at least n_labels * min_count rows
    if y.shape[1] * min_count > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warnings.warn(msg.format(y.shape[1] * min_count, size))
        size = y.shape[1] * min_count

    # NOTE: np.random.randint(1) draws from [0, 1) and is therefore always 0,
    # so seed=None behaves like seed=0. Kept for reproducibility of earlier runs.
    rng = np.random.RandomState(seed if seed is not None else np.random.randint(1))

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    sample_idxs = np.array([], dtype=choices.dtype)

    # first, guarantee at least min_count of each label
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        label_idxs_sampled = rng.choice(label_choices, size=min_count, replace=False)
        sample_idxs = np.concatenate([label_idxs_sampled, sample_idxs])

    # a row picked for several labels must only be counted once
    sample_idxs = np.unique(sample_idxs)

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Select rows of `df` using the row identifiers that
        `multilabel_sample` returns for the binary matrix `labels`.

        `size`, `min_count` and `seed` are forwarded unchanged to
        `multilabel_sample`; the chosen identifiers are looked up in
        `df` by label (`df.loc`).
    """
    sampler_options = {"size": size, "min_count": min_count, "seed": seed}
    chosen_rows = multilabel_sample(labels, **sampler_options)
    return df.loc[chosen_rows]

def multilabel_train_test_split(X, Y, size, min_count=5, seed=None):
    """ Split a features matrix `X` and a label matrix `Y` into
        (X_train, X_test, Y_train, Y_test).

        The test rows are the ones returned by
        `multilabel_sample(Y, size=size, min_count=min_count, seed=seed)`;
        every other row goes to the training set. `size` is therefore the
        size of the TEST set (a count, or a fraction when <= 1).

        `Y` may be a pandas DataFrame (rows are matched on its index) or a
        plain array (rows are matched on position). `X` must accept a
        boolean row mask of the same length.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)

    # np.isin handles both a pandas Index and a plain ndarray; the previous
    # `index.isin(...)` only existed on the pandas Index.
    test_set_mask = np.isin(index, test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])


In [8]:
# Load the cleaned beer data.
# Forward slashes are used so the relative path resolves on Windows and POSIX alike.
df = pd.read_csv("data/cleaned/beer_cleaned.csv")
print(df.head())

# Column groups used by the feature selectors below:
#   LABELS    - the target column
#   NON_LABEL - every column except the target
#   NUMERIC   - every column whose dtype is not "object"
LABELS = ["style_name"]
NON_LABEL = [c for c in df.columns if c not in LABELS]
NUMERIC = list(df.loc[:, df.dtypes != "object"].columns)




# Reusable text-getter function for the pipeline
def combine_text_columns(data_frame, to_drop=NUMERIC + LABELS):
    """ Combine the text columns of every row into a single string.

        Columns listed in `to_drop` are removed (names that are not present
        in `data_frame` are ignored), missing values become empty strings,
        and the remaining cells of each row are joined with a single space.

        :param data_frame: The data as read in with read_csv (no preprocessing necessary)
        :param to_drop (optional): Column names to exclude; the numeric and
            label columns by default.
        :return: Series with one combined string per row of `data_frame`.
    """
    # only drop the columns that are actually in this frame
    unwanted = set(to_drop)
    present = [column for column in data_frame.columns if column in unwanted]
    text_data = data_frame.drop(columns=present)

    # replace nans with blanks, then make every cell a string so that a stray
    # non-text value (e.g. a number in an object column) cannot break the join
    text_data = text_data.fillna("").astype(str)

    # joins all of the text items in a row (axis=1)
    # with a space in between
    return text_data.apply(" ".join, axis=1)

# Transformer objects that pick the numeric / text features inside the pipeline
def _select_numeric_columns(frame):
    """Return only the columns of `frame` that are listed in NUMERIC."""
    return frame[NUMERIC]


get_numeric_features = FunctionTransformer(_select_numeric_columns, validate=False)
get_text_features = FunctionTransformer(combine_text_columns, validate=False)


# 1. One indicator column per style
dummy_labels = pd.get_dummies(df["style_name"])

# Positional arguments: size=0.2 (share of rows in the test split), min_count=1
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    df[NON_LABEL], dummy_labels, 0.2, 1
)

# Numeric branch: select -> impute -> standardize
numeric_pipeline = Pipeline([
    ("selector", get_numeric_features),
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
])

# Text branch: combine the text columns -> count uni- and bigrams
text_pipeline = Pipeline([
    ("selector", get_text_features),
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
])

# Both branches side by side, followed by one logistic regression per label
feature_union = FeatureUnion([
    ("numerics", numeric_pipeline),
    ("texts", text_pipeline),
])
pipe = Pipeline([
    ("union", feature_union),
    ("clf", OneVsRestClassifier(LogisticRegression())),
])

pipe.fit(X_train, y_train)
y_pred_proba = pipe.predict_proba(X_test)
print(y_pred_proba)

# score on the held-out split (shown as the cell result)
pipe.score(X_test, y_test)

name_beer       abv               name_brewery              city  \
0       Hocus Pocus  4.500000                  Magic Hat  South Burlington   
1    Bière Darbyste  5.107576      Brasserie de Blaugies          Blaugies   
2     Isolation Ale  6.000000              Odell Brewing      Fort Collins   
3   Nut Cracker Ale  5.900000  Boulevard Brewing Company       Kansas City   
4  Never Summer Ale  5.940000       Boulder Beer Company           Boulder   

         country  latitude  longitude  \
0  United States   44.4284   -73.2131   
1        Belgium   50.3693     3.8270   
2  United States   40.5894  -105.0630   
3  United States   39.0821   -94.5965   
4  United States   40.0267  -105.2480   

                                            descript     cat_name  \
0  Our take on a classic summer ale.  A toast to ...  Other Style   
1                                                NaN  Other Style   
2  Ever been in a warm, cozy cabin and had a secr...  Other Style   
3  Nutcracker Ale 

0.5390835579514824