In [None]:
import re
import os
import numpy as np
import pandas as pd
import random
import seaborn as sns
from time import time
from collections import defaultdict

from contextlib import contextmanager
from tqdm import tqdm
import lightgbm as lgbm
import category_encoders as ce

import spacy
from wordcloud import WordCloud
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import multiprocessing

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, log_loss, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.utils import to_categorical

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data preparation

In [None]:
# Load the exercise/calorie table from the Kaggle input folder and preview
# its first two rows.
data0 = pd.read_csv(
    "../input/calories-burned-during-exercise-and-activities/exercise_dataset.csv"
)
data0.head(2)

In [None]:
# Show the column labels of the loaded table.
data0.columns

In [None]:
# Count how many rows carry each distinct activity label.
data0['Activity, Exercise or Sport (1 hour)'].value_counts()

In [None]:
# Keep only the activity label and the calories-per-kg column.
data1 = data0[[
    'Activity, Exercise or Sport (1 hour)',
    'Calories per kg',
]]

In [None]:
# Order activities from highest to lowest 'Calories per kg' and renumber the
# rows from 0 so later positional loops can use .loc with integer labels.
data2 = (
    data1
    .sort_values(by='Calories per kg', ascending=False)
    .reset_index(drop=True)
)
data2

In [None]:
# Bar chart of 'Calories per kg' per activity, in the order stored in data2.
fig = make_subplots(specs=[[{"secondary_y": False}]])
activity_bars = go.Bar(
    x=data2['Activity, Exercise or Sport (1 hour)'],
    y=data2['Calories per kg'],
    name="",
)
fig.add_trace(activity_bars, secondary_y=False)
fig.update_layout(
    autosize=False,
    width=1200,
    height=800,
    title_text="Activity and Calories per kg",
)
fig.update_xaxes(title_text="Activity")
fig.update_yaxes(title_text="calory", secondary_y=False)
fig.show()

In [None]:
# Load the large English spaCy pipeline with the 'ner' and 'parser'
# components disabled.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['ner', 'parser'],
)

def cleaning(doc):
    """Return the space-joined lemmas of the non-stop-word tokens in ``doc``.

    ``doc`` is any iterable of tokens exposing ``lemma_`` and ``is_stop``.
    When two or fewer tokens survive the stop-word filter the function
    returns ``None`` so the caller can drop the row.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Very short results are reported as missing rather than as text.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Replace every run of characters other than ASCII letters and apostrophes
# with a single space, then lowercase the label.
# Materialised as a list (not a one-shot generator) so the cell that feeds it
# to nlp.pipe can be re-run without silently receiving an exhausted iterator.
brief_cleaning = [
    re.sub("[^A-Za-z']+", ' ', str(row)).lower()
    for row in data2['Activity, Exercise or Sport (1 hour)']
]

In [None]:
# Run the cleaned labels through the spaCy pipeline and lemmatise each one.
# The former `n_threads=-1` argument is dropped: spaCy has ignored it since
# v2.1 and Language.pipe no longer accepts it in v3.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
print(txt[0:20])

In [None]:
# One row per cleaned label; rows where cleaning() returned None and exact
# duplicates are removed.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
print(df_clean.shape)
print(df_clean.head(20))

In [None]:
# Tokenise each cleaned label on whitespace, fit a bigram phrase model on the
# token lists and apply it back to the same corpus.
sent = list(map(str.split, df_clean['clean']))
phrases = Phrases(sentences=sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]

In [None]:
# Count how often each token occurs across the phrase-transformed corpus.
# The loop variables get their own names so the tokenised corpus `sent`
# built in the previous cell is not rebound to a single sentence.
word_freq = defaultdict(int)
for sentence_tokens in sentences:
    for token in sentence_tokens:
        word_freq[token] += 1
print(len(word_freq))

In [None]:
# Vocabulary ordered from the most frequent to the least frequent token.
words = sorted(
    word_freq.keys(),
    key=lambda token: word_freq[token],
    reverse=True,
)
print(words)

In [None]:
# Render a word cloud from the vocabulary.
# NOTE(review): each word appears exactly once in the joined text, so the
# cloud may not reflect the counts in word_freq — confirm this is intended.
plt.figure(figsize=(15, 10))
cloud_text = " ".join(words)
wordcloud = WordCloud(
    background_color='white',
    width=750,
    height=500,
).generate(cloud_text)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('', fontsize=20)

In [None]:
# Report the CPU count returned by multiprocessing.
cores = multiprocessing.cpu_count()
print(cores)

In [None]:
# Add one indicator column per vocabulary word, initialised to 0.
# The columns go onto a copy so the sorted table `data2` displayed earlier is
# not mutated through an alias.
data3 = data2.copy()
data3[words] = 0
data3

In [None]:
# Size of the vocabulary, i.e. the number of indicator columns.
len(words)

In [None]:
# Set each word's indicator to 1 where the word occurs as a literal,
# case-sensitive substring of the raw activity label, and 0 elsewhere.
# One vectorised pass per word replaces the per-cell .loc double loop.
# NOTE(review): the vocabulary is lowercase lemmas while the labels keep
# their original casing, so capitalised occurrences do not match — confirm
# that this is intended.
activity_text = data3['Activity, Exercise or Sport (1 hour)']
for word in words:
    data3[word] = activity_text.str.contains(word, regex=False).astype('int64')
data3

# Target setting

In [None]:
# The raw label is no longer needed once the indicator columns exist.
data3 = data3.drop(columns='Activity, Exercise or Sport (1 hour)')

In [None]:
# Split the table into the regression target and the feature columns.
target = ['Calories per kg']
datay = data3[target]
datax = data3.drop(columns=target)

In [None]:
# Convert features and target to NumPy arrays and show their shapes.
train, trainy = np.array(datax), np.array(datay)
print(train.shape, trainy.shape, sep='\n')

In [None]:
# Remember the feature column names; they are re-attached after the
# round trip through NumPy.
df_columns = datax.columns.tolist()
print(df_columns)

In [None]:
# Rebuild a DataFrame from the feature array (columns are positional here).
train_df = pd.DataFrame(data=train)
#test_df=pd.DataFrame(test)

In [None]:
# Restore the feature names on the rebuilt frame.
train_df = train_df.set_axis(df_columns, axis=1)
#test_df.columns=df_columns

In [None]:
def create_numeric_feature(input_df, use_columns=None):
    """Return a copy of the selected feature columns of ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame that contains every requested column.
    use_columns : sequence of column labels, optional
        Columns to keep. When omitted, the notebook-level ``df_columns`` list
        is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so later edits do not touch ``input_df``.
    """
    if use_columns is None:
        # Fall back to the module-level list defined in an earlier cell.
        use_columns = df_columns
    return input_df[list(use_columns)].copy()

In [None]:
from contextlib import contextmanager
from time import time

class Timer:
    """Context manager that reports the wall-clock time spent in a ``with`` block.

    Parameters
    ----------
    logger : object with an ``info(str)`` method, optional
        Receives the formatted message; when omitted the message is printed.
    format_str : str
        ``str.format`` template that receives the duration in seconds.
    prefix, suffix : optional
        Text placed before / after ``format_str``, separated by ``sep``.
    sep : str
        Separator between prefix/suffix and the formatted duration.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds of the last completed block, or 0 if none finished."""
        if self.start is None or self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        # Clear any previous end time so a reused timer does not report a
        # stale duration while the new block is still running.
        self.end = None
        self.start = time()
        # Return the timer so `with Timer() as t:` binds a usable object.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

In [None]:
from tqdm import tqdm

def to_feature(input_df):
    """Run every feature processor on ``input_df`` and join the results column-wise.

    Each processor takes the input frame and must return a frame with the
    same number of rows; its run time is reported through ``Timer``.

    Returns
    -------
    pandas.DataFrame
        The processors' outputs concatenated along the column axis, or an
        empty DataFrame when there are no processors.
    """
    processors = [
        create_numeric_feature,
    ]

    # Collect the pieces and concatenate once instead of growing a frame
    # inside the loop.
    feature_frames = []

    for func in tqdm(processors, total=len(processors)):
        with Timer(prefix='create' + func.__name__ + ' '):
            _df = func(input_df)

        assert len(_df) == len(input_df), func.__name__
        feature_frames.append(_df)

    if not feature_frames:
        return pd.DataFrame()
    return pd.concat(feature_frames, axis=1)

In [None]:
# Build the model input frame from the training features.
train_feat_df = to_feature(input_df=train_df)
#test_feat_df = to_feature(test_df)

# Model

In [None]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

def fit_lgbm(X, y, cv, 
             params: dict=None, 
             verbose: int=50,
             early_stopping_rounds: int=100):
    """Train one LightGBM regressor per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X, y : array-like
        Features and target; both are indexed with the integer index arrays
        taken from ``cv`` (``X[idx]``, ``y[idx]``).
    cv : iterable of (train_idx, valid_idx)
        Fold definitions.
    params : dict, optional
        Keyword arguments for ``lgbm.LGBMRegressor``; defaults to ``{}``.
    verbose : int
        Evaluation results are logged every ``verbose`` rounds; 0 disables
        the periodic log.
    early_stopping_rounds : int
        Training stops when the validation metric has not improved for this
        many rounds.

    Returns
    -------
    (oof_pred, models)
        ``oof_pred`` holds, for every sample, the prediction made by the model
        of the fold in which that sample was in the validation split;
        ``models`` is the list of fitted regressors in fold order.
    """

    if params is None:
        params = {}

    models = []
    # `np.float` was removed in NumPy 1.24; builtin float gives the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMRegressor(**params)

        # LightGBM 4 removed the `early_stopping_rounds` / `verbose` keyword
        # arguments of fit(); the callbacks below are the supported equivalent
        # (log_evaluation requires lightgbm >= 3.3).
        callbacks = [
            lgbm.early_stopping(stopping_rounds=early_stopping_rounds,
                                verbose=bool(verbose)),
            lgbm.log_evaluation(period=int(verbose)),
        ]

        with Timer(prefix='fit fold={} '.format(i)):
            clf.fit(x_train, y_train, 
                    eval_set=[(x_valid, y_valid)],  
                    callbacks=callbacks)

        pred_i = clf.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        # The quantity is sqrt(MSE) on the raw target, i.e. RMSE (not RMSLE).
        print(f'Fold {i} RMSE: {mean_squared_error(y_valid, pred_i) ** .5:.4f}')
        print()

    score = mean_squared_error(y, oof_pred) ** .5
    print('-' * 50)
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
# Hyper-parameters passed to lgbm.LGBMRegressor for every fold.
params = dict(
    objective='rmse',
    learning_rate=0.1,
    reg_lambda=1.0,
    reg_alpha=0.1,
    max_depth=5,
    n_estimators=10000,
    colsample_bytree=0.5,
    min_child_samples=10,
    subsample_freq=3,
    subsample=0.9,
    importance_type='gain',
    random_state=71,
    num_leaves=62,
)

In [None]:
# Use the target array as `y` and show its shape.
y = trainy
print(np.shape(y))

In [None]:
# Wrap the target array in a DataFrame so each target can be picked by
# column position.
ydf = pd.DataFrame(data=y)
ydf

In [None]:
from sklearn.model_selection import KFold

# For the single target: run 5-fold CV, then scatter the out-of-fold
# predictions against the true values.
for i in range(1):
    fold = KFold(n_splits=5, shuffle=True, random_state=71)
    ydfi = ydf.iloc[:, i]
    y = np.array(ydfi)
    cv = list(fold.split(train_feat_df, y))
    oof, models = fit_lgbm(
        train_feat_df.values,
        y,
        cv,
        params=params,
        verbose=500,
    )

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title(target[i], fontsize=20)
    ax.set_xlabel('oof ' + str(i), fontsize=12)
    ax.set_ylabel('train_y ' + str(i), fontsize=12)
    ax.scatter(oof, y)


# Visualize Importance

In [None]:
def visualize_importance(models, feat_train_df, top_n: int = 50):
    """Plot the per-fold feature importances of the fitted models.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``feature_importances_``, one per fold.
    feat_train_df : pandas.DataFrame
        Training feature frame; its columns label the importances.
    top_n : int
        Number of features to show, ranked by importance summed over folds.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes holding a horizontal boxen plot.
    """
    # Build one frame per fold and concatenate once.
    per_fold = [
        pd.DataFrame({
            'feature_importance': model.feature_importances_,
            'column': feat_train_df.columns,
            'fold': fold_no,
        })
        for fold_no, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Rank features by their importance summed across folds.
    order = feature_importance_df.groupby('column')\
        .sum()[['feature_importance']]\
        .sort_values('feature_importance', ascending=False).index[:top_n]

    # Grow the figure height with the number of plotted features.
    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df, 
                  x='feature_importance', 
                  y='column', 
                  order=order, 
                  ax=ax, 
                  palette='viridis', 
                  orient='h')
    
    ax.tick_params(axis='x', rotation=0)
    ax.grid()
    fig.tight_layout()
    
    return fig,ax

#fig, ax = visualize_importance(models, train_feat_df)

In [None]:
# For every target: run 5-fold CV and plot the feature importances of the
# fold models. Iterating over `target` replaces the hard-coded range(1), and
# the plot title typo ('Imortance') is fixed.
for i, target_name in enumerate(target):
    fold = KFold(n_splits=5, shuffle=True, random_state=71)
    ydfi=ydf.iloc[:,i]
    y=np.array(ydfi)
    cv = list(fold.split(train_feat_df, y))
    oof, models = fit_lgbm(train_feat_df.values, y, cv, params=params, verbose=500)
    fig, ax = visualize_importance(models, train_feat_df)
    ax.set_title(target_name+' Importance',fontsize=20)


### 'min', 'mile' and 'mph' are the most important factors for 'Calories per kg'.