In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
# import dataset with primary features already extracted  
pp_data = pd.read_csv("../data/progresspics_2018_primary_features.cvs")
print(pp_data.shape)

(21484, 18)


In [3]:
pp_data.head()

Unnamed: 0,id,sex,age,height,start_weight,end_weight,period_weeks,title,score,url,num_comments,created_utc,author,permalink,timestamp,num_posts,num_posts_cat,post_order
0,7runrc,0.0,23.0,66.0,188.0,159.0,52.0,F/23/5'6 [188 to 159=30 lbs lost over about a ...,7,https://i.redd.it/jpzit5anobb01.jpg,1,1516497668,[deleted],/r/progresspics/comments/7runrc/f2356_188_to_1...,2018-01-20 19:21:08,0,"(-inf, 0.0]",0.0
1,7rvuxx,0.0,25.0,67.0,182.0,144.0,,"F/25/5'7"" [182&gt;144=38lbs] I know I post her...",579,https://imgur.com/OA4Y40U,34,1516511449,[deleted],/r/progresspics/comments/7rvuxx/f2557_18214438...,2018-01-20 23:10:49,0,"(-inf, 0.0]",0.0
2,7ry7om,1.0,22.0,68.0,185.0,155.0,,M / 22 / 5'8'' [185ish lbs &gt; 155lbs = 30lbs...,1,https://i.imgur.com/Sxxuh3B.jpg,1,1516547882,[deleted],/r/progresspics/comments/7ry7om/m_22_58_185ish...,2018-01-21 09:18:02,0,"(-inf, 0.0]",0.0
3,7rzn5p,0.0,26.0,69.0,203.0,135.0,,"F/26/5'9"" 203lbs &gt; 135lbs = 68lbs I've been...",1,https://i.redd.it/vjmc6ivixgb01.jpg,1,1516561189,[deleted],/r/progresspics/comments/7rzn5p/f2659_203lbs_1...,2018-01-21 12:59:49,0,"(-inf, 0.0]",0.0
4,7s0hp0,0.0,26.0,69.0,203.0,135.0,,"F/26/5'9"" [203lbs &gt; 135lbs = 68lbs] I've fe...",576,https://i.redd.it/4wgkz4czihb01.jpg,13,1516568407,[deleted],/r/progresspics/comments/7s0hp0/f2659_203lbs_1...,2018-01-21 15:00:07,0,"(-inf, 0.0]",0.0


In [4]:
# split dataset into train and test
pp_train, pp_test = train_test_split(pp_data, test_size=0.2, random_state=46)

# Feature Engineering

## New feature creation

Here a variety of new features are created and then pipelined together to generate a training dataset that has all possible features in it.  This dataset is used for feature exploration and visualization. 

In [5]:
def date_time_features(df):
    """Extracts the month and day of the week a post was generated.
    
    Argument:
    pandas dataframe with the column "timestamp"
    
    Returns:
    pandas dataframe with the new columns "month" and "dayofweek"
    """
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["month"] = df["timestamp"].dt.month
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    return df

In [6]:
date_features = FunctionTransformer(date_time_features, validate=False)

In [7]:
def key_word_finder(s, words):
    """Recognizes if any key words in the list, words, appear in a string, s. Returns a dictionary where the keys 
    are the words and the value is 1 if the word is present and 0 if it is not.
    
    Helper function for key_words_to_features.

    Argument: 
    s -- string
    words -- list of strings

    Returns:
    results_dict -- dictionary
    """
    results_dict = {}
    lower_s = s.lower()
    for word in words:
        if word in lower_s:
            results_dict[word] = 1
        else:
            results_dict[word] = 0
    return results_dict

def key_words_to_features(df):
    """Creates new columns corresponding to each word in the list "words" and indicates in that column if the 
    title string contains the word of interest. 
    
    Argument:
    pandas dataframe with the column "title"
    
    Returns:
    pandas dataframe with new columns named as listed in the list "words"
    """
    words = ["progress", "face", "goal", "finally", "cico", "keto", "gains", "gym", "lifting", "working", "diet", 
                 "muscle", "nsfw"]
    df_text = df.apply(lambda row: key_word_finder(row["title"], words), axis=1, result_type="expand")
    df_concat = pd.concat([df, df_text], axis='columns')
    return df_concat

In [8]:
key_words = FunctionTransformer(key_words_to_features, validate=False)

In [9]:
def create_weight_diff(df):
    """Calculates the weight change described in a post. 
    
    Argument:
    pandas dataframe with the columns "start_weight" and "end_weight"
    
    Returns:
    pandas dataframe with the new column "weight_diff"
    """
    df["weight_diff"] = df["start_weight"] - df["end_weight"]
    return df

In [10]:
weight_diff = FunctionTransformer(create_weight_diff, validate=False)

In [11]:
def create_gain_or_loss(df):
    """If a post describes weight loss, label as 0 while if the post describes weight gain, label as 1 in
    the column, "gain_or_lose". If no weight is gained or loss, label as 1. 
    
    Argument:
    pandas dataframe with the column "weight_diff"
    
    Returns:
    pandas dataframe with a new column, "gain_or_lose"
    """
    df["gain_or_lose"] = df.loc[:, 'weight_diff'].apply(lambda row:  0 if row > 0 else 1)
    return df

In [12]:
gain_or_loss = FunctionTransformer(create_gain_or_loss, validate=False)

In [13]:
def create_fill_period_weeks(df):
    """Groups instances by sex and if the weigh change was a loss or gain.  Fills "period_weeks" NaNs with the 
    group average.  
    
    Argument: 
    pandas dataframe with the columns "sex", "gain_or_lose", "period_weeks"
    
    Returns:
    pandas dataframe with the new column, "fill_period_weeks"
    """
    df["fill_period_weeks"] = df.groupby(['sex', "gain_or_lose"])['period_weeks'].transform(lambda grp: grp.fillna(np.mean(grp)))
    df.drop(["period_weeks"], axis=1, inplace=True)
    return df

In [14]:
fill_period_weeks = FunctionTransformer(create_fill_period_weeks)

In [15]:
def create_change_rate(df):
    """Calculates and stores the rate of weight change.
    
    Argument:
    pandas dataframe with the columns "weight_diff" and "fill_period_weeks"
    
    Returns:
    pandas dataframe with the new column "change_rate"
    
    """
    df["change_rate"] = df["weight_diff"]/df["fill_period_weeks"]
    return df

In [16]:
change_rate = FunctionTransformer(create_change_rate)

In [17]:
def drop_cols_end_weight(df):
    """Drop columns that are uninformative. 
    
    Argument:
    pandas dataframe
    
    Returns:
    pandas dataframe
    """
    new_df = df.drop(columns=['title', 'author', 'id', 'timestamp', 'created_utc', 'url', 'permalink'], axis=1)
    return new_df

In [18]:
drop_columns_end_weight = FunctionTransformer(drop_cols_end_weight, validate=False)

In [19]:
# chain together the above functions in a pipeline
data_exploration_pipeline = Pipeline([
    ('create_date_features', date_features),
    ('create_key_word_features', key_words),
    ('create_weight_diff', weight_diff),
    ('create_gain_or_lose', gain_or_loss),
    ('create_fill_period_weeks', fill_period_weeks),
    ('create_change_rate', change_rate),
    ('drop_cols_end_weight', drop_columns_end_weight)],
    verbose=True)

## Preparation of the training data

In [20]:
# apply the data prep pipeline to the training data
full_train_data = data_exploration_pipeline.fit_transform(pp_train.copy())

[Pipeline]  (step 1 of 7) Processing create_date_features, total=   0.0s
[Pipeline]  (step 2 of 7) Processing create_key_word_features, total=   1.5s
[Pipeline]  (step 3 of 7) Processing create_weight_diff, total=   0.0s
[Pipeline]  (step 4 of 7) Processing create_gain_or_lose, total=   0.0s
[Pipeline]  (step 5 of 7) Processing create_fill_period_weeks, total=   0.0s
[Pipeline]  (step 6 of 7) Processing create_change_rate, total=   0.0s
[Pipeline]  (step 7 of 7) Processing drop_cols_end_weight, total=   0.0s


In [21]:
full_train_data.shape

(17187, 29)

In [22]:
full_train_data.tail(10)

Unnamed: 0,sex,age,height,start_weight,end_weight,score,num_comments,num_posts,num_posts_cat,post_order,...,gym,lifting,working,diet,muscle,nsfw,weight_diff,gain_or_lose,fill_period_weeks,change_rate
10882,1.0,31.0,71.0,255.0,210.0,906,64,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,45.0,0,44.0,1.022727
15083,1.0,44.0,75.0,277.0,187.0,456,15,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,90.0,0,36.0,2.5
18843,0.0,20.0,63.0,155.0,115.0,355,13,1,"(0.0, 1.0]",1.0,...,0,0,0,1,0,0,40.0,0,52.0,0.769231
5436,0.0,28.0,72.0,448.0,236.0,80,10,0,"(-inf, 0.0]",0.0,...,0,0,0,0,0,0,212.0,0,156.0,1.358974
20317,1.0,28.0,74.0,262.34978,207.23428,178,12,3,"(2.0, 4.0]",3.0,...,1,0,0,0,0,0,55.1155,0,40.0,1.377887
3787,0.0,32.0,61.0,220.0,135.0,1,1,0,"(-inf, 0.0]",0.0,...,0,0,0,0,0,0,85.0,0,54.212171,1.567914
17042,1.0,21.0,71.0,155.0,152.0,45,10,2,"(1.0, 2.0]",2.0,...,0,0,0,0,0,0,3.0,0,4.0,0.75
18835,1.0,22.0,70.0,217.5,189.0,95,8,8,"(4.0, 8.0]",7.0,...,0,0,0,0,0,0,28.5,0,28.0,1.017857
18874,0.0,23.0,65.0,158.0,152.0,106,4,1,"(0.0, 1.0]",1.0,...,0,0,1,1,0,0,6.0,0,156.0,0.038462
15549,0.0,29.0,63.0,245.0,100.0,6319,115,2,"(1.0, 2.0]",1.0,...,0,0,1,0,0,0,145.0,0,54.212171,2.674676


In [23]:
# save full training dataset to a .csv file 
full_train_data.to_csv("../data/progresspics_2018_training_data.cvs", index=False)

## Preparation of the testing data

In [24]:
# apply the data prep pipeline to the testing data
full_test_data = data_exploration_pipeline.transform(pp_test.copy())

In [25]:
full_test_data.shape

(4297, 29)

In [26]:
full_train_data.tail(10)

Unnamed: 0,sex,age,height,start_weight,end_weight,score,num_comments,num_posts,num_posts_cat,post_order,...,gym,lifting,working,diet,muscle,nsfw,weight_diff,gain_or_lose,fill_period_weeks,change_rate
10882,1.0,31.0,71.0,255.0,210.0,906,64,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,45.0,0,44.0,1.022727
15083,1.0,44.0,75.0,277.0,187.0,456,15,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,90.0,0,36.0,2.5
18843,0.0,20.0,63.0,155.0,115.0,355,13,1,"(0.0, 1.0]",1.0,...,0,0,0,1,0,0,40.0,0,52.0,0.769231
5436,0.0,28.0,72.0,448.0,236.0,80,10,0,"(-inf, 0.0]",0.0,...,0,0,0,0,0,0,212.0,0,156.0,1.358974
20317,1.0,28.0,74.0,262.34978,207.23428,178,12,3,"(2.0, 4.0]",3.0,...,1,0,0,0,0,0,55.1155,0,40.0,1.377887
3787,0.0,32.0,61.0,220.0,135.0,1,1,0,"(-inf, 0.0]",0.0,...,0,0,0,0,0,0,85.0,0,54.212171,1.567914
17042,1.0,21.0,71.0,155.0,152.0,45,10,2,"(1.0, 2.0]",2.0,...,0,0,0,0,0,0,3.0,0,4.0,0.75
18835,1.0,22.0,70.0,217.5,189.0,95,8,8,"(4.0, 8.0]",7.0,...,0,0,0,0,0,0,28.5,0,28.0,1.017857
18874,0.0,23.0,65.0,158.0,152.0,106,4,1,"(0.0, 1.0]",1.0,...,0,0,1,1,0,0,6.0,0,156.0,0.038462
15549,0.0,29.0,63.0,245.0,100.0,6319,115,2,"(1.0, 2.0]",1.0,...,0,0,1,0,0,0,145.0,0,54.212171,2.674676


In [27]:
# save full testing dataset to a .csv file 
full_test_data.to_csv("../data/progresspics_2018_testing_data.cvs", index=False)