In [2]:
import pandas as pd
import numpy as np
import re

# Data Cleaning and Primary Feature Extraction

In [3]:
# import data set and remove instances with duplicate titles

raw_data = pd.read_csv("../data/pp_data_2018_raw.csv")
print(raw_data.shape)
raw_data.drop_duplicates(subset="title", keep="first", inplace=True)
print(raw_data.shape)

(27464, 9)
(27040, 9)


In [4]:
raw_data.head()

Unnamed: 0,title,score,id,url,num_comments,created_utc,author,permalink,timestamp
0,"M/22/6'3"" [290lbs &gt; 185lbs = 105lbs] (2 yea...",246,7ru6qv,https://i.redd.it/h8dcbplx9bb01.jpg,10,1516492874,Bravo243,/r/progresspics/comments/7ru6qv/m2263_290lbs_1...,2018-01-20 18:01:14
1,"F/27/5'10"" [355lbs &gt; 340lbs = 15lbs] (&gt;1...",576,7rvdhz,https://i.redd.it/2z109s36acb01.jpg,31,1516505494,deemacd,/r/progresspics/comments/7rvdhz/f27510_355lbs_...,2018-01-20 21:31:34
2,F/23/5’0” [260 &gt; 218 = 42lbs] (12 months) F...,210,7rvemg,https://i.redd.it/i6ko6vu0dcb01.jpg,5,1516505869,momicaj,/r/progresspics/comments/7rvemg/f2350_260_218_...,2018-01-20 21:37:49
3,F/24/5’3” (160 SW &gt; 135 CW &gt; 125 GW) 25l...,1,7rvnzq,https://i.imgur.com/pzdTB2G.jpg,1,1516508983,solesky,/r/progresspics/comments/7rvnzq/f2453_160_sw_1...,2018-01-20 22:29:43
4,F/48/5’6” [326 lbs &gt; 180lbs= 146 lbs lost] ...,3163,7rvoaj,https://i.redd.it/g7sy9hwamcb01.jpg,144,1516509075,Wendyjay34,/r/progresspics/comments/7rvoaj/f4856_326_lbs_...,2018-01-20 22:31:15


In [5]:
# this block of functions are helper functions for the main function, process_title

def clean_sex(s):
    """Processes the sex string and returns 0 (for female) or 1 (for male).  Returns Nan if the string is anything
    other than a 0 or 1.
    
    Helper function for get_stats.

    Arguments:
    s -- string that is supposed to represent a sex

    Returns:
    integer 0 (male) or 1 (female) indicating the sex of the author or Nan if unknown
    """
    try:
        if s[0] == "F" or s[-1] == "F":
            return 0
        if s[0] == "M" or s[-1] == "M":
            return 1
    except IndexError:    
        return np.nan   
    else:
        return np.nan
    
    
def clean_age(s):
    """Processes the age string.  Returns an integer corresponding to a 2 digit age.  If the string contains anything 
    other than 2 digits, returns Nan. 
    
    Helper function for get_stats.
    
    Arguments:
    s -- string that is supposed to represent an age
    
    Returns:
    integer representing the age or Nan if unknown
    """
    if len(s) == 2 and s.isdigit():
        return int(s)
    else:
        return np.nan
    
    
def height_inches(s):
    """Processes the height string and returns an integer corresponding to the height in inches.  First removes all
    non-digit characters.  The starting heights are in either feet and inches or centimeters.  If the first character 
    in the string is a 1, it is assumed the height is in centimeters.  If the first character is a 4, 5, 6, or 7 it is
    assumed the first character of the string corresponds to the feet measurement and the remainder to inches measurement.
    Returns Nan if the string is empty or starts with a character other than 1, 4, 5, 6, or 7.
    
    Helper function for get_stats.

    Argument:
    s -- string representing a height

    Returns:
    integer representing a height in inches
    """
    ft_list = ["4", "5", "6", "7"]
    digit_chars = [c for c in list(s) if c.isdigit()]
    num_s = "".join(digit_chars)
    if num_s == '':
        return np.nan
    elif num_s[0] == "1":
        return int(num_s) * 0.39370079 
    elif num_s[0] in ft_list:
        if len(num_s) == 1:
            return int(num_s) * 12
        elif len(num_s) == 3 and num_s[2] == "5":
            return (int(num_s[0]) * 12) + int(num_s[1]) + 0.5
        else:
            return (int(num_s[0]) * 12) + int(num_s[1:])
    else:
        return np.nan
    
    
def get_weights(s):
    """Extracts the starting and ending weights from a string containing both weights. If the weights cannot be 
    identified returns Nan.
    
    Helper function for get_stats. 

    Argument:
    s -- string containing both starting and ending weights

    Returns:
    starting weight -- starting weight of r/progresspics post author
    ending weight -- ending weight of r/progresspics post author
    """
    clean_s = s.upper().lstrip().replace(' ', '')
    # original regex = re.compile(r"^(?:\D*?)(\d+)(?:\D*?)(\d+)")
    regex = re.compile(r"^(?:\D*?)(\d+\.?\d*)(?:\D*?)(\d+\.?\d*)")
    result = regex.search(clean_s) 
    if result:
        return float(result.group(1)), float(result.group(2))
    else:
        return np.nan, np.nan
    
    
def get_stats(s):
    """Processes a r/progresspics post title and extracts the sex, age, height, and weights.  Returns "unknown" if 
    the title is formatted incorrectly and the information cannot be extracted. Uses helper modules to further 
    process the sex, age, height, and weights into their final format.
    
    Helper function for process_title. 

    Argument:
    s -- string containing the r/progresspics post title

    Returns:
    sex -- sex (0 or 1) of r/progresspics post author (integer)
    age -- age in years of r/progresspics post author (integer)
    height -- height in inches of r/progresspics post author (integer)
    weights -- starting and ending weight in lbs of r/progresspics post author (integer)
    """
    clean_s = s.upper()
    clean_list = clean_s.split("/", 2)
    if len(clean_list) < 3:
        return "unknown", "unknown", "unknown", "unknown", "unknown"
    sex = clean_sex(clean_list[0].replace(' ', ''))
    age = clean_age(clean_list[1].replace(' ', ''))
    sep_list = ['[', '/', '"', '\u201d', '\u201c', "' '", ' ']
    temp_weights = "unknown"
    for sep in sep_list:
        if temp_weights == "unknown":
            weight_list = clean_list[2].split(sep, 1)
            temp_height = weight_list[0].replace(' ', '')
            try:    
                temp_weights = weight_list[1]
            except: 
                continue       
    height = height_inches(temp_height)
    if temp_weights == "unknown":
        start_weight, end_weight = np.nan, np.nan
    else:
        start_weight, end_weight = get_weights(temp_weights)
    return sex, age, height, start_weight, end_weight 


def age_sex_redo(s):
    """As switching the order of sex and age in the title is a common mistake by p/progresspics users, 
    the function reprocesses the first two parts of the title and processes the sex position as the age
    and vice versa.
    
    Helper function for switch_age_sex.
    
    Argument:
    s -- string containing the r/progresspics post title
    
    Returns: 
    sex -- sex (0 or 1) of r/progresspics post author, integer
    age -- age in years of r/progresspics post author, integer 
    """
    clean_s = s.upper()
    clean_list = clean_s.split("/", 2)
    sex = clean_sex(clean_list[1].replace(' ', ''))
    age = clean_age(clean_list[0].replace(' ', ''))
    return sex, age 


def switch_age_sex(df):
    """
    Identifies instances in the df where the "age" and "sex" columns contain NaNs.  Uses a helper function to 
    process the "sex" column as an "age" column and vice versa then updates the original dataframe. 
    
    Helper function for process_title. 
    
    Argument:
    pandas dataframe containning "sex" and "age" columns
    
    Returns: 
    pandas dataframe
    """
    switch = df[df["sex"].isnull() & df["age"].isnull()].copy()
    switch[["sex", "age"]] = switch.apply(lambda row: age_sex_redo(row["title"]), axis=1, result_type="expand")
    df.update(switch, overwrite=False)
    df.dropna(axis=0, inplace=True)
    return df 

 
def weight_in_kg(df):
    """
    Identifies entries where the weights are likely to have been entered in kilograms. Converts "start_weight"
    and "end_weight" values for these instances to lbs. 
    
    Helper function for process_title. 
    
    Argument:
    pandas df with "title", "start_weight", and "end_weight" columns
    
    Returns:
    pandas df 
    """
    kg_df = df.loc[(df["title"].str.contains(r"\d+\s*KG", flags=re.IGNORECASE)) & (df["start_weight"] < 160)].copy()
    kg_df["start_weight"] = kg_df["start_weight"] * 2.20462
    kg_df["end_weight"] = kg_df["end_weight"] * 2.20462
    df.update(kg_df)
    return df


def duration_in_weeks(period, unit):
    """Given a number and unit of time, converts the time duration to the equivalent number of weeks.
    
    Helper function for get_period_weeks. 

    Arguments:
    period - string containing a digit
    unit - one of the following strings: "day", "week", "month", or "year"

    Returns:
    a float representing a number of weeks
    """
    if unit.lower()[0] == 'd':
        period_weeks = float(period)/7
    elif unit.lower()[0] == 'w':
        period_weeks = float(period)
    elif unit.lower()[0] == "m":
        period_weeks = float(period) * 4
    elif unit.lower()[0] == 'y':
        period_weeks = float(period) * 52
    else:
        return np.nan 
    if period_weeks < 2 or period_weeks > 780:
        return np.nan
    else:
        return period_weeks
    

def get_period_weeks(s):
    """Identifies a string with the pattern of a digit followed by a unit of time.  Uses a helper function to 
    interpret the duration of time represented in weeks by the identified substring. 
    
    Helper function for process_title. 
    
    Argument:
    s -- string representing a r/progresspsics title 
    
    Returns:
    integer representing a number of weeks 
    """
    clean_s = s.lower().replace(' ', '')
    pattern = re.compile(r"(\d+\.*\d*|one|two|a)(day|week|month|year|mos)")
    result = pattern.search(clean_s)
    try:
        duration = result.group(1)
        unit = result.group(2)
    except:
        return np.nan
    if duration == "one" or duration == "a":
        duration = "1"
    if duration == "two":
        duration = "2"
    period_weeks = duration_in_weeks(duration, unit)
    return period_weeks

In [6]:
def process_title(df):
    """Takes a dataframe of r/progresspics post titles and related metadata and processes a copy of title column to 
    extract the age, sex, height, starting weight, ending weight, and time period of weight change of the
    user who created the title using helper functions. 
    Returns a new df that contains "age", "sex", "height", "start_weight", "end_weight", "period_weeks"
    and as well the unique index of the post so it can be merged with the original dataframe. 
    
    Argument:
    pandas dataframe
    
    Returns:
    pandas dataframe
    """
    temp_data = df.loc[:, ["id", "title"]].copy()
    print("starting df shape", temp_data.shape)
    temp_data = temp_data.reindex(columns=['id', 'title', 'sex', 'age', 'height', 'start_weight', 'end_weight'])
    temp_data[['sex', 'age', 'height', 'start_weight', 'end_weight']] = temp_data.apply(lambda row: get_stats(row["title"]), axis=1, result_type="expand")
    print("df shape after running get_stats", temp_data.shape)
    temp_data = temp_data[temp_data["sex"] != "unknown"]
    print("df shape after removing instances were stats could not be extracted from the title", temp_data.shape)
    temp_data = switch_age_sex(temp_data)
    temp_data = weight_in_kg(temp_data)
    temp_data["period_weeks"] = temp_data.apply(lambda row: get_period_weeks(row["title"]), axis=1)
    num_cols = ['sex', 'age', 'height', 'start_weight', 'end_weight'] 
    temp_data[num_cols] = temp_data[num_cols].apply(pd.to_numeric, errors="coerce") 
    temp_data.drop(["title"], axis=1, inplace=True)
    final_df = pd.merge(temp_data, df, how="left", on="id") 
    print("final shape of df", final_df.shape)
    return final_df

In [7]:
processed_data = process_title(raw_data)

starting df shape (27040, 2)
df shape after running get_stats (27040, 7)
df shape after removing instances were stats could not be extracted from the title (23379, 7)
final shape of df (22443, 15)


In [8]:
# remove instances with outlier feature

def remove_outlier_data(df):
    """Takes a dataframe and removes rows that contain outliers in the age, height, start_weight, and end_weight
    columns.  Outliers usually point to errors in extracting stats from the title so the entire row is removed. 
    Cut offs for outliers where determined experimentally and by examination of the raw data. 
    
    Argument:
    Pandas dataframe with "age", "height", "start_weight", and "end_weight" columns
    
    Returns:
    Pandas dataframe
    """
    print(df.shape)
    df = df[(df["age"] >= 13) & (df["age"] <= 68)]
    print("after removing age outliers", df.shape)
    df = df[(df["height"] >= 54) & (df["height"] <= 85)]
    print("after removing height outliers", df.shape)
    df = df[(df["start_weight"] >= 78) & (df["start_weight"] <= 775)]
    print("after removing start_weight outliers", df.shape)
    df = df[(df["end_weight"] >= 92) & (df["end_weight"] <= 601)]
    print("after removing end_weight outliers", df.shape)
    return df

In [9]:
pp_data_no = remove_outlier_data(processed_data)

(22443, 15)
after removing age outliers (22429, 15)
after removing height outliers (22094, 15)
after removing start_weight outliers (21759, 15)
after removing end_weight outliers (21484, 15)


In [10]:
# generate additional features

def create_num_posts(df):
    """
    Takes a dataframe and counts of the number of posts generated by each post author within the dataset.
    Adds the number of posts created by the author of each individual post to a new column, "num_posts".  
    Adds 0 if the post was authored by "[deleted]".
    
    Helper function for create_author_features. 
    
    Argument:
    pandas dataframe containing the columns "author" and "sex"
    
    Returns:
    pandas dataframe with a new column, "num_posts"
    """
    df["num_posts"] = df.groupby(['author'])['sex'].transform('count')
    df.loc[df["author"] == "[deleted]", "num_posts"] = 0
    return df


def create_num_posts_cat(df):
    """Creates a new column categorizing the number of posts by the author of each entry. The categories
    are:  posts authored by ["deleted'] whose num_post number is 0, authors who posted once, authors who posted 
    twice, authors who posted 3-4 times, authors who posted 5-8 times, and authors who posted more than 9 times. 
    
    Helper function for create_author_features. 
    
    Argument:
    pandas df with the column, "num_posts"
    
    Returns
    pandas df
    """
    df["num_posts_cat"] = pd.cut(df["num_posts"], [-np.inf, 0, 1, 2, 4, 8, np.inf])
    return df


def create_post_order(df):
    """If a post is the only one added by the author, it is assigned a 0.  If the post is the first one posted 
    chronologically by an author that posted many times, it is assigned a 1.  If the post is the second one posted
    chronologically by an author that posted many times, it is assigned a 2 and so on. 
    
    Helper function for create_author_features. 
    
    Argument:
    pandas dataframe containing the columns "num_posts", "author", and "created_utc"
    
    Returns:
    pandas dataframe containing the new column "post_order"
    """
    del_author = df[df["author"] == "[deleted]"].copy()
    del_author['post_order'] = 0
    no_del_author = df[df["author"] != "[deleted]"].copy()
    no_del_author.sort_values("created_utc", inplace=True)
    no_del_author["post_order"] = df.groupby(["author"])["created_utc"].rank("dense", ascending=True)
    new_df = pd.concat([del_author, no_del_author])
    return new_df

def create_author_features(df):
    """Extracts information about how many times an author has created a post in the dataset, then categorizes 
    that information.  "num_posts" and "num_posts_cat" stores this author info. Also labels posts by the same 
    author if they were posted first, second, third and so on.
    
    Argument:
    pandas dataframe with the columns "author" and "sex"
    
    Returns: 
    pandas dataframe with the new columns "num_posts", "num_posts_cat", and "post_order"
    """
    temp_df_one = create_num_posts(df)
    temp_df_two = create_num_posts_cat(temp_df_one)
    temp_df_three = create_post_order(temp_df_two)
    return temp_df_three

In [12]:
primary_features = create_author_features(pp_data_no)

In [13]:
primary_features.shape

(21484, 18)

In [14]:
primary_features.tail()

Unnamed: 0,id,sex,age,height,start_weight,end_weight,period_weeks,title,score,url,num_comments,created_utc,author,permalink,timestamp,num_posts,num_posts_cat,post_order
20532,abc759,0.0,22.0,67.0,306.0,217.0,,F/22/5’7” [306lbs&gt;217lbs=89lbs] Crazy chang...,457,https://i.redd.it/0l42v2p90p721.jpg,35,1546296632,jltw22,/r/progresspics/comments/abc759/f2257_306lbs21...,2018-12-31 16:50:32,4,"(2.0, 4.0]",4.0
22284,abc853,1.0,19.0,77.0,270.0,210.0,52.0,"M/19/6'5"" [270lbs &gt; 210lbs = 60lbs] (1 year...",329,https://i.redd.it/pl3mq4im0p721.jpg,20,1546296824,JerDude0711,/r/progresspics/comments/abc853/m1965_270lbs_2...,2018-12-31 16:53:44,1,"(0.0, 1.0]",1.0
22201,abcb9h,0.0,23.0,63.0,245.0,199.0,48.0,"F/23/5'3"" [245lbs &gt; 199lbs = 46lbs] (12 mon...",217,https://i.redd.it/4z0wlzmh2p721.jpg,17,1546297380,illusivealchemy,/r/progresspics/comments/abcb9h/f2353_245lbs_1...,2018-12-31 17:03:00,1,"(0.0, 1.0]",1.0
21496,abcqkw,1.0,25.0,69.0,235.0,165.0,52.0,"M/25/5'9"" [235 -&gt; 165 = 70lbs] (1 year) Try...",97,https://i.redd.it/3aq0tp2iap721.jpg,4,1546300074,kobebean21,/r/progresspics/comments/abcqkw/m2559_235_165_...,2018-12-31 17:47:54,4,"(2.0, 4.0]",4.0
20533,abcrg5,0.0,46.0,59.0,175.0,117.0,24.0,F/46/4'11 [175&gt;117=58] (6 months) Had enoug...,6842,https://i.redd.it/8vo11m3zap721.jpg,112,1546300233,sedagive99,/r/progresspics/comments/abcrg5/f46411_1751175...,2018-12-31 17:50:33,1,"(0.0, 1.0]",1.0


In [15]:
# save primary features to a .csv file 
primary_features.to_csv("progresspics_2018_primary_features.cvs", index=False)