# 3. Feature Engineering

In [8]:
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import power_transform

warnings.filterwarnings("ignore")

In [10]:
# Load the cleaned dataset produced earlier in the project and preview its first rows.
df_cleaned = pd.read_csv(filepath_or_buffer='data/df_cleaned.csv')
df_cleaned.head(n=5)

Unnamed: 0,memberID,holdtime,puzzlepack,pack_name,piece_count_1,piece_count_2,difficulty_rating_1,difficulty_rating_2,brand_1,brand_2,num_puzzles
0,member1,2.939411,Artifact Puzzles Justin Hillgrove Word Travels...,Artifact Puzzles Justin Hillgrove Word Travels...,456,548,1,2,Artifact,Artifact,2
1,member1,0.998885,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,195,220,1,3,DaVici,DaVici,2
2,member1,10.865032,DaVici Puzzles Flying Frigate DaVici Puzzles H...,DaVici Puzzles Flying Frigate DaVici Puzzles H...,332,164,1,1,DaVici,DaVici,2
3,member1,22.083971,Liberty Puzzles Haeckel Hummingbirds Nautilus ...,Liberty Puzzles Haeckel Hummingbirds Nautilus ...,485,222,2,2,Liberty,Nautilus,2
4,member1,5.077603,DaVici Puzzles Diana Zimens City Of Cats,DaVici Puzzles Diana Zimens City Of Cats,700,0,2,2,DaVici,DaVici,1


### First feature: number of pieces at each difficulty level (d1, d2, d3, d4)

In [11]:
# Adapted from Jon's branch.
# For every pack, build one record holding the total number of pieces at each
# difficulty level (pieces_d1 .. pieces_d4), plus the pack name and puzzle count.
pieces_by_difficulty = []

for _, row in df_cleaned.iterrows():
    out = {
        'name': row['pack_name'],
        'pieces_d1': 0,
        'pieces_d2': 0,
        'pieces_d3': 0,
        'pieces_d4': 0,
        'num_puzzles': row['num_puzzles'],
    }

    # First puzzle: add its pieces to the bucket of its difficulty rating,
    # skipping packs whose piece count is missing.
    if pd.notna(row['piece_count_1']):
        out[f"pieces_d{int(row['difficulty_rating_1'])}"] += row['piece_count_1']

    # Second puzzle (two-puzzle packs only). The guard must look at the second
    # puzzle's OWN piece count: checking piece_count_1 here would add NaN to the
    # bucket when piece_count_2 is missing, and would drop a valid second puzzle
    # whenever the first count is missing.
    if row['num_puzzles'] == 2 and pd.notna(row['piece_count_2']):
        out[f"pieces_d{int(row['difficulty_rating_2'])}"] += row['piece_count_2']

    pieces_by_difficulty.append(out)

In [12]:
# Turn the per-pack records into a DataFrame (one row per pack) and preview it.
pieces_df = pd.DataFrame(data=pieces_by_difficulty)
pieces_df.head(n=5)

Unnamed: 0,name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles
0,Artifact Puzzles Justin Hillgrove Word Travels...,456,548,0,0,2
1,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,195,0,220,0,2
2,DaVici Puzzles Flying Frigate DaVici Puzzles H...,496,0,0,0,2
3,Liberty Puzzles Haeckel Hummingbirds Nautilus ...,0,707,0,0,2
4,DaVici Puzzles Diana Zimens City Of Cats,0,700,0,0,1


### Second feature: total piece count and average difficulty at pack level

In [16]:
# Pack-level features, computed column-wise instead of row by row.
# The previous loop assigned through chained indexing (df[col][i] = value),
# which writes to a possibly temporary object: pandas warns about it (hidden
# here by the global warnings filter) and under Copy-on-Write the write never
# reaches df_cleaned. Whole-column assignment avoids the problem and does not
# depend on the index being 0..n-1.
is_single_puzzle = df_cleaned['num_puzzles'] == 1

# Single-puzzle packs keep the first puzzle's piece count; every other pack
# gets the sum of both puzzles.
df_cleaned['piece_count_pack'] = np.where(
    is_single_puzzle,
    df_cleaned['piece_count_1'],
    df_cleaned['piece_count_1'] + df_cleaned['piece_count_2'],
)

# Single-puzzle packs keep the first puzzle's rating; every other pack gets the
# floor of the mean of both ratings (integer division, as before).
df_cleaned['difficulty_rating_pack'] = np.where(
    is_single_puzzle,
    df_cleaned['difficulty_rating_1'],
    (df_cleaned['difficulty_rating_1'] + df_cleaned['difficulty_rating_2']) // 2,
)

### Third feature: combining difficulty and piece count into a single weighted column

In [None]:
# Adapted from Madalyn's branch.
# Difficulty-weighted piece count: pieces at difficulty k count k times.
pieces_df['w_pieces_diff'] = (
    pieces_df['pieces_d1']
    + pieces_df['pieces_d2'] * 2
    + pieces_df['pieces_d3'] * 3
    + pieces_df['pieces_d4'] * 4
)

# Box-Cox transform of the weighted piece count (power_transform is imported in
# the notebook's import cell). Box-Cox only accepts strictly positive values,
# and packs whose piece counts were missing are left at weight 0 by the
# per-difficulty step, so those rows are excluded from the fit and keep NaN
# instead of making the whole cell raise.
has_pieces = pieces_df['w_pieces_diff'] > 0
pieces_df['w_pieces_diff_transformed'] = np.nan
if has_pieces.any():
    # power_transform returns an (n, 1) array; flatten it to a 1-D column.
    pieces_df.loc[has_pieces, 'w_pieces_diff_transformed'] = power_transform(
        pieces_df.loc[has_pieces, ['w_pieces_diff']], method='box-cox'
    ).ravel()


In [None]:
# Create the bin edges and bin labels for the transformed weighted piece score.
# 20 bins is an arbitrary starting point and can be tuned.
N_BINS = 20
transformed = pieces_df['w_pieces_diff_transformed']

# np.linspace returns exactly N_BINS + 1 edges and pins the first and last edge
# to the observed min and max. np.arange with a floating-point step can return
# one edge more or fewer and a last edge that does not land on the max, which
# would shift the label count or leave the largest values unbinned.
bins = np.linspace(transformed.min(), transformed.max(), N_BINS + 1)
labels = [f'bin{k}' for k in range(N_BINS)]

# Label each pack by its bin; include_lowest=True keeps the minimum value in bin0.
pieces_df['bin_label'] = pd.cut(x=transformed, bins=bins, labels=labels, include_lowest=True)
pieces_df.head()

In [None]:
# (rows, columns) of both frames; the row counts should agree before the
# column-wise concatenation further down.
tuple(frame.shape for frame in (df_cleaned, pieces_df))

In [None]:
# Show the columns now present on df_cleaned, including the pack-level
# columns (piece_count_pack, difficulty_rating_pack) added above.
df_cleaned.columns

In [None]:
# Columns taken from the cleaned data: pack name, holdtime, the raw per-puzzle
# attributes and the pack-level features.
cleaned_columns = [
    'pack_name', 'holdtime',
    'piece_count_1', 'piece_count_2',
    'difficulty_rating_1', 'difficulty_rating_2',
    'brand_1', 'piece_count_pack', 'difficulty_rating_pack',
    'brand_2', 'num_puzzles',
]
# Columns taken from the engineered frame: per-difficulty piece counts and the
# transformed weighted score.
engineered_columns = [
    'pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4',
    'w_pieces_diff_transformed',
]

concat_df1 = df_cleaned[cleaned_columns]
concat_df2 = pieces_df[engineered_columns]

In [None]:
# pd.concat(axis=1) aligns rows on index labels, not on position. If the two
# frames ever stop sharing the same index (e.g. after filtering one of them),
# the result would silently gain NaN-filled rows, so fail loudly instead.
assert concat_df1.index.equals(concat_df2.index), (
    "concat_df1 and concat_df2 are not row-aligned; cannot combine column-wise"
)
df_features_combined = pd.concat([concat_df1, concat_df2], axis=1)

In [None]:
# Fix the column order of the output table: raw per-puzzle attributes first,
# then the engineered features, with holdtime as the last column.
output_columns = [
    'pack_name',
    'piece_count_1', 'piece_count_2',
    'difficulty_rating_1', 'difficulty_rating_2',
    'brand_1', 'brand_2',
    'num_puzzles',
    'pieces_d1', 'pieces_d2', 'pieces_d3', 'pieces_d4',
    'piece_count_pack', 'difficulty_rating_pack',
    'w_pieces_diff_transformed',
    'holdtime',
]
df_features_combined = df_features_combined.loc[:, output_columns]

In [None]:
# Write the combined feature table to disk; index=False keeps the row index
# out of the CSV.
df_features_combined.to_csv(path_or_buf='data/df_features_combined.csv', index=False)