## Feature Engineering

#### Summary: This notebook will take a few characterisitcs and engineer features for them. This ranges from creating columns with binary values to sorting certain variables into general categories.

#### 0. Gender
#### 1. Nationality
#### 2. Genre/Niche
#### 3. Skills
#### 4. Education 
#### 5. MBTI

In [1]:
import pandas as pd
import os 
import re

In [2]:
os.getcwd()
# step out of the "notebooks" folder
p = os.chdir("..")

In [3]:
data = pd.read_csv("data/processed/chatgpt_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Gender,Age,Nationality,Years_Active,Genre_Niche,Beauty_Level,Skills,Education,...,Network_Size,MBTI_Personality,Income_Per_Year,First_Name,Last_Name,Genre_Niche_1,Genre_Niche_2,Skill_1,Skill_2,Skill_3
0,0,Ava Johnson,Female,32,American,7,Lifestyle,8,"Fashion, Photography, Public Speaking",Bachelor's Degree in Marketing,...,5.0,ENFJ,200000,Ava,Johnson,Lifestyle,,Fashion,Photography,Public Speaking
1,1,Jackson Lee,Male,28,British,5,Fitness,7,"Personal Training, Nutrition, Motivational Spe...",High School Diploma,...,4.0,ISTP,150000,Jackson,Lee,Fitness,,Personal Training,Nutrition,Motivational Speaking
2,2,Sophie Garcia,Female,35,Spanish,8,Beauty & Makeup,9,"Makeup Artistry, Skincare, Content Creation",Master's Degree in Cosmetology,...,6.0,INFJ,300000,Sophie,Garcia,Beauty & Makeup,,Makeup Artistry,Skincare,Content Creation
3,3,Ryan Patel,Male,30,Indian,6,Food & Cooking,6,"Culinary Skills, Recipe Development, Food Phot...",Culinary School,...,5.0,ESTP,180000,Ryan,Patel,Food & Cooking,,Culinary Skills,Recipe Development,Food Photography
4,4,Emma Chen,Female,25,Chinese,4,Travel & Adventure,7,"Photography, Writing, Adventure Sports",Bachelor's Degree in Journalism,...,4.0,ENFP,120000,Emma,Chen,Travel & Adventure,,Photography,Writing,Adventure Sports


### 0. Gender

In [4]:
# create a column for men and a column for women
temp = pd.get_dummies(data["Gender"])
data = pd.concat([data, temp], axis = 1).reindex(data.index)
data.drop("Gender", axis = 1, inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Nationality,Years_Active,Genre_Niche,Beauty_Level,Skills,Education,Award_Wins,...,Income_Per_Year,First_Name,Last_Name,Genre_Niche_1,Genre_Niche_2,Skill_1,Skill_2,Skill_3,Female,Male
0,0,Ava Johnson,32,American,7,Lifestyle,8,"Fashion, Photography, Public Speaking",Bachelor's Degree in Marketing,3.0,...,200000,Ava,Johnson,Lifestyle,,Fashion,Photography,Public Speaking,1,0
1,1,Jackson Lee,28,British,5,Fitness,7,"Personal Training, Nutrition, Motivational Spe...",High School Diploma,2.0,...,150000,Jackson,Lee,Fitness,,Personal Training,Nutrition,Motivational Speaking,0,1
2,2,Sophie Garcia,35,Spanish,8,Beauty & Makeup,9,"Makeup Artistry, Skincare, Content Creation",Master's Degree in Cosmetology,5.0,...,300000,Sophie,Garcia,Beauty & Makeup,,Makeup Artistry,Skincare,Content Creation,1,0
3,3,Ryan Patel,30,Indian,6,Food & Cooking,6,"Culinary Skills, Recipe Development, Food Phot...",Culinary School,3.0,...,180000,Ryan,Patel,Food & Cooking,,Culinary Skills,Recipe Development,Food Photography,0,1
4,4,Emma Chen,25,Chinese,4,Travel & Adventure,7,"Photography, Writing, Adventure Sports",Bachelor's Degree in Journalism,2.0,...,120000,Emma,Chen,Travel & Adventure,,Photography,Writing,Adventure Sports,1,0


### 1. Nationality

In [5]:
data["Nationality"].value_counts()

Vietnamese       120
Indian            93
Korean            87
Mexican           81
American          35
Russian           34
Spanish           30
Chinese           25
Pakistani         18
British           17
Saudi Arabian     14
Canadian           9
Egyptian           7
Japanese           5
Omani              5
Brazilian          4
Lebanese           3
Colombian          3
Australian         3
Italian            3
Emirati            2
Kuwaiti            2
Bangladeshi        2
Irish              2
Portuguese         1
Bahraini           1
German             1
Name: Nationality, dtype: int64

In [6]:
# generalize into continent based variables, creating columns for each of these continets for binary values
european = ["British","Italian","Irish", "Portuguese", "German", "Spanish"]
asian = ["Vietnamese", "Indian", "Russian", "Chinese","Japanese","Bangladeshi", "Korean"]
middle_east = ["Pakistani","Bahraini"]
north_american = ["Mexican", "American", "Canadian"]
south_american = ["Brazilian","Colombian"]
oceania = ["Australian"]

def is_european(x): 
    if x['Nationality'] in european:
        return 1
    else:
        return 0
    
def is_asian(x): 
    if x['Nationality'] in asian:
        return 1
    else:
        return 0
    
def is_middle_east(x): 
    if x['Nationality'] in middle_east:
        return 1
    else:
        return 0
    
def is_north_american(x): 
    if x['Nationality'] in north_american:
        return 1
    else: 
        return 0
    
def is_south_american(x): 
    if x['Nationality'] in south_american:
        return 1
    else:
        return 0

def is_oceania(x): 
    if x['Nationality'] in oceania:
        return 1
    else: 
        return 0

data["European"] = data.apply(is_european, axis = 1)
data["Asian"] = data.apply(is_asian, axis = 1)
data["Middle_Eastern"] = data.apply(is_middle_east, axis = 1)
data["North_American"] = data.apply(is_north_american, axis = 1)
data["South_American"] = data.apply(is_south_american, axis = 1)
data["Oceanic"] = data.apply(is_oceania, axis = 1)

In [7]:
data.drop("Nationality", axis = 1, inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Genre_Niche,Beauty_Level,Skills,Education,Award_Wins,Media_Mentions,...,Skill_2,Skill_3,Female,Male,European,Asian,Middle_Eastern,North_American,South_American,Oceanic
0,0,Ava Johnson,32,7,Lifestyle,8,"Fashion, Photography, Public Speaking",Bachelor's Degree in Marketing,3.0,500.0,...,Photography,Public Speaking,1,0,0,0,0,1,0,0
1,1,Jackson Lee,28,5,Fitness,7,"Personal Training, Nutrition, Motivational Spe...",High School Diploma,2.0,300.0,...,Nutrition,Motivational Speaking,0,1,1,0,0,0,0,0
2,2,Sophie Garcia,35,8,Beauty & Makeup,9,"Makeup Artistry, Skincare, Content Creation",Master's Degree in Cosmetology,5.0,600.0,...,Skincare,Content Creation,1,0,1,0,0,0,0,0
3,3,Ryan Patel,30,6,Food & Cooking,6,"Culinary Skills, Recipe Development, Food Phot...",Culinary School,3.0,400.0,...,Recipe Development,Food Photography,0,1,0,1,0,0,0,0
4,4,Emma Chen,25,4,Travel & Adventure,7,"Photography, Writing, Adventure Sports",Bachelor's Degree in Journalism,2.0,250.0,...,Writing,Adventure Sports,1,0,0,1,0,0,0,0


### 2. Genre/Niche

In [8]:
data['Genre_Niche'].unique()

array(['Lifestyle', 'Fitness', 'Beauty & Makeup', 'Food & Cooking',
       'Travel & Adventure', 'Technology', 'Fashion & Beauty',
       'Gaming & Esports', 'Lifestyle & Wellness', 'Parenting & Family',
       'Fashion & Lifestyle', 'Music', 'Fitness & Wellness',
       'Photography & Travel', 'Beauty & Fashion', 'Comedy',
       'Fitness & Nutrition', 'Fashion & Style',
       'Technology & Innovation', 'Music & Entertainment',
       'Culinary Arts', 'Gaming & Streaming', 'Comedy & Entertainment',
       'Beauty & Lifestyle', 'Film & Media Production',
       'Health & Wellness', 'Digital Marketing', 'Lifestyle & Travel'],
      dtype=object)

In [9]:
# generalize into larger categories and creating columns for each
entertainment = ['Music','Film & Media Production','Comedy', 'Music & Entertainment','Comedy & Entertainment','Gaming & Esports','Gaming & Streaming','Comedy']
food = ['Food & Cooking', 'Culinary Arts']
travel = ['Travel & Adventure','Photography & Travel', 'Lifestyle & Travel']
health = ['Fitness','Lifestyle','Lifestyle & Wellness','Health & Wellness','Fitness & Nutrition', 'Parenting & Family', 'Fitness & Wellness']
beauty = ['Beauty & Makeup','Fashion & Beauty','Beauty & Lifestyle', 'Fashion & Lifestyle', 'Fasion & Style']
tech = ['Technology','Digital Marketing', 'Technology & Innovation']


def is_entertainment(x): 
    if x['Genre_Niche'] in entertainment:
        return 1
    else:
        return 0
    
def is_food(x): 
    if x['Genre_Niche'] in food:
        return 1
    else:
        return 0
    
def is_travel(x): 
    if x['Genre_Niche'] in travel:
        return 1
    else:
        return 0
    
def is_health(x): 
    if x['Genre_Niche'] in health:
        return 1
    else: 
        return 0
    
def is_beauty(x): 
    if x['Genre_Niche'] in beauty:
        return 1
    else:
        return 0

def is_tech(x): 
    if x['Genre_Niche'] in tech:
        return 1
    else: 
        return 0

data["Entertainment"] = data.apply(is_entertainment, axis = 1)
data["Food"] = data.apply(is_food, axis = 1)
data["Travel"] = data.apply(is_travel, axis = 1)
data["Health"] = data.apply(is_health, axis = 1)
data["Beauty"] = data.apply(is_beauty, axis = 1)
data["Tech"] = data.apply(is_tech, axis = 1)

In [10]:
data.drop("Genre_Niche", axis = 1, inplace = True)
data.drop("Genre_Niche_1", axis = 1, inplace = True)
data.drop("Genre_Niche_2", axis = 1, inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Beauty_Level,Skills,Education,Award_Wins,Media_Mentions,Social_Media_Followers,...,Middle_Eastern,North_American,South_American,Oceanic,Entertainment,Food,Travel,Health,Beauty,Tech
0,0,Ava Johnson,32,7,8,"Fashion, Photography, Public Speaking",Bachelor's Degree in Marketing,3.0,500.0,1000000.0,...,0,1,0,0,0,0,0,1,0,0
1,1,Jackson Lee,28,5,7,"Personal Training, Nutrition, Motivational Spe...",High School Diploma,2.0,300.0,800000.0,...,0,0,0,0,0,0,0,1,0,0
2,2,Sophie Garcia,35,8,9,"Makeup Artistry, Skincare, Content Creation",Master's Degree in Cosmetology,5.0,600.0,1200000.0,...,0,0,0,0,0,0,0,0,1,0
3,3,Ryan Patel,30,6,6,"Culinary Skills, Recipe Development, Food Phot...",Culinary School,3.0,400.0,900000.0,...,0,0,0,0,0,1,0,0,0,0
4,4,Emma Chen,25,4,7,"Photography, Writing, Adventure Sports",Bachelor's Degree in Journalism,2.0,250.0,600000.0,...,0,0,0,0,0,0,1,0,0,0


### 3. Skills - Ended up Removing

In [11]:
data['Skill_1'].unique()

array(['Fashion', 'Personal Training', 'Makeup Artistry',
       'Culinary Skills', 'Photography', 'Programming', 'Fashion Design',
       'Gaming Skills', 'Yoga', 'Parenting Tips', 'Styling',
       'Songwriting', 'Stand-up Comedy', 'Travel Photography',
       'Fashion Styling', 'Music Production', 'Cooking', 'Travel Writing',
       'Software Engineering', 'Singing', 'Software Development',
       'Travel Blogging', 'Chef', 'Skincare', 'Comedy Writing',
       'Data Science', 'Directing', 'Nutrition Coaching',
       'Social Media Marketing'], dtype=object)

In [12]:
data = data.loc[:,~data.columns.duplicated()].copy()
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Beauty_Level,Skills,Education,Award_Wins,Media_Mentions,Social_Media_Followers,...,Middle_Eastern,North_American,South_American,Oceanic,Entertainment,Food,Travel,Health,Beauty,Tech
0,0,Ava Johnson,32,7,8,"Fashion, Photography, Public Speaking",Bachelor's Degree in Marketing,3.0,500.0,1000000.0,...,0,1,0,0,0,0,0,1,0,0
1,1,Jackson Lee,28,5,7,"Personal Training, Nutrition, Motivational Spe...",High School Diploma,2.0,300.0,800000.0,...,0,0,0,0,0,0,0,1,0,0
2,2,Sophie Garcia,35,8,9,"Makeup Artistry, Skincare, Content Creation",Master's Degree in Cosmetology,5.0,600.0,1200000.0,...,0,0,0,0,0,0,0,0,1,0
3,3,Ryan Patel,30,6,6,"Culinary Skills, Recipe Development, Food Phot...",Culinary School,3.0,400.0,900000.0,...,0,0,0,0,0,1,0,0,0,0
4,4,Emma Chen,25,4,7,"Photography, Writing, Adventure Sports",Bachelor's Degree in Journalism,2.0,250.0,600000.0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
data.drop("Skills", axis = 1, inplace = True)
data.drop("Skill_1", axis = 1, inplace = True)
data.drop("Skill_2", axis = 1, inplace = True)
data.drop("Skill_3", axis = 1, inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Beauty_Level,Education,Award_Wins,Media_Mentions,Social_Media_Followers,Social_Media_Likes,...,Middle_Eastern,North_American,South_American,Oceanic,Entertainment,Food,Travel,Health,Beauty,Tech
0,0,Ava Johnson,32,7,8,Bachelor's Degree in Marketing,3.0,500.0,1000000.0,50000.0,...,0,1,0,0,0,0,0,1,0,0
1,1,Jackson Lee,28,5,7,High School Diploma,2.0,300.0,800000.0,40000.0,...,0,0,0,0,0,0,0,1,0,0
2,2,Sophie Garcia,35,8,9,Master's Degree in Cosmetology,5.0,600.0,1200000.0,70000.0,...,0,0,0,0,0,0,0,0,1,0
3,3,Ryan Patel,30,6,6,Culinary School,3.0,400.0,900000.0,45000.0,...,0,0,0,0,0,1,0,0,0,0
4,4,Emma Chen,25,4,7,Bachelor's Degree in Journalism,2.0,250.0,600000.0,35000.0,...,0,0,0,0,0,0,1,0,0,0


### 4. Education

In [14]:
data["Education"].value_counts()

Bachelor's Degree in Fashion Design               81
Culinary School                                   71
Bachelor's Degree in Music                        69
Bachelor's Degree in Theater Arts                 51
Bachelor's Degree in Computer Science             50
Master's Degree in Computer Science               48
Bachelor's Degree in Journalism                   38
Bachelor's Degree in Tourism                      34
Bachelor's Degree in Early Childhood Education    34
Bachelor's Degree in Cosmetology                  33
Bachelor's Degree in Nutrition                    23
Self-Taught                                       19
Bachelor's Degree in Psychology                   14
Master's Degree in Computer Engineering           11
Bachelor's Degree in Film Production               5
Bachelor's Degree in Exercise Science              4
Bachelor's Degree in Film Studies                  4
Bachelor's Degree in Visual Arts                   4
Bachelor's Degree in Marketing                

In [15]:
# self determined general degrees for categoires of education elvel
education_options = ["Self-Taught", "High School Diploma", "Culinary School", "Bachelor's", "Master's"]

In [16]:
def change_education(x):
    edu = x["Education"]
    op = None
    for i in education_options:
        if i in edu:
            op = i
    return op
    

data["General_Education"] = data.apply(change_education, axis = 1)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Beauty_Level,Education,Award_Wins,Media_Mentions,Social_Media_Followers,Social_Media_Likes,...,North_American,South_American,Oceanic,Entertainment,Food,Travel,Health,Beauty,Tech,General_Education
0,0,Ava Johnson,32,7,8,Bachelor's Degree in Marketing,3.0,500.0,1000000.0,50000.0,...,1,0,0,0,0,0,1,0,0,Bachelor's
1,1,Jackson Lee,28,5,7,High School Diploma,2.0,300.0,800000.0,40000.0,...,0,0,0,0,0,0,1,0,0,High School Diploma
2,2,Sophie Garcia,35,8,9,Master's Degree in Cosmetology,5.0,600.0,1200000.0,70000.0,...,0,0,0,0,0,0,0,1,0,Master's
3,3,Ryan Patel,30,6,6,Culinary School,3.0,400.0,900000.0,45000.0,...,0,0,0,0,1,0,0,0,0,Culinary School
4,4,Emma Chen,25,4,7,Bachelor's Degree in Journalism,2.0,250.0,600000.0,35000.0,...,0,0,0,0,0,1,0,0,0,Bachelor's


In [17]:
temp = pd.get_dummies(data["General_Education"])
data = pd.concat([data, temp], axis = 1).reindex(data.index)
data.drop(["Education", "General_Education"], axis = 1, inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Beauty_Level,Award_Wins,Media_Mentions,Social_Media_Followers,Social_Media_Likes,Network_Size,...,Food,Travel,Health,Beauty,Tech,Bachelor's,Culinary School,High School Diploma,Master's,Self-Taught
0,0,Ava Johnson,32,7,8,3.0,500.0,1000000.0,50000.0,5.0,...,0,0,1,0,0,1,0,0,0,0
1,1,Jackson Lee,28,5,7,2.0,300.0,800000.0,40000.0,4.0,...,0,0,1,0,0,0,0,1,0,0
2,2,Sophie Garcia,35,8,9,5.0,600.0,1200000.0,70000.0,6.0,...,0,0,0,1,0,0,0,0,1,0
3,3,Ryan Patel,30,6,6,3.0,400.0,900000.0,45000.0,5.0,...,1,0,0,0,0,0,1,0,0,0
4,4,Emma Chen,25,4,7,2.0,250.0,600000.0,35000.0,4.0,...,0,1,0,0,0,1,0,0,0,0


### 5. MBTI

In [18]:
# generalize between introvert and extrovert
def is_int(x): 
    if 'I' in x['MBTI_Personality']:
        return 1
    else: 
        return 0
    
def is_ex(x): 
    if 'E' in x['MBTI_Personality']:
        return 1
    else: 
        return 0
    
data["Introvert"] = data.apply(is_int, axis = 1)
data["Extrovert"] = data.apply(is_ex, axis = 1)

In [19]:

data.drop(['MBTI_Personality'], axis = 1, inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Years_Active,Beauty_Level,Award_Wins,Media_Mentions,Social_Media_Followers,Social_Media_Likes,Network_Size,...,Health,Beauty,Tech,Bachelor's,Culinary School,High School Diploma,Master's,Self-Taught,Introvert,Extrovert
0,0,Ava Johnson,32,7,8,3.0,500.0,1000000.0,50000.0,5.0,...,1,0,0,1,0,0,0,0,0,1
1,1,Jackson Lee,28,5,7,2.0,300.0,800000.0,40000.0,4.0,...,1,0,0,0,0,1,0,0,1,0
2,2,Sophie Garcia,35,8,9,5.0,600.0,1200000.0,70000.0,6.0,...,0,1,0,0,0,0,1,0,1,0
3,3,Ryan Patel,30,6,6,3.0,400.0,900000.0,45000.0,5.0,...,0,0,0,0,1,0,0,0,0,1
4,4,Emma Chen,25,4,7,2.0,250.0,600000.0,35000.0,4.0,...,0,0,0,1,0,0,0,0,0,1


In [20]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Age,Years_Active,Beauty_Level,Award_Wins,Media_Mentions,Social_Media_Followers,Social_Media_Likes,Network_Size,Income_Per_Year,...,Health,Beauty,Tech,Bachelor's,Culinary School,High School Diploma,Master's,Self-Taught,Introvert,Extrovert
count,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,...,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0,607.0
mean,372.512356,30.431631,7.07084,7.790774,2.846787,456.029654,965782.5,53080.724876,4.461285,221004.942339,...,0.130148,0.186161,0.103789,0.746293,0.116969,0.001647,0.103789,0.031301,0.584843,0.415157
std,221.797613,2.475928,1.232102,0.720318,0.706816,80.46822,143295.7,10828.016808,0.5402,34899.359909,...,0.336744,0.389558,0.305238,0.435491,0.321648,0.040589,0.305238,0.174275,0.493155,0.493155
min,0.0,25.0,4.0,6.0,1.0,200.0,500000.0,25000.0,3.0,120000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,207.5,29.0,6.0,7.0,2.0,400.0,850000.0,45000.0,4.0,200000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,359.0,30.0,7.0,8.0,3.0,450.0,950000.0,50000.0,4.0,220000.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,568.5,32.0,8.0,8.0,3.0,500.0,1000000.0,60000.0,5.0,240000.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
max,818.0,40.0,11.0,9.0,5.0,700.0,1500000.0,80000.0,6.0,400000.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
data.to_csv("data/processed/chatgpt_data_final.csv")

#### Please see linear_regression for the next step.