In [29]:
#Import items

from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [30]:
# Raw data file
file_to_load = "user_profiles.csv"

# Read purchasing file and store into pandas data frame
df= pd.read_csv(file_to_load)
df

Unnamed: 0.1,Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,0,22,a little extra,strictly anything,socially,never,declined to answer,"asian, white",75.0,29592,...,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,1,35,average,mostly other,often,sometimes,college/university,white,70.0,48630,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,2,38,thin,anything,socially,,masters program,,68.0,60812,...,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,3,23,thin,vegetarian,socially,,declined to answer,white,71.0,18578,...,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,4,29,athletic,,socially,never,college/university,"asian, black, other",66.0,94691,...,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,59941,59,,,socially,never,college/university,,62.0,16612,...,"oakland, california",has kids,straight,has dogs,catholicism but not too serious about it,f,cancer and it&rsquo;s fun to think about,no,english,single
59942,59942,24,fit,mostly anything,often,sometimes,declined to answer,"white, other",72.0,118254,...,"san francisco, california",doesn&rsquo;t have kids,straight,likes dogs and likes cats,agnosticism,m,leo but it doesn&rsquo;t matter,no,english (fluently),single
59943,59943,42,average,mostly anything,not at all,never,masters program,asian,71.0,42318,...,"south san francisco, california",doesn&rsquo;t have kids,straight,,christianity but not too serious about it,m,sagittarius but it doesn&rsquo;t matter,no,english (fluently),single
59944,59944,27,athletic,mostly anything,socially,often,declined to answer,"asian, black",73.0,218886,...,"san francisco, california","doesn&rsquo;t have kids, but wants them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,leo and it&rsquo;s fun to think about,trying to quit,"english (fluently), spanish (poorly), chinese ...",single


In [31]:
# Keep columns that will assist on determining body type.

working_df = df.drop(columns=['sign', 'speaks', 'status', 'last_online', 'income', 'location', 'job', 'education', 'orientation', 'religion', 'ethnicity'])
working_df = working_df.drop(columns=['offspring', 'pets', 'Unnamed: 0'])
working_df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,strictly anything,socially,never,75.0,m,sometimes
1,35,average,mostly other,often,sometimes,70.0,m,no
2,38,thin,anything,socially,,68.0,m,no
3,23,thin,vegetarian,socially,,71.0,m,no
4,29,athletic,,socially,never,66.0,m,no
...,...,...,...,...,...,...,...,...
59941,59,,,socially,never,62.0,f,no
59942,24,fit,mostly anything,often,sometimes,72.0,m,no
59943,42,average,mostly anything,not at all,never,71.0,m,no
59944,27,athletic,mostly anything,socially,often,73.0,m,trying to quit


In [32]:
# Drop NaN values
working_df = working_df.dropna()
working_df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,strictly anything,socially,never,75.0,m,sometimes
1,35,average,mostly other,often,sometimes,70.0,m,no
7,31,average,mostly anything,socially,never,65.0,f,no
9,37,athletic,mostly anything,not at all,never,65.0,m,no
11,28,average,mostly anything,socially,never,72.0,m,no
...,...,...,...,...,...,...,...,...
59935,33,curvy,anything,socially,never,67.0,f,when drinking
59936,25,average,mostly anything,socially,never,61.0,f,no
59942,24,fit,mostly anything,often,sometimes,72.0,m,no
59943,42,average,mostly anything,not at all,never,71.0,m,no


In [33]:
#See what columns we're working with    

list(working_df.columns)

['age', 'body_type', 'diet', 'drinks', 'drugs', 'height', 'sex', 'smokes']

In [34]:
# Remove inaccurate ages

working_df = working_df[working_df.age != 109]
working_df = working_df[working_df.age != 111]

# Combine data that makes sense
working_df = working_df.replace({'diet':{'strictly anything': 'anything',
        'mostly anything': 'anything',
        'strictly halal': 'halal',
        'mostly halal':'halal',
        'strictly kosher': 'kosher',
        'mostly kosher':'kosher',
        'strictly vegan':'vegan',
        'mostly vegan': 'vegan',
        'strictly vegetarian':'vegetarian',
        'mostly vegetarian':'vegetarian',
        'strictly other': 'other',
        'mostly other': 'other'}})


# Remove values that have declined to answer, since they will be unhelpful        

values = ['declined to answer']        

working_df = working_df[working_df.age.isin(values) == False]
working_df = working_df[working_df.diet.isin(values) == False]
working_df = working_df[working_df.body_type.isin(values) == False]
working_df = working_df[working_df.drinks.isin(values) == False]
working_df = working_df[working_df.drugs.isin(values) == False]
working_df = working_df[working_df.height.isin(values) == False]
working_df = working_df[working_df.sex.isin(values) == False]
working_df = working_df[working_df.smokes.isin(values) == False]
working_df = working_df[working_df.body_type != 'rather not say']

working_df


Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,anything,socially,never,75.0,m,sometimes
1,35,average,other,often,sometimes,70.0,m,no
7,31,average,anything,socially,never,65.0,f,no
9,37,athletic,anything,not at all,never,65.0,m,no
11,28,average,anything,socially,never,72.0,m,no
...,...,...,...,...,...,...,...,...
59935,33,curvy,anything,socially,never,67.0,f,when drinking
59936,25,average,anything,socially,never,61.0,f,no
59942,24,fit,anything,often,sometimes,72.0,m,no
59943,42,average,anything,not at all,never,71.0,m,no


In [35]:
working_df.to_csv('gender_guesser_cleaned',index=False)

In [36]:
# Get unique body type values
working_df['body_type'].value_counts()

average           6802
fit               5742
athletic          5366
thin              2191
curvy             1843
a little extra    1312
skinny             804
full figured       464
overweight         227
jacked             191
used up            182
Name: body_type, dtype: int64

Because there is an extreme amount of average values, as compared to overweight & jacked, then we can assume that our model will be more inclined to predict average body types as opposed to the other types.

In [37]:
# Clean up body type values
cleaned_BT_df = working_df.replace({'body_type': {'athletic':'fit', 'full figured':'curvy', 'a little extra':'curvy', 'jacked':'fit', 'skinny':'thin'}})
cleaned_BT_df['body_type'].value_counts()

fit           11299
average        6802
curvy          3619
thin           2995
overweight      227
used up         182
Name: body_type, dtype: int64

In [38]:
# Get unique values of diet
cleaned_BT_df['smokes'].value_counts()

no                20349
sometimes          1679
when drinking      1425
yes                1023
trying to quit      648
Name: smokes, dtype: int64

In [39]:
cleaned_BT_df['age'].value_counts()

26    1577
27    1533
25    1441
28    1396
24    1382
29    1325
30    1264
23    1176
31    1091
32    1037
22     906
33     883
34     747
35     676
21     623
36     616
37     582
38     534
20     491
39     446
42     433
41     409
40     391
43     362
19     301
44     289
45     280
46     226
48     223
47     213
49     201
50     194
52     173
18     165
51     155
54     133
56     130
53     116
59     113
57     108
55     107
61      96
58      96
60      94
62      77
63      68
65      57
66      57
64      55
67      30
68      27
69      19
Name: age, dtype: int64

In [40]:
cleaned_BT_df.shape

(25124, 8)

# Testing with Target Data = Body Types

In [41]:
# Separate the dataset into data and target
X = cleaned_BT_df.drop(['body_type'], axis=1)
y = cleaned_BT_df['body_type']

In [42]:
# Do get dummies for data and label encoder for target
X_dummies = pd.get_dummies(X)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
y_label = LabelEncoder().fit_transform(cleaned_BT_df['body_type'])
y_label

array([1, 0, 0, ..., 2, 0, 2])

In [43]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label,test_size=.5)

In [44]:
# Test with Random Forest to see how well it works

RF_clf = RandomForestClassifier(n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {RF_clf.score(X_train, y_train)}')
print(f'Testing Score: {RF_clf.score(X_test, y_test)}')

Training Score: 0.7518707212227352
Testing Score: 0.4005731571405827


With Random Forest, the Training went pretty well, but the resulting testing score was horrible. Unsure if it's because of the data or if it's because of the model.
We'll test it with other models and see how those results pan out.

In [45]:
# Test with Logistic Regression

from sklearn.linear_model import LogisticRegression
LR_clf = LogisticRegression()

LR_clf.fit(X_train, y_train)

print(f"Training Data Score: {LR_clf.score(X_train, y_train)}")
print(f"Testing Data Score: {LR_clf.score(X_test, y_test)}")

Training Data Score: 0.45820729183251074
Testing Data Score: 0.45597834739691134


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


With Logistic Regression, even though there's more than one target data, just to see how it fares, it results in both training and testing being giving horrible results. This is most likely due to the model being better suited to work with target datas with two values (i.e. "Yes" & "No")

For now, we'll try using Neural Network.

In [46]:
# Create scaler instance
import sklearn as skl
import tensorflow as tf

X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [47]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

In [48]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=X_train_scaled.shape[1]))

In [49]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="softmax"))

activation = softmax
Softmax is like Sigmoid but for more than 2 categories.
Sigmoid is for predicting 0 and 1 values.

In [50]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5)                 125       
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________


loss = categorical_crossentropy
categorical is for more than 2 values

In [51]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

So even with the Neural Networking, the score plateau'd and stayed consistently at low accuracy. This is looking more like an issue with the data rather than the models since it's bad with each selected model so far.

To switch things up, instead of body types, I'll switch the target data to be the Sex instead.

In [62]:
# Separate the dataset into data and target
X = cleaned_BT_df.drop(['sex'], axis=1)
y = cleaned_BT_df['sex']

In [63]:
# Do get dummies for data and label encoder for target
X_dummies = pd.get_dummies(X)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
y_label = LabelEncoder().fit_transform(cleaned_BT_df['sex'])
y_label

array([1, 1, 0, ..., 1, 1, 1])

In [64]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label,test_size=.5)

In [65]:
# Test with Random Forest to see how well it works

RF_clf = RandomForestClassifier(n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {RF_clf.score(X_train, y_train)}')
print(f'Testing Score: {RF_clf.score(X_test, y_test)}')

Training Score: 0.9369527145359019
Testing Score: 0.8183410284986468


Surprisingly, the accuracy is great when the models are subjected to pick either Male or Female. Perhaps it's because of it's requiring to pick between two values instead of six? For consistency sake, I'll rerun the other models as well to see if RandomForest's good scores were a fluke.

In [66]:
# Test with Logistic Regression

from sklearn.linear_model import LogisticRegression
LR_clf = LogisticRegression()

LR_clf.fit(X_train, y_train)

print(f"Training Data Score: {LR_clf.score(X_train, y_train)}")
print(f"Testing Data Score: {LR_clf.score(X_test, y_test)}")

Training Data Score: 0.8364113994586849
Testing Data Score: 0.8420633657060977


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
# Create scaler instance
import sklearn as skl
import tensorflow as tf

X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [68]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

In [69]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=X_train_scaled.shape[1]))

In [70]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

In [71]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 5)                 145       
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
Total params: 151
Trainable params: 151
Non-trainable params: 0
_________________________________________________________________


In [72]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


With our target data being Sex, we managed to have a higher accuracy score for all three models. It's interesting since this having less target values to predict might be better, in correlation with the data we have. With having more target values, in this case the body types, the models we selected did poorly regarding predicting the correct body types. In the future, we might test with more models to see if the there are any that are more compatible with our data, or if we can clean this to be more concise so we can introduce more variables that may help with future predictions.