In [117]:
import numpy as np
import pandas as pd


In [118]:
df = pd.read_csv("/kaggle/input/cleaned-online-sex-work/cleaned_online_sex_work.csv")

In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28831 entries, 0 to 28830
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   User_ID                              28831 non-null  int64  
 1   Female                               28831 non-null  bool   
 2   Age                                  28831 non-null  float64
 3   Location                             28831 non-null  object 
 4   Verification                         28831 non-null  bool   
 5   Heterosexual                         28831 non-null  int64  
 6   Homosexual                           28831 non-null  int64  
 7   bicurious                            28831 non-null  int64  
 8   bisexual                             28831 non-null  int64  
 9   Dominant                             28831 non-null  int64  
 10  Submisive                            28831 non-null  int64  
 11  Switch                      

In [120]:
truth_map = {True: 1, False: 0}

df['Female'] = df['Female'].map(truth_map)
df['Verification'] = df['Verification'].map(truth_map)

#too many categories, so we take the top 10 and make dummies
top_10 = df['Location'].value_counts().head(10).index.to_list()

def dummies(d, var, top10):
    for label in top10:
        d[var+'_'+label] = np.where(df[var] == label, 1, 0)

dummies(df, 'Location', top_10)
df.drop('Location', axis = 1, inplace = True)

df.drop(['User_ID', 'Friends_ID_list'], axis = 1, inplace=True)

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28831 entries, 0 to 28830
Data columns (total 37 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Female                               28831 non-null  int64  
 1   Age                                  28831 non-null  float64
 2   Verification                         28831 non-null  int64  
 3   Heterosexual                         28831 non-null  int64  
 4   Homosexual                           28831 non-null  int64  
 5   bicurious                            28831 non-null  int64  
 6   bisexual                             28831 non-null  int64  
 7   Dominant                             28831 non-null  int64  
 8   Submisive                            28831 non-null  int64  
 9   Switch                               28831 non-null  int64  
 10  Men                                  28831 non-null  int64  
 11  Men_and_Women               

In [122]:
from tensorflow import keras
import keras_tuner as kt

In [123]:
model = keras.models.Sequential()

In [124]:
l_df = df.dropna()

In [125]:
def build_model(hp):
    model = keras.Sequential()
    model.add(keras.layers.Flatten())
    # Tune the number of layers.
    for i in range(hp.Int("num_layers", 1, 10)):
        model.add(
            keras.layers.Dense(
                # Tune number of units separately.
                units=hp.Int(f"units_{i}", min_value=10, max_value=400, step=10),
                activation=hp.Choice("activation", ["relu", "tanh", "swish"]),
            )
        )
    if hp.Boolean("dropout"):
        model.add(keras.layers.Dropout(rate=0.20))
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


model = build_model(kt.HyperParameters())

In [126]:
def get_best_model(tuner):
    models = tuner.get_best_models(num_models=2)
    best_model = models[0]
    best_model.build(input_shape=(None, 18))
    return best_model

In [127]:
  def define_tuner():
    return kt.RandomSearch(
        hypermodel=build_model,
        objective="val_accuracy",
        max_trials=12,
        executions_per_trial=1,
        overwrite=True,
        directory="my_dir",
        project_name="helloworld",
    )

In [128]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def scaled_splits(df):
    X = df.drop(['Risk'], axis =1)
    y = df['Risk'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 50)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    y_train = y_train.reshape(y_train.shape[0], 1)
    return X_train, X_test, y_train, y_test

In [129]:
def view_splits(df):
    X = df.drop('Risk', axis = 1)
    l = list(X.columns)
    np.random.shuffle(l)
    return l[:18], l[18:]

In [130]:
x1, x2 = view_splits(df)
v1 = df[x1+['Risk']]
v2 = df[x2+['Risk']]

In [131]:
x1, x2

(['Men',
  'Location_L',
  'Number_of_Comments_in_public_forum',
  'Location_G',
  'Number of Friends',
  'bicurious',
  'Location_B',
  'bisexual',
  'Location_K',
  'Location_H',
  'Men_and_Women',
  'Nobody_but_maybe',
  'Member_since_month',
  'Homosexual',
  'Age',
  'Location_E',
  'Dominant',
  'Member_since_year'],
 ['Last_login',
  'Verification',
  'Nobody',
  'Time_spent_chating_H:M',
  'Number_of_offline_meetings_attended',
  'Member_since_day',
  'Heterosexual',
  'Female',
  'Location_A',
  'Profile_pictures',
  'Points_Rank',
  'Women',
  'Location_C',
  'Switch',
  'Submisive',
  'Location_M',
  'Location_O',
  'Number_of_advertisments_posted'])

In [132]:
v1_l = v1.dropna()
v2_l = v2.dropna()
v1_u = v1[v1['Risk'].isna()]
v2_u = v2[v2['Risk'].isna()]

In [133]:
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

In [134]:
#gets pseudo-labels based on partially-trained model

def get_pseudo_labels(model, unlabelled_df):
    return model.predict(unlabelled_df)

In [135]:
# filters prediction based on how confident the model is in predicting the output

def get_confident_labels(pseudolabels):
    confident_labels = []
    confidence_level = np.percentile(pseudolabels, 95)
    for i in pseudolabels:
        value = i[0]
        if value >= confidence_level:
            confident_labels.append([1, 1])
        elif value <= 1-confidence_level:
            confident_labels.append([0, 1])
        else:
            confident_labels.append([value, 0])
    return pd.DataFrame(confident_labels, columns=["Risk", "confident"])


In [136]:
def co_training(labelled_view1, unlabelled_view1, labelled_view2, unlabelled_view2, current_model1 = None, current_model2 = None):
    
    if len(unlabelled_view1) > 0 and len(unlabelled_view2) > 0:
        print(0)
        v1_l_xtrain, v1_l_xtest, v1_l_ytrain, v1_l_ytest = scaled_splits(labelled_view1)
        tuner1 = define_tuner()
        tuner1.search(v1_l_xtrain, v1_l_ytrain, epochs = 20, validation_data = (v1_l_xtest,v1_l_ytest))
        best_hps1 = tuner1.get_best_hyperparameters(5)
        model1 = build_model(best_hps1[0])

        unlabelled_view1.drop('Risk', axis = 1, inplace = True)
        pseudolabels1 = get_pseudo_labels(model1, unlabelled_view1)

        confidence_df1 = get_confident_labels(pseudolabels1)
        u1_conf = pd.concat([confidence_df1.reset_index(), unlabelled_view1.reset_index()], axis = 1)
        conf1 = u1_conf[u1_conf['confident'] == 1]
        unconf1 = u1_conf[u1_conf['confident'] == 0]
        u1_conf_indexes = conf1['index'].values[:,-1]

        v2_l_xtrain, v2_l_xtest, v2_l_ytrain, v2_l_ytest = scaled_splits(labelled_view2)
        tuner2 = define_tuner()
        tuner2.search(v2_l_xtrain, v2_l_ytrain, epochs = 20, validation_data = (v2_l_xtest,v2_l_ytest))
        best_hps2 = tuner2.get_best_hyperparameters(5)
        model2 = build_model(best_hps2[0])

        unlabelled_view2.drop('Risk', axis = 1, inplace = True)
        pseudolabels2 = get_pseudo_labels(model2, unlabelled_view2)

        confidence_df2 = get_confident_labels(pseudolabels2)
        u2_conf = pd.concat([confidence_df2.reset_index(), unlabelled_view2.reset_index()], axis = 1)
        conf2 = u2_conf[u2_conf['confident'] == 1]
        unconf2 = u2_conf[u2_conf['confident'] == 0]
        u2_conf_indexes = conf2['index'].values[:,-1]

        confident_labels1 = pd.concat([v1_u.loc[u2_conf_indexes].reset_index(),conf2['Risk'].reset_index()], axis = 1)
        unconfident_labels1 = u1_conf[u1_conf['confident'] == 0]

        confident_labels2 = pd.concat([v2_u.loc[u1_conf_indexes].reset_index(),conf1['Risk'].reset_index()], axis = 1)
        unconfident_labels2 = u2_conf[u2_conf['confident'] == 0]

        new_unlabelled_view1 = unconf1.drop(['confident', 'index'], axis = 1)
        new_unlabelled_view2 = unconf2.drop(['confident', 'index'], axis = 1)

        new_labelled_view1 = pd.concat([labelled_view1, confident_labels1.drop(['index'], axis = 1)], axis = 0)
        new_labelled_view2 = pd.concat([labelled_view2, confident_labels2.drop(['index'], axis = 1)], axis = 0)
        
        return {'v1_l': new_labelled_view1,
                'v1_u': new_unlabelled_view1.reset_index(),
                'v2_l': new_labelled_view2,
                'v2_u': new_unlabelled_view2.reset_index(),
                'model1': model1,
                'model2': model2}
    
    elif len(unlabelled_view1 > 0) and len(unlabelled_view2 == 0):
        print(1)
        xtrain, xtest, ytrain, ytest = scaled_splits(labelled_view1)
        tuner = define_tuner()
        tuner.search(xtrain, ytrain, epochs = 20, validation_data = (xtest,ytest), callbacks=[keras.callbacks.TensorBoard("/tmp/tb_logs")],)
        best_hps = tuner.get_best_hyperparameters(5)
        model = build_model(best_hps[0])

        unlabelled_view1 = unlabelled_view1.drop('Risk', axis = 1)
        pseudolabels = get_pseudo_labels(model, unlabelled_view1)

        confidence_df = get_confident_labels(pseudolabels)
        u_conf = pd.concat([confidence_df.reset_index(), unlabelled_view1.reset_index()], axis = 1)

        unconfident_rows = u_conf[u_conf['confident'] == 0]
        confident_rows = u_conf[u_conf['confident'] == 1]
        unconfident_rows.drop('index', axis = 1, inplace = True)
        confident_rows.drop('index', axis = 1, inplace = True)

        new_labelled_view1 = pd.concat([labelled_view1, confident_rows.drop(['confident'], axis = 1)], axis = 0)
        new_unlabelled_view1 = unconfident_rows.drop('confident', axis = 1)
        
        new_labelled_view2 = labelled_view2
        new_unlabelled_view2 = unlabelled_view2
        
        xtrain, xtest, ytrain, ytest = scaled_splits(labelled_view2)
        tuner = define_tuner()
        tuner.search(xtrain, ytrain, epochs = 20, validation_data = (xtest,ytest), callbacks=[keras.callbacks.TensorBoard("/tmp/tb_logs")],)
        best_hps = tuner.get_best_hyperparameters(5)
        model2 = build_model(best_hps[0])
        
        return {'v1_l': new_labelled_view1,
                'v1_u': new_unlabelled_view1.reset_index(),
                'v2_l': new_labelled_view2,
                'v2_u': new_unlabelled_view2.reset_index(),
                'model1': model1,
                'model2': model2}
    
    elif len(unlabelled_view1 == 0) and len(unlabelled_view2 > 0):
        print(2)
        xtrain, xtest, ytrain, ytest = scaled_splits(labelled_view2)
        tuner = define_tuner()
        tuner.search(xtrain, ytrain, epochs = 20, validation_data = (xtest,ytest), callbacks=[keras.callbacks.TensorBoard("/tmp/tb_logs")],)
        best_hps = tuner.get_best_hyperparameters(5)
        model2 = build_model(best_hps[0])

        unlabelled_view2 = unlabelled_view2.drop('Risk', axis = 1)
        pseudolabels = get_pseudo_labels(model2, unlabelled_view2)

        confidence_df = get_confident_labels(pseudolabels)
        u_conf = pd.concat([confidence_df.reset_index(), unlabelled_view2.reset_index()], axis = 1)

        unconfident_rows = u_conf[u_conf['confident'] == 0]
        confident_rows = u_conf[u_conf['confident'] == 1]
        unconfident_rows.drop('index', axis = 1, inplace = True)
        confident_rows.drop('index', axis = 1, inplace = True)

        new_labelled_view2 = pd.concat([labelled_view2, confident_rows.drop(['confident'], axis = 1)], axis = 0)
        new_unlabelled_view2 = unconfident_rows.drop('confident', axis = 1)
        
        new_labelled_view1 = labelled_view1
        new_unlabelled_view1 = unlabelled_view1
        
        xtrain, xtest, ytrain, ytest = scaled_splits(labelled_view1)
        tuner = define_tuner()
        tuner.search(xtrain, ytrain, epochs = 20, validation_data = (xtest,ytest), callbacks=[keras.callbacks.TensorBoard("/tmp/tb_logs")],)
        best_hps = tuner.get_best_hyperparameters(5)
        model1 = build_model(best_hps[0])
        
        return {'v1_l': new_labelled_view1,
                'v1_u': new_unlabelled_view1.reset_index(),
                'v2_l': new_labelled_view2,
                'v2_u': new_unlabelled_view2.reset_index(),
                'model1': model1,
                'model2': model2}
    else:
        print(3)
        return {
            'model1': current_model1,
            'model2': current_model2
        } 
    

In [137]:
iter1 = co_training(v1_l, v1_u, v2_l, v2_u)

Trial 12 Complete [00h 00m 01s]
val_accuracy: 0.8333333134651184

Best val_accuracy So Far: 0.8333333134651184
Total elapsed time: 00h 00m 21s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [138]:
iter1['v1_l']

Unnamed: 0,Men,Location_L,Number_of_Comments_in_public_forum,Location_G,Number of Friends,bicurious,Location_B,bisexual,Location_K,Location_H,Men_and_Women,Nobody_but_maybe,Member_since_month,Homosexual,Age,Location_E,Dominant,Member_since_year,Risk
0,1,0,32,0,1,0,0,0,0,0,0,0,9,1,34.6,0,0,2012,0.0
1,0,0,710,0,7,0,0,0,0,0,0,0,11,0,32.2,0,1,2009,0.0
2,0,0,25,0,3,0,0,0,1,0,0,0,4,0,33.6,0,1,2013,0.0
3,0,0,107,0,12,0,0,0,0,1,0,0,4,0,34.0,0,1,2013,0.0
4,0,0,600,0,35,0,1,0,0,0,0,0,4,0,39.5,0,1,2013,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21956,0,0,0,0,1,0,0,1,0,0,1,0,9,0,32.4,0,1,2012,0.0
21957,0,0,0,0,13,0,0,0,0,0,0,0,9,0,58.5,0,1,2012,1.0
21958,0,0,0,0,1,0,0,0,0,0,0,0,9,0,50.5,0,0,2012,0.0
21959,0,0,0,0,1,1,0,0,0,0,1,0,9,0,46.4,0,0,2012,0.0


In [139]:
iter1['v2_l']

Unnamed: 0,Last_login,Verification,Nobody,Time_spent_chating_H:M,Number_of_offline_meetings_attended,Member_since_day,Heterosexual,Female,Location_A,Profile_pictures,Points_Rank,Women,Location_C,Switch,Submisive,Location_M,Location_O,Number_of_advertisments_posted,Risk
0,10,0,0,2,0,17,0,0,1,0,50,0,0,1,0,0,0,0,0.0
1,1,0,0,225,0,1,1,0,0,0,518,1,0,0,0,0,0,9,0.0
2,3,0,0,135,1,1,1,0,0,45,150,1,0,0,0,0,0,1,0.0
3,4,0,0,21562,0,8,1,0,0,1,114,1,0,0,0,0,0,1,0.0
4,5,0,0,21,6,14,1,0,0,8,497,1,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,1,1,0,236075,5,30,0,1,1,2,586,0,0,0,1,0,0,0,1.0
1438,1,1,0,67099,0,2,1,0,0,0,75,1,0,0,0,0,0,0,1.0
1439,669,0,0,0,0,22,0,0,0,0,0,0,0,0,1,0,0,0,1.0
1440,1,0,0,129591,4,23,0,1,1,1,83,0,0,0,1,0,0,1,1.0


In [140]:
iter1['v1_u']

Unnamed: 0,index,Risk,Men,Location_L,Number_of_Comments_in_public_forum,Location_G,Number of Friends,bicurious,Location_B,bisexual,Location_K,Location_H,Men_and_Women,Nobody_but_maybe,Member_since_month,Homosexual,Age,Location_E,Dominant,Member_since_year
0,0,0.999194,0,0,0,0,1,0,0,0,0,0,0,0,9,0,42.9,0,1,2012
1,1,0.999169,0,0,0,0,1,0,1,0,0,0,0,0,9,0,47.4,0,1,2012
2,2,0.999228,0,0,0,0,1,0,0,0,0,0,0,1,9,0,35.2,0,1,2012
3,3,0.999246,0,0,0,0,1,0,1,0,0,0,0,0,9,0,28.6,0,0,2012
4,4,0.999141,0,1,0,0,1,0,0,0,0,0,0,0,9,0,56.6,0,1,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27294,28736,0.999257,0,0,0,0,1,0,0,0,0,0,0,0,9,0,27.2,0,0,2012
27295,28737,0.999176,0,0,0,0,1,1,0,0,0,0,1,0,9,0,46.4,0,0,2012
27296,28738,0.999251,0,0,0,0,1,0,0,0,0,0,0,0,9,0,28.8,0,0,2012
27297,28739,0.999244,1,0,0,0,1,0,0,1,0,0,0,0,9,0,31.5,0,0,2012


In [141]:
iter1['v2_u']

Unnamed: 0,index,Risk,Last_login,Verification,Nobody,Time_spent_chating_H:M,Number_of_offline_meetings_attended,Member_since_day,Heterosexual,Female,Location_A,Profile_pictures,Points_Rank,Women,Location_C,Switch,Submisive,Location_M,Location_O,Number_of_advertisments_posted
0,2,0.108086,49,0,0,1,0,6,1,0,0,0,0,0,0,0,0,0,0,0
1,8,0.808544,37,0,1,0,0,6,0,0,0,0,43,0,0,0,1,0,0,1
2,9,0.586407,5,0,1,0,0,6,1,0,0,0,15,0,0,0,1,1,0,2
3,10,0.370947,3,0,1,0,0,6,1,0,0,0,0,0,0,0,0,1,0,0
4,33,0.355926,8,0,0,0,0,6,1,0,1,0,0,1,0,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6775,28731,0.357027,12,0,0,0,0,5,1,0,1,0,0,1,0,0,1,0,0,1
6776,28735,0.373650,42,0,0,0,0,5,1,0,1,0,15,1,0,0,0,0,0,0
6777,28736,0.249268,23,0,0,3,0,5,1,0,0,0,0,1,0,1,0,0,0,1
6778,28738,0.511215,4,0,0,0,0,5,1,0,0,0,15,1,1,0,1,0,0,0


In [None]:
iter2 = co_training(iter1['v1_l'], iter1['v1_u'], iter1['v2_l'], iter1['v2_u'], iter1['model1'], iter2['model1'])

0

Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
8                 |?                 |num_layers
60                |?                 |units_0
swish             |?                 |activation
True              |?                 |dropout
0.00033775        |?                 |lr

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
iter2