In [1]:
import os
import tqdm
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xlearn as xl

from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
seed = 42
np.random.seed(seed)

pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',None)

ModuleNotFoundError: ignored

In [0]:
data = pd.read_csv("data/adult.data")
data.columns = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','target']
# data.drop(columns=['fnlwgt'], inplace=True)
print(data.shape)
data.head()

(32560, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [0]:
data['target'].value_counts()/data.shape[0]

 <=50K    0.759183
 >50K     0.240817
Name: target, dtype: float64

### PREPROCESSING

Some preprocessing

### DROPPING THE NA VALUES

In [0]:
numerical_features = ['age','education-num','capital-gain','capital-loss','hours-per-week', 'fnlwgt']
categorical_features = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
print(len(numerical_features), len(categorical_features))

for f in categorical_features:
    data[f] = data[f].str.strip()

target_encoder = {data.target.unique()[0]:0,data.target.unique()[1]:1}
data['target'] = data['target'].map(target_encoder)

print(data.shape)
data.head()

6 8
(32560, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [0]:
x, y = train_test_split(data, test_size=0.2, random_state=seed, stratify=data.target.values)
x, z = train_test_split(x, test_size=0.1, random_state=seed, stratify=x.target.values)

train_indices = x.index.values
val_indices = z.index.values
test_indices = y.index.values
print("length of train data",len(x))
print("length of val data",len(z))
print("length of test data",len(y))

length of train data 23443
length of val data 2605
length of test data 6512


## CONVERTING THE DATA TO FFM NEEDED FORMAT

In [0]:
def convert_to_ffm(df,name,numerics,categories,features):
    currentcode = len(numerics)
    catdict = {}
    catcodes = {}
    
    # Flagging categorical and numerical fields
    for x in numerics:
         catdict[x] = 0
    for x in categories:
         catdict[x] = 1
    
    nrows = df.shape[0]
    ncolumns = len(features)
    
    if os.path.isfile("ffmdata/"+str(name) + "_ffm_all_feats_numerical_dummy.txt"):
        os.remove("ffmdata/"+str(name) + "_ffm_all_feats_numerical_dummy.txt")
    
    with open("ffmdata/"+str(name) + "_ffm_all_feats_numerical_dummy.txt", "w") as text_file:
    
    # Looping over rows to convert each row to libffm format
        for n, r in tqdm.tqdm(enumerate(range(nrows))):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow['target'])) # Set Target Variable here
             
            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(catdict.keys()):
                if(catdict[x]==0):
                    datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[x])
                else:
                    # For a new field appearing in a training example
                    if(x not in catcodes):
                        catcodes[x] = {}
                        currentcode +=1
                        catcodes[x][datarow[x]] = currentcode #encoding the feature
                    # For already encoded fields 
                    elif(datarow[x] not in catcodes[x]):
                        currentcode +=1
                        catcodes[x][datarow[x]] = currentcode #encoding the feature
                     
                    code = catcodes[x][datarow[x]]
                    datastring = datastring + " "+str(i)+":"+ str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)

In [0]:
# saving the files
features = numerical_features+categorical_features
convert_to_ffm(data.iloc[train_indices,:].reset_index(drop=True), 'train', numerical_features, categorical_features, features)
convert_to_ffm(data.iloc[val_indices,:].reset_index(drop=True), 'val', numerical_features, categorical_features, features)
convert_to_ffm(data.iloc[test_indices,:].reset_index(drop=True), 'test', numerical_features, categorical_features, features)

23443it [00:05, 4391.51it/s]
2605it [00:00, 4475.66it/s]
6512it [00:01, 4439.67it/s]


### FFM

In [0]:
T1 = datetime.datetime.now()
ffm = xl.create_ffm()

ffm.setTrain("ffmdata/train_ffm_all_feats_numerical_dummy.txt")
ffm.setValidate("ffmdata/val_ffm_all_feats_numerical_dummy.txt")

params = {
          'task':'binary',
          'metric':'f1',
          'init':0.1,
          'epoch':30,
          'k':8,
          'lr':2e-2,
          'lambda':2e-5,
          'opt':'sgd'
         }

print("Model Fitting")
ffm.fit(params, 'ffmdata/model.out')

# predicting on validation
y_val = data.iloc[val_indices,:]['target'].values
ffm.setTest("ffmdata/val_ffm_all_feats_numerical_dummy.txt")  # Test data
ffm.setSigmoid()  # Convert output to 0-1

ffm.predict("ffmdata/model.out", "ffmdata/val_output.txt")
ffm_val_preds = np.loadtxt("ffmdata/val_output.txt")
os.remove("ffmdata/val_output.txt")

# predicting on test
y_test = data.iloc[test_indices,:]['target'].values
ffm.setTest("ffmdata/test_ffm_all_feats_numerical_dummy.txt")
ffm.setSigmoid() # convert output to 0-1

ffm.predict("ffmdata/model.out", "ffmdata/test_output.txt")
ffm_test_preds = np.loadtxt("ffmdata/test_output.txt")
os.remove("ffmdata/test_output.txt")

print("-----FFM-----")
print("log loss on validation data",log_loss(y_val, ffm_val_preds))
print("log loss on test data",log_loss(y_test, ffm_test_preds))
print("F1 score on validation data",f1_score(y_val, (ffm_val_preds>0.5).astype(int)))
print("F1 score on test data",f1_score(y_test, (ffm_test_preds>0.5).astype(int)))
T2 = datetime.datetime.now()
print("Seconds =",round((T2-T1).total_seconds()))

Model Fitting
-----FFM-----
log loss on validation data 0.5077484639277662
log loss on test data 0.5103371674572588
F1 score on validation data 0.30486202365308807
F1 score on test data 0.30220356768100737
Seconds = 1
