# Mini-Data Set Preparation

After the Kaggle Script "Making a mini-data set" is run (FYI, it takes about 2 minutes to run) to reduce the size of the data to 40,000 instances, run this script to organize data into a single dataframe. 

Run this with the 8 csv files produced by the Kaggle Script in the same directory. 

Note: This is a Python3 script because that is what Kaggle uses. 

In [2]:
import pandas as pd
import numpy as np
import copy
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [5]:
clicks_train = pd.read_csv("../input/clicks_train.csv")
promoted_content = pd.read_csv("../input/promoted_content.csv")
doc_cats = pd.read_csv("../input/documents_categories.csv")
doc_ents = pd.read_csv("../input/documents_entities.csv")
doc_meta = pd.read_csv("../input/documents_meta.csv")
doc_topics = pd.read_csv("../input/documents_topics.csv")
events = pd.read_csv("../input/events.csv")
page_views = pd.read_csv("../input/page_views_sample.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
#LISA/NIKKi: 

# reverse engineer - get document_ids that are in page_views AND promoted_content
# get ads that are in those documents
# get display ids from clicks_train that contain those ads



display_sample = np.random.choice(clicks_train["display_id"].unique(), 4000) # change this if too many rows
clicks_train = clicks_train[clicks_train["display_id"].isin(display_sample)]
# select 4000 random display id's and grab all rows in click_train with that display
# every display has multiple ads and only 1 ad in every display is clicked

promoted = promoted_content[promoted_content["ad_id"].isin(clicks_train["ad_id"])]
# same ad can show up in multiple displays, so length of unique ads < length of unique displays

doc_cats = doc_cats[doc_cats["document_id"].isin(promoted["document_id"])]

doc_ents = doc_ents[doc_ents["document_id"].isin(promoted["document_id"])]

doc_meta = doc_meta[doc_meta["document_id"].isin(promoted["document_id"])]

doc_topics = doc_topics[doc_topics["document_id"].isin(promoted["document_id"])]

events = events[events["display_id"].isin(clicks_train["display_id"])]

page_views = page_views[page_views["document_id"].isin(promoted["document_id"])]
# platform & traffic source need to be either all integers or all strings (right now its mixed)



In [None]:
# Jay's code start here...
%matplotlib inline

import scipy as sp
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import SparsePCA

from sklearn.cluster import AgglomerativeClustering 
from sklearn.cluster import KMeans 

In [113]:

test = pd.read_csv('../input/clicks_test.csv')

# Group the ads and count up how many views and clicks
ad_likelihood = clicks_train.groupby('ad_id')['clicked'].agg(['count','sum']).reset_index()

# Calculate the overall average number of clicks
mean_clicked = clicks_train.clicked.mean()
print("overall average click-rate:", mean_clicked)
# del clicks_train

# Calculate the likelihood of an ad click
ad_likelihood['likelihood'] = (ad_likelihood['sum']) / (ad_likelihood['count'] + 1)

# Left join the ad likelihoods into the training set by ad_id
test = test.merge(ad_likelihood,how='left')

# Fill in the blanks with the overall average likelihood
test.fillna(mean_clicked,inplace=True)

# Sort the rows by the likelihood of each ad_id within each display_id
test.sort_values(['display_id','likelihood'],inplace=True,ascending=False)

print("\nThis is our prior expectation for the ad click likelihoods based on overall click-rates...")
print(test.head(20))

# Format the data the way the submission requires
output=test.groupby(['display_id'])['ad_id'].apply(lambda x:' '.join(map(str,x))).reset_index()

# That's it for the simple solution (prior expectation)!
output.to_csv('simplesolution3.cvs',index=False)


overall average click-rate: 0.1938454082868912

This is our prior expectation for the ad click likelihoods based on overall click-rates.
          display_id   ad_id      count       sum  likelihood
32225159    23120126  145293   8.000000  2.000000    0.222222
32225160    23120126  201649  21.000000  1.000000    0.045455
32225158    23120126   57097   6.000000  0.000000    0.000000
32225161    23120126  292363   5.000000  0.000000    0.000000
32225157    23120125  254698   0.193845  0.193845    0.193845
32225154    23120125   86281   3.000000  0.000000    0.000000
32225155    23120125   89725   2.000000  0.000000    0.000000
32225156    23120125  187797  21.000000  0.000000    0.000000
32225150    23120124   83252   0.193845  0.193845    0.193845
32225153    23120124  489589   0.193845  0.193845    0.193845
32225152    23120124  173403  24.000000  4.000000    0.160000
32225151    23120124  162994   2.000000  0.000000    0.000000
32225146    23120123  174045   0.193845  0.193845    0.19

In [114]:
print("This is the training data...")
clicks_train.head()

This is the training data...


Unnamed: 0,display_id,ad_id,clicked
1036,200,257326,1
1037,200,281284,0
1038,200,292370,0
1039,200,296112,0
1040,200,303522,0


In order to use SVD, we need to reformat that data such that each ad_id is a column and each row is a different display_id.

In [115]:
#This transforms the training data into the format we need!
SVDtable = clicks_train.pivot(index='display_id', columns='ad_id')

#Give the NAs a -1 value for now.  Can replace later as necessary.
SVDtable = SVDtable.fillna('-1')
print(SVDtable.head())

# Converting to a np.array is tricky, so let's put it in a csv first and then re-import it.
SVDtable.to_csv('diditwork.csv',index=True, column_names=False)

           clicked                                                          \
ad_id       7      65     164    489    656    752    801    845    863      
display_id                                                                   
200             -1     -1     -1     -1     -1     -1     -1     -1     -1   
372             -1     -1     -1     -1     -1     -1     -1     -1     -1   
9287            -1     -1     -1     -1     -1     -1     -1     -1     -1   
10823           -1     -1     -1     -1     -1     -1     -1     -1     -1   
14190           -1     -1     -1     -1     -1     -1     -1     -1     -1   

                   ...                                                     \
ad_id      984     ...   544417 544426 544474 545217 545470 545719 546316   
display_id         ...                                                      
200            -1  ...       -1     -1     -1     -1     -1     -1     -1   
372            -1  ...       -1     -1     -1     -1     -1     -1 

In [116]:
# re-import as numpy array for SVD spooky stuff
ff = "diditwork.csv" 
f = open(ff)

junkline = f.readline() # says "clicked" a bunch of times
column_names = f.readline() 
# print(column_names)
anotherjunkline = f.readline() #has "display_id" header with a bunch of NaN entries...

def converter(x):
    if x == -1:
        return np.nan # Why doesn't this work?  -1s show up everywhere
    else:
        return float(x)
    
data = np.loadtxt(f, delimiter=',', converters={1:converter})

print(data[:])


[[  2.00000000e+02  -1.00000000e+00  -1.00000000e+00 ...,  -1.00000000e+00
   -1.00000000e+00  -1.00000000e+00]
 [  3.72000000e+02  -1.00000000e+00  -1.00000000e+00 ...,  -1.00000000e+00
   -1.00000000e+00  -1.00000000e+00]
 [  9.28700000e+03  -1.00000000e+00  -1.00000000e+00 ...,  -1.00000000e+00
   -1.00000000e+00  -1.00000000e+00]
 ..., 
 [  1.68456090e+07  -1.00000000e+00  -1.00000000e+00 ...,  -1.00000000e+00
   -1.00000000e+00  -1.00000000e+00]
 [  1.68659740e+07  -1.00000000e+00  -1.00000000e+00 ...,  -1.00000000e+00
   -1.00000000e+00  -1.00000000e+00]
 [  1.68687850e+07  -1.00000000e+00  -1.00000000e+00 ...,  -1.00000000e+00
   -1.00000000e+00  -1.00000000e+00]]


In [143]:
# Let's set the NAs to 0 for now to match the joke recommender sample code...
data[:, 1:][data[:, 1:] == -1] = 0 # np.nan

print(data[:])
print(data.shape)
print(type(data))

# only one click each row!?
print(np.sum(data[0][1:]))
print(np.sum(data[1][1:]))
print(np.sum(data[2][1:]))
print(np.sum(data[3][1:]))
print(np.sum(data[4][1:]))


[[  2.00000000e+02   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  3.72000000e+02   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  9.28700000e+03   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [  1.68456090e+07   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  1.68659740e+07   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  1.68687850e+07   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
(4000, 9946)
<class 'numpy.ndarray'>
1.0
1.0
1.0
1.0
1.0


In [118]:
# full svd; note that the first column is dropped since it's the display_id
u, s, v = np.linalg.svd(data[:, 1:], full_matrices=False)

print(u.shape)
print(s.shape)
print(v.shape)

# these are (nearly) the same, that's the decomposition!
print(data[:, 1:])

print(np.dot(np.dot(u, np.diag(s)), v)) # the full reconstruction


(4000, 4000)
(4000,)
(4000, 9945)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [141]:
n_comp = 10

# make a new s vector with only the first few values kept
s_padded = np.zeros(len(s))
s_padded[:n_comp] = s[:n_comp]

filled_in = np.dot(np.dot(u, np.diag(s_padded)), v)

# Okay, this is bad.  Why are the recommendations all zero?  Too sparse??
print(filled_in)

# No recommendations!!!
print(np.sum(filled_in[0][:]))
print(np.sum(filled_in[1][:]))
print(np.sum(filled_in[2][:]))
print(np.sum(filled_in[3][:]))

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
0.0
0.0
0.0
0.0


In [None]:

#Jay's code ends here

## Join clicks_train and events on display_id

In [7]:
data = clicks_train.join(events)
data.head()

ValueError: columns overlap but no suffix specified: Index(['display_id'], dtype='object')

## Promoted

In [7]:
len(promoted)

6042

In [8]:
#there is not a one-to-one relationship between document_id in promoted and the master data
#This is because the same ad is being shown in different documents I think
print(len(promoted["document_id"].unique()))
print(len(data["document_id"].unique()))

4230
7240


In [9]:
#there is a one-to-one relationship between ad_id in promoted and the master data
print(len(promoted["ad_id"].unique())) #each add can appear more than once
print(len(data["ad_id"].unique()))

6042
6042


In [10]:
promoted.head()
del promoted["Unnamed: 0"]
del promoted['document_id'] #I think all we want from here is the link between ad_id and campaign id
promoted.head()

Unnamed: 0,ad_id,campaign_id,advertiser_id
0,489,81,116
1,844,37,63
2,971,28,45
3,1117,205,134
4,1126,209,295


## Joining Info about each ad

I make a dictionary of the advertiser and campaign id for each ad_id, map that dictionary to the ad id to make the advertizer and campain columns

In [19]:
data.head()

Unnamed: 0_level_0,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8112,151682,1,715ddc85c78e39,1792396,559250,2,US>MI>563
10333,157598,0,ec3841c3250fb6,1405372,716674,2,US>MO>616
11278,103712,1,324475e5f191c8,1179111,784022,2,US>GA>524
13320,139944,1,27221a85141108,735143,926961,2,US>CA>807
13905,173393,0,9ea0907cee579e,1794259,971672,2,US>IL>602


In [12]:
print(len(data))
print(len(data["ad_id"].unique())) #adds appear on average slightly more than twice in our minidata set

10000
6042


In [13]:
#make dictionaries to look up advertizer id and campaign id for each ad_id
advertiser_dict = dict(zip(promoted.ad_id, promoted.advertiser_id))
campaign_dict = dict(zip(promoted.ad_id, promoted.campaign_id))


In [14]:
data["campaign_id"] = data["ad_id"].map(campaign_dict)
data["advertiser_id"] = data["ad_id"].map(advertiser_dict)
data.head()

Unnamed: 0_level_0,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8112,151682,1,715ddc85c78e39,1792396,559250,2,US>MI>563,19284,3283
10333,157598,0,ec3841c3250fb6,1405372,716674,2,US>MO>616,18022,2312
11278,103712,1,324475e5f191c8,1179111,784022,2,US>GA>524,5081,1759
13320,139944,1,27221a85141108,735143,926961,2,US>CA>807,17929,1782
13905,173393,0,9ea0907cee579e,1794259,971672,2,US>IL>602,21143,2876


In [15]:
print(len(data))
print(len(data["ad_id"].unique())) #adds appear on average slightly more than twice in our minidata set

10000
6042


## Working with Page Views

Add count of page views to every document in data as a feature, could tell us something about likelihood of ads being clicked

In [61]:
print(len(page_views))
print(len(page_views['document_id'].unique()))
# when we filter our data like this, we only get 23 documents from page_views_sample.. we need the full dataset


36071
23


## Importing Document Information

I'm super stuck on why all the document ids that appear in our data arent in the files with more information about each documents.

In [16]:
#Why aren't there the same number of unique documents in each of these
print(len(data["document_id"].unique()))
print(len(doc_cats["document_id"].unique()))
print(len(doc_ents["document_id"].unique()))
print(len(doc_meta["document_id"].unique()))
print(len(doc_topics["document_id"].unique()))

7240
4223
3540
4230
4154


In [7]:
#each document has multiple possible entities, categories, topics with different confidence level. 
#maybe we should just for now keep the most likely entity, topic and category? 
doc_ents.head()

Unnamed: 0,document_id,entity_id,confidence_level
72655,1807706,11d5279de0b36011773ee50f75f9b43c,0.495961
72656,1807706,358c9e8a2ff80f858f6deb063e0bcf8f,0.337521
72657,1807706,b2a032747f34ec8488b2be2ebe180d9e,0.281354
72673,1316839,e8493cde7d253c7def5b030a2c94357d,0.977918
72674,1316839,c5039d9dda9bccb8ed150d8affedd306,0.37515


In [18]:
doc_cats.head()

Unnamed: 0.1,Unnamed: 0,document_id,category_id,confidence_level
0,51976,1807706,1209,0.92
1,51977,1807706,1205,0.07
2,51990,1316839,2003,0.92
3,51991,1316839,1211,0.07
4,51994,479029,1205,0.92


In [55]:
data[data['document_id']==1405372]

Unnamed: 0_level_0,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10333,157598,0,ec3841c3250fb6,1405372,716674,2,US>MO>616


In [20]:
data["geo_location"] = data["geo_location"].apply(str)

In [21]:
data["country"] = [x[:2] for x in data["geo_location"]]
data["state"] = [x[3:5] for x in data["geo_location"]]
data["loc_num"]= [x[6:] for x in data["geo_location"]]
data["loc_num"] = data["loc_num"].map(str)
data.head()

Unnamed: 0_level_0,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id,country,state,loc_num
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8112,151682,1,715ddc85c78e39,1792396,559250,2,US>MI>563,19284,3283,US,MI,563
10333,157598,0,ec3841c3250fb6,1405372,716674,2,US>MO>616,18022,2312,US,MO,616
11278,103712,1,324475e5f191c8,1179111,784022,2,US>GA>524,5081,1759,US,GA,524
13320,139944,1,27221a85141108,735143,926961,2,US>CA>807,17929,1782,US,CA,807
13905,173393,0,9ea0907cee579e,1794259,971672,2,US>IL>602,21143,2876,US,IL,602


In [22]:
list_to_binarize = ["advertiser_id", "campaign_id", "platform", "country"]

for i in list_to_binarize:
    temp = pd.get_dummies(data[i], prefix = i)
    data = data.join(temp)
data.head()

Unnamed: 0_level_0,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id,country,...,country_TZ,country_UA,country_UG,country_US,country_UZ,country_VI,country_VN,country_ZA,country_ZM,country_ZW
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8112,151682,1,715ddc85c78e39,1792396,559250,2,US>MI>563,19284,3283,US,...,0,0,0,1,0,0,0,0,0,0
10333,157598,0,ec3841c3250fb6,1405372,716674,2,US>MO>616,18022,2312,US,...,0,0,0,1,0,0,0,0,0,0
11278,103712,1,324475e5f191c8,1179111,784022,2,US>GA>524,5081,1759,US,...,0,0,0,1,0,0,0,0,0,0
13320,139944,1,27221a85141108,735143,926961,2,US>CA>807,17929,1782,US,...,0,0,0,1,0,0,0,0,0,0
13905,173393,0,9ea0907cee579e,1794259,971672,2,US>IL>602,21143,2876,US,...,0,0,0,1,0,0,0,0,0,0


In [34]:
clean = copy.deepcopy(data)
clean.head()

Unnamed: 0_level_0,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id,country,...,country_TZ,country_UA,country_UG,country_US,country_UZ,country_VI,country_VN,country_ZA,country_ZM,country_ZW
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8112,151682,1,715ddc85c78e39,1792396,559250,2,US>MI>563,19284,3283,US,...,0,0,0,1,0,0,0,0,0,0
10333,157598,0,ec3841c3250fb6,1405372,716674,2,US>MO>616,18022,2312,US,...,0,0,0,1,0,0,0,0,0,0
11278,103712,1,324475e5f191c8,1179111,784022,2,US>GA>524,5081,1759,US,...,0,0,0,1,0,0,0,0,0,0
13320,139944,1,27221a85141108,735143,926961,2,US>CA>807,17929,1782,US,...,0,0,0,1,0,0,0,0,0,0
13905,173393,0,9ea0907cee579e,1794259,971672,2,US>IL>602,21143,2876,US,...,0,0,0,1,0,0,0,0,0,0


In [35]:
len(data["ad_id"].unique())

6042

In [36]:
len(clean['document_id'].unique())

7240

In [37]:
#drop the non-cleaned up data for now
del clean['country'], clean['state'], clean["ad_id"], clean["uuid"], clean["document_id"],clean["timestamp"], clean["platform"], clean["geo_location"], clean['campaign_id'], clean["advertiser_id"]

In [39]:
len(clean.columns)
clean.head()

Unnamed: 0_level_0,clicked,loc_num,advertiser_id_4,advertiser_id_5,advertiser_id_8,advertiser_id_9,advertiser_id_10,advertiser_id_12,advertiser_id_13,advertiser_id_14,...,country_TZ,country_UA,country_UG,country_US,country_UZ,country_VI,country_VN,country_ZA,country_ZM,country_ZW
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8112,1,563,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10333,0,616,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11278,1,524,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13320,1,807,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
13905,0,602,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Make training and test sets

In [45]:
labels = clean['clicked']
labels = labels.values.reshape(-1,1) # check this please! my python is 3.5 and told me to use values.reshape
del clean['clicked']

In [55]:
train_data = clean[:round(.7*len(clean))]
train_labels = labels[:round(.7*len(clean))]
test_data = clean[round(.7*len(clean)):]
test_labels = labels[round(.7*len(clean)):]

In [56]:
lr = LogisticRegression()
lr.fit(train_data, train_labels)
lr.predict(test_data, test_labels)

# Logistic Regression is not the best model because we have to binarize our features
# to make them numeric. This leads to >5000 columns which is unruly and not optimal.

ValueError: could not convert string to float: 