In [44]:
import numpy as np
import pandas as pd

data_raw = pd.DataFrame.from_csv('data_raw/titanic_train.csv')

In [45]:
import numpy as np
from tqdm import tqdm

def create_dataset_from_data_column(iterable, label, vector_dim=100, num_rows=5000):
    iterable = np.array(iterable)    
    choice_range = len(iterable)
    iterable_str = iterable.astype(str)
    
    def contains_time_characters(string):
        time_chars = {':', '/', 
                     'hr', 'hour', 'min', 'minute', 'sec', 'second',
                     'day', 'week', 'year'}
        for char in time_chars:
            if char in string:
                return 1
        return 0
    
    vector_list = []
    for i in tqdm(list(range(num_rows))):
        indices = np.random.choice(choice_range, vector_dim)
        stringified_data = iterable_str[indices]
        
        length_data = np.vectorize(len)(stringified_data)
        sum_data = np.vectorize(lambda x: sum([ord(char) for char in x]))(stringified_data)
        avg_data = sum_data / length_data
        std_data = np.vectorize(lambda x: np.array([ord(char) for char in x]).std())(stringified_data)
        float_data = np.vectorize(lambda x: 1 if '.' in x else 0)(stringified_data)
        time_data = np.vectorize(contains_time_characters)(stringified_data)
        vec = np.concatenate((length_data, sum_data, avg_data, std_data, float_data, time_data))
        vector_list.append(vec)
        
    return np.array(vector_list), np.array([label] * num_rows)

In [46]:
data1 = create_dataset_from_data_column(data_raw['Survived'], 2)
data2 = create_dataset_from_data_column(data_raw['Age'], 0)
data3 = create_dataset_from_data_column(data_raw['Sex'], 1)
data4 = create_dataset_from_data_column(data_raw['Pclass'], 2)
data5 = create_dataset_from_data_column(data_raw['Name'], 1)
data6 = create_dataset_from_data_column(data_raw['SibSp'], 0)
data7 = create_dataset_from_data_column(data_raw['Fare'], 0)
data8 = create_dataset_from_data_column(data_raw['Ticket'], 1)
data9 = create_dataset_from_data_column(data_raw['Embarked'], 2)

100%|██████████| 5000/5000 [00:51<00:00, 97.91it/s] 
100%|██████████| 5000/5000 [00:54<00:00, 92.42it/s] 
100%|██████████| 5000/5000 [00:49<00:00, 100.62it/s]
100%|██████████| 5000/5000 [00:43<00:00, 115.13it/s]
100%|██████████| 5000/5000 [01:05<00:00, 75.98it/s]
100%|██████████| 5000/5000 [00:54<00:00, 91.67it/s] 
100%|██████████| 5000/5000 [00:53<00:00, 94.07it/s] 
100%|██████████| 5000/5000 [01:04<00:00, 77.65it/s] 
100%|██████████| 5000/5000 [00:47<00:00, 104.25it/s]


In [47]:
features, label = np.concatenate((data1[0], data2[0], data3[0], data4[0], data5[0], data6[0], data7[0], data8[0], data9[0]), axis=0), np.concatenate((data1[1], data2[1], data3[1], data4[1], data5[1], data6[1], data7[1], data8[1], data9[1]))

In [57]:
features.shape, label.shape

((45000, 600), (45000,))

In [49]:
import pickle

with open('data/features.pkl', 'wb') as handle:
    pickle.dump(features, handle)
with open('data/label.pkl', 'wb') as handle:
    pickle.dump(label, handle)

In [50]:
features.shape, label.shape

((45000, 600), (45000,))

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=1113)

ova_clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113))
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [52]:
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test) 

(1.0, 0.99992592592592588)

In [None]:
y_test[:20]

In [None]:
y_pred[:20]

In [53]:
len(y_test) * (1 - ova_clf.score(X_test, y_test))

1.000000000000556

In [None]:
X[:10, :]

In [62]:
from collections import Counter

def get_data_column_type(iterable, estimator, robustness=0.1, vector_dim=100):        
    iterable = np.array(iterable)
    choice_range = len(iterable)
    iterable_str = iterable.astype(str)
        
    def contains_time_characters(string):
        time_chars = {':', '/', 
                     'hr', 'hour', 'min', 'minute', 'sec', 'second',
                     'day', 'week', 'year'}
        for char in time_chars:
            if char in string:
                return 1
        return 0
    
    vector_list = []
    for i in (range(int(100 * robustness))):
        indices = np.random.choice(choice_range, vector_dim)
        stringified_data = iterable_str[indices]
        
        length_data = np.vectorize(len)(stringified_data)
        sum_data = np.vectorize(lambda x: sum([ord(char) for char in x]))(stringified_data)
        avg_data = sum_data / length_data
        std_data = np.vectorize(lambda x: np.array([ord(char) for char in x]).std())(stringified_data)
        float_data = np.vectorize(lambda x: 1 if '.' in x else 0)(stringified_data)
        time_data = np.vectorize(contains_time_characters)(stringified_data)
        vec = np.concatenate((length_data, sum_data, avg_data, std_data, float_data, time_data))
        
        vector_list.append(vec)
    
    prediction = estimator.predict(np.array(vector_list))
    prediction_count = Counter(prediction)        
    confidence = prediction_count.most_common(1)[0][1] / len(prediction)
    decode_dict = {0: 'numeric', 1: 'semantic categorical', 2: 'categorical', 3: 'time'}
        
    return decode_dict[round(prediction.mean())], confidence

In [63]:
def get_data_column_type_df(data, estimator, robustness=0.1, vector_dim=100):
    result_dict = {}
    
    if isinstance(data, pd.DataFrame):
        column_names = data.columns.values
        
        for i, colname in tqdm(list(enumerate(column_names))):
            datatype, confidence = get_data_column_type(data[colname], estimator, robustness=robustness)
            result_dict[colname] = datatype, confidence
    else:
        column_names = list(range(data.shape[1]))
        
        for i, colname in tqdm(list(enumerate(column_names))):
            datatype, confidence = get_data_column_type(data[:, colname], estimator, robustness=robustness)
            result_dict[colname] = datatype, confidence
    
    return result_dict

In [64]:
get_data_column_type_df(data_raw, ova_clf)

100%|██████████| 11/11 [00:04<00:00,  2.51it/s]


{'Age': ('numeric', 1.0),
 'Cabin': ('semantic categorical', 1.0),
 'Embarked': ('categorical', 1.0),
 'Fare': ('numeric', 1.0),
 'Name': ('semantic categorical', 1.0),
 'Parch': ('numeric', 1.0),
 'Pclass': ('categorical', 1.0),
 'Sex': ('semantic categorical', 1.0),
 'SibSp': ('numeric', 1.0),
 'Survived': ('categorical', 1.0),
 'Ticket': ('semantic categorical', 1.0)}

In [65]:
wine_raw = pd.DataFrame.from_csv('data_raw/winemag-data_first150k.csv')

In [66]:
wine_raw.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [67]:
get_data_column_type_df(wine_raw, ova_clf)

100%|██████████| 10/10 [00:04<00:00,  2.23it/s]


{'country': ('semantic categorical', 1.0),
 'description': ('semantic categorical', 1.0),
 'designation': ('semantic categorical', 1.0),
 'points': ('categorical', 1.0),
 'price': ('numeric', 1.0),
 'province': ('semantic categorical', 1.0),
 'region_1': ('semantic categorical', 1.0),
 'region_2': ('semantic categorical', 1.0),
 'variety': ('semantic categorical', 1.0),
 'winery': ('semantic categorical', 1.0)}

In [None]:
wine_raw['points'].value_counts()

In [16]:
wine_points = create_dataset_from_data_column(wine_raw['points'], 0)

100%|██████████| 5000/5000 [02:43<00:00, 50.32it/s] 


In [17]:
features, label = np.concatenate((features, wine_points[0]), axis=0), np.concatenate((label, wine_points[1])) 

In [18]:
features.shape, label.shape

((50000, 400), (50000,))

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=1113)

ova_clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113))
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [20]:
y_pred = ova_clf.predict(X_test)
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test) 

(1.0, 0.99973333333333336)

In [21]:
get_data_column_type_df(wine_raw, ova_clf)

100%|██████████| 10/10 [00:33<00:00,  3.26s/it]


{'country': ('semantic categorical', 1.0),
 'description': ('semantic categorical', 1.0),
 'designation': ('semantic categorical', 1.0),
 'points': ('numeric', 1.0),
 'price': ('numeric', 1.0),
 'province': ('semantic categorical', 1.0),
 'region_1': ('semantic categorical', 1.0),
 'region_2': ('semantic categorical', 1.0),
 'variety': ('semantic categorical', 1.0),
 'winery': ('semantic categorical', 1.0)}

In [22]:
ted_raw = pd.DataFrame.from_csv('data_raw/ted_main.csv')

In [23]:
ted_raw.head()

Unnamed: 0_level_0,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
comments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


In [41]:
get_data_column_type_df(ted_raw, ova_clf)

100%|██████████| 16/16 [00:09<00:00,  1.77it/s]


{'description': ('semantic categorical', 1.0),
 'duration': ('numeric', 1.0),
 'event': ('semantic categorical', 1.0),
 'film_date': ('time', 1.0),
 'languages': ('categorical', 1.0),
 'main_speaker': ('semantic categorical', 1.0),
 'name': ('semantic categorical', 1.0),
 'num_speaker': ('numeric', 1.0),
 'published_date': ('time', 1.0),
 'ratings': ('semantic categorical', 1.0),
 'related_talks': ('semantic categorical', 1.0),
 'speaker_occupation': ('semantic categorical', 1.0),
 'tags': ('semantic categorical', 1.0),
 'title': ('semantic categorical', 1.0),
 'url': ('semantic categorical', 1.0),
 'views': ('numeric', 1.0)}

In [None]:
ted_raw['num_speaker'].value_counts()

In [26]:
film_date = create_dataset_from_data_column(ted_raw['film_date'], 3)
languages = create_dataset_from_data_column(ted_raw['languages'], 2)
num_speaker = create_dataset_from_data_column(ted_raw['num_speaker'], 0)
published_date = create_dataset_from_data_column(ted_raw['published_date'], 3)
views = create_dataset_from_data_column(ted_raw['views'], 0)

100%|██████████| 5000/5000 [01:12<00:00, 68.94it/s] 
100%|██████████| 5000/5000 [00:45<00:00, 110.63it/s]
100%|██████████| 5000/5000 [00:46<00:00, 107.20it/s]
100%|██████████| 5000/5000 [00:51<00:00, 97.32it/s] 
100%|██████████| 5000/5000 [00:53<00:00, 93.71it/s] 


In [27]:
features, label = np.concatenate((features, film_date[0], languages[0], num_speaker[0], published_date[0], views[0]), axis=0), np.concatenate((label, film_date[1], languages[1], num_speaker[1], published_date[1], views[1]))

In [28]:
features.shape, label.shape

((75000, 400), (75000,))

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=1113)

ova_clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113))
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [30]:
y_pred = ova_clf.predict(X_test)
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test)

(1.0, 0.99986666666666668)

In [31]:
get_data_column_type_df(ted_raw, ova_clf)

100%|██████████| 16/16 [01:34<00:00,  6.03s/it]


{'description': ('semantic categorical', 1.0),
 'duration': ('numeric', 1.0),
 'event': ('semantic categorical', 1.0),
 'film_date': ('time', 1.0),
 'languages': ('categorical', 1.0),
 'main_speaker': ('semantic categorical', 1.0),
 'name': ('semantic categorical', 1.0),
 'num_speaker': ('numeric', 1.0),
 'published_date': ('time', 1.0),
 'ratings': ('semantic categorical', 1.0),
 'related_talks': ('semantic categorical', 1.0),
 'speaker_occupation': ('semantic categorical', 1.0),
 'tags': ('semantic categorical', 1.0),
 'title': ('semantic categorical', 1.0),
 'url': ('semantic categorical', 1.0),
 'views': ('numeric', 1.0)}

In [33]:
ufo_raw = pd.read_csv('data_raw/ufo-sightings.csv', error_bad_lines=False)

b'Skipping line 878: expected 11 fields, saw 12\nSkipping line 1713: expected 11 fields, saw 12\nSkipping line 1815: expected 11 fields, saw 12\nSkipping line 2858: expected 11 fields, saw 12\nSkipping line 3734: expected 11 fields, saw 12\nSkipping line 4756: expected 11 fields, saw 12\nSkipping line 5389: expected 11 fields, saw 12\nSkipping line 5423: expected 11 fields, saw 12\nSkipping line 5614: expected 11 fields, saw 12\nSkipping line 5849: expected 11 fields, saw 12\nSkipping line 6093: expected 11 fields, saw 12\nSkipping line 7516: expected 11 fields, saw 12\nSkipping line 7626: expected 11 fields, saw 12\nSkipping line 8893: expected 11 fields, saw 12\nSkipping line 9015: expected 11 fields, saw 12\nSkipping line 9571: expected 11 fields, saw 12\nSkipping line 9620: expected 11 fields, saw 12\nSkipping line 9751: expected 11 fields, saw 12\nSkipping line 10157: expected 11 fields, saw 12\nSkipping line 10427: expected 11 fields, saw 12\nSkipping line 12035: expected 11 fiel

In [36]:
ufo_raw.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [42]:
get_data_column_type_df(ufo_raw, ova_clf)

100%|██████████| 11/11 [00:06<00:00,  1.63it/s]


{'city': ('semantic categorical', 1.0),
 'comments': ('semantic categorical', 1.0),
 'country': ('numeric', 1.0),
 'date posted': ('time', 1.0),
 'datetime': ('time', 0.8),
 'duration (hours/min)': ('semantic categorical', 1.0),
 'duration (seconds)': ('categorical', 1.0),
 'latitude': ('time', 1.0),
 'longitude': ('time', 1.0),
 'shape': ('semantic categorical', 1.0),
 'state': ('semantic categorical', 1.0)}