In [3]:
import numpy as np
import pandas as pd

# Integer class code used as the training label for each detectable column type.
detect_type = dict(zip(('numeric', 'num_cat', 'str_cat', 'time'), range(4)))

In [16]:
import numpy as np
from tqdm import tqdm

def create_dataset_from_data_column(iterable, label, vector_dim=100, num_rows=2000):
    """Turn one data column into labelled training rows for the column-type classifier.

    Every value is converted to its string form. Each output row is built from
    ``vector_dim`` values drawn at random, with replacement, from the column and
    consists of six concatenated groups of ``vector_dim`` entries each, in this
    order: string length, sum of code points, mean code point, standard
    deviation of code points, a 0/1 flag for a '.' in the string, and a 0/1
    flag for a time-like token (':', '/', 'hr', 'day', ...).

    Parameters
    ----------
    iterable : array-like
        The column values; they are stringified with ``astype(str)``.
    label : int
        Class code stored for every generated row.
    vector_dim : int, default 100
        Number of values sampled per row; a row has ``6 * vector_dim`` features.
    num_rows : int, default 2000
        Number of rows to generate.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The feature matrix (one row per sample) and the array holding
        ``label`` repeated ``num_rows`` times.

    Raises
    ------
    ValueError
        If the column is empty while rows are requested.
    """
    iterable_str = np.array(iterable).astype(str)
    choice_range = len(iterable_str)
    if choice_range == 0 and num_rows > 0:
        raise ValueError('cannot sample training rows from an empty data column')

    time_tokens = (':', '/',
                   'hr', 'hour', 'min', 'minute', 'sec', 'second',
                   'day', 'week', 'year')

    # The same strings are drawn over and over, so each distinct string is
    # described once and looked up afterwards.
    feature_cache = {}

    def string_features(text):
        """Return (length, sum, mean, std, has_dot, has_time_token) for one string."""
        entry = feature_cache.get(text)
        if entry is None:
            code_points = [ord(char) for char in text]
            length = len(code_points)
            total = sum(code_points)
            if length:
                mean = total / length
                spread = np.array(code_points).std()
            else:
                # An empty string has no characters; use 0 instead of the NaN
                # that 0/0 and the std of an empty array would produce, since
                # NaN features cannot be fed to the estimator.
                mean = 0.0
                spread = 0.0
            entry = (length, total, mean, spread,
                     1 if '.' in text else 0,
                     1 if any(token in text for token in time_tokens) else 0)
            feature_cache[text] = entry
        return entry

    vector_list = []
    for _ in tqdm(range(num_rows)):
        indices = np.random.choice(choice_range, vector_dim)
        sampled = iterable_str[indices].tolist()
        per_value = np.array([string_features(text) for text in sampled], dtype=float)
        # (vector_dim, 6) -> all lengths, then all sums, then all means, ...
        vector_list.append(per_value.reshape(-1, 6).T.ravel())

    return np.array(vector_list), np.array([label] * num_rows)

# Titanic dataset

In [17]:
# DataFrame.from_csv is deprecated (and absent from later pandas releases);
# read_csv with index_col=0 and parse_dates=True is its documented equivalent,
# so the first CSV column (PassengerId) still becomes the index.
titanic_data_raw = pd.read_csv('../data/titanic/raw/train.csv', index_col=0, parse_dates=True)

In [18]:
# Preview the first rows of the raw Titanic table to see what each column looks like.
titanic_data_raw.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# detect_type = {'numeric': 0, 'num_cat': 1, 'str_cat': 2, 'time': 3}

# One (feature matrix, label vector) pair per Titanic column, generated in the
# listed order with the hand-assigned type code for that column.
titanic_data = [
    create_dataset_from_data_column(titanic_data_raw[column_name], type_code)
    for column_name, type_code in (
        ('Survived', 1),
        ('Pclass', 1),
        ('Name', 2),
        ('Sex', 2),
        ('Age', 0),
        ('SibSp', 0),
        ('Parch', 0),
        ('Ticket', 2),
        ('Fare', 0),
        ('Cabin', 2),
        ('Embarked', 2),
    )
]

100%|██████████| 2000/2000 [00:10<00:00, 190.97it/s]
100%|██████████| 2000/2000 [00:14<00:00, 135.01it/s]
100%|██████████| 2000/2000 [00:21<00:00, 93.37it/s] 
100%|██████████| 2000/2000 [00:12<00:00, 156.43it/s]
100%|██████████| 2000/2000 [00:12<00:00, 156.17it/s]
100%|██████████| 2000/2000 [00:14<00:00, 134.11it/s]
100%|██████████| 2000/2000 [00:15<00:00, 131.38it/s]
100%|██████████| 2000/2000 [00:15<00:00, 128.40it/s]
100%|██████████| 2000/2000 [00:08<00:00, 237.05it/s]
100%|██████████| 2000/2000 [00:20<00:00, 98.72it/s] 
100%|██████████| 2000/2000 [00:09<00:00, 211.50it/s]


In [28]:
# Stack the per-column sample matrices into one training matrix, and the
# per-column label vectors into the matching label array.
features = np.concatenate([X for X, _ in titanic_data], axis=0)
label = np.concatenate([y for _, y in titanic_data], axis=0)

In [29]:
# Sanity check: there should be exactly one label per feature row.
features.shape, label.shape

((22000, 600), (22000,))

In [31]:
import pickle

# Persist the training matrix and its labels as two separate pickle files.
for target_path, payload in (
    ('../data/auto_datatyper/features.pkl', features),
    ('../data/auto_datatyper/label.pkl', label),
):
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle)

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the sampled rows; the fixed seed keeps the split repeatable.
# NOTE(review): every row is a random re-sample of the same few columns, so the
# held-out rows are probably very similar to the training rows - treat the
# test score as optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=1113,
)

# One random forest per type code, combined one-vs-rest.
ova_clf = OneVsRestClassifier(
    estimator=RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113)
)
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [33]:
# Score on the training rows and on the held-out rows, in that order.
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test) 

(1.0, 0.99984848484848488)

In [47]:
from collections import Counter

def get_data_column_type(iterable, estimator, robustness=0.1, vector_dim=100):
    """Predict the type of one data column by majority vote over resampled vectors.

    The column is stringified and ``round(100 * robustness)`` (at least one)
    feature vectors are built, each from ``vector_dim`` values drawn at random
    with replacement. Every vector holds six concatenated groups of
    ``vector_dim`` entries: string length, sum of code points, mean code point,
    standard deviation of code points, a '.' flag and a time-token flag. The
    estimator classifies each vector and the most frequent class wins.

    Parameters
    ----------
    iterable : array-like
        The column values; they are stringified with ``astype(str)``.
    estimator : object
        Fitted classifier exposing ``predict``; it must return the integer
        codes 0-3 used in ``decode_dict`` below.
    robustness : float, default 0.1
        Controls how many vectors vote: ``round(100 * robustness)``, minimum 1.
    vector_dim : int, default 100
        Number of values sampled per vector; must match the value used when
        the estimator's training rows were generated.

    Returns
    -------
    (str, float)
        The name of the winning type and the fraction of votes it received.
        On a tie the class that was predicted first wins.

    Raises
    ------
    ValueError
        If the column is empty.
    KeyError
        If the estimator predicts a code that is not in ``decode_dict``.
    """
    iterable_str = np.array(iterable).astype(str)
    choice_range = len(iterable_str)
    if choice_range == 0:
        raise ValueError('cannot detect the type of an empty data column')

    time_tokens = (':', '/',
                   'hr', 'hour', 'min', 'minute', 'sec', 'second',
                   'day', 'week', 'year')

    # Each distinct string is described once and looked up afterwards.
    feature_cache = {}

    def string_features(text):
        """Return (length, sum, mean, std, has_dot, has_time_token) for one string."""
        entry = feature_cache.get(text)
        if entry is None:
            code_points = [ord(char) for char in text]
            length = len(code_points)
            total = sum(code_points)
            if length:
                mean = total / length
                spread = np.array(code_points).std()
            else:
                # Empty string: use 0 instead of the NaN that 0/0 would give,
                # because NaN features cannot be fed to the estimator.
                mean = 0.0
                spread = 0.0
            entry = (length, total, mean, spread,
                     1 if '.' in text else 0,
                     1 if any(token in text for token in time_tokens) else 0)
            feature_cache[text] = entry
        return entry

    # Always cast at least one vote, otherwise predict() receives an empty array.
    num_votes = max(1, int(round(100 * robustness)))

    vector_list = []
    for _ in range(num_votes):
        indices = np.random.choice(choice_range, vector_dim)
        sampled = iterable_str[indices].tolist()
        per_value = np.array([string_features(text) for text in sampled], dtype=float)
        # (vector_dim, 6) -> all lengths, then all sums, then all means, ...
        vector_list.append(per_value.reshape(-1, 6).T.ravel())

    prediction = np.asarray(estimator.predict(np.array(vector_list)))
    prediction_count = Counter(prediction.tolist())

    # Class codes are categories, not quantities: averaging them (six votes for
    # 3 and four for 0 average to about 2) can name a class nobody voted for.
    # Report the most frequent class, the same one the confidence refers to.
    winner, winner_votes = prediction_count.most_common(1)[0]
    confidence = winner_votes / len(prediction)

    decode_dict = {0: 'numeric', 1: 'num_cat', 2: 'str_cat', 3: 'time'}

    return decode_dict[winner], confidence

In [48]:
def get_data_column_type_df(data, estimator, robustness=0.1, vector_dim=100):
    """Predict the type of every column of a table.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        A DataFrame is processed column by column under its column names;
        anything else is converted with ``np.asarray`` and its columns are
        keyed by their integer position.
    estimator : object
        Fitted classifier passed on to ``get_data_column_type``.
    robustness : float, default 0.1
        Passed on to ``get_data_column_type``.
    vector_dim : int, default 100
        Passed on to ``get_data_column_type``; must match the value the
        estimator was trained with.

    Returns
    -------
    dict
        Maps each column name (or position) to a
        ``(type name, confidence)`` tuple.
    """
    if isinstance(data, pd.DataFrame):
        columns = [(colname, data[colname]) for colname in data.columns.values]
    else:
        matrix = np.asarray(data)
        columns = [(position, matrix[:, position]) for position in range(matrix.shape[1])]

    result_dict = {}
    for colname, column in tqdm(columns):
        # vector_dim has to be forwarded: the feature width must agree with
        # the width the estimator was fitted on.
        datatype, confidence = get_data_column_type(
            column, estimator, robustness=robustness, vector_dim=vector_dim
        )
        result_dict[colname] = datatype, confidence

    return result_dict

In [49]:
# Detect the type of every Titanic column with the classifier fitted above;
# each value in the result is a (type name, confidence) pair.
get_data_column_type_df(titanic_data_raw, ova_clf)

100%|██████████| 11/11 [00:04<00:00,  2.68it/s]


{'Age': ('numeric', 1.0),
 'Cabin': ('str_cat', 1.0),
 'Embarked': ('str_cat', 1.0),
 'Fare': ('numeric', 1.0),
 'Name': ('str_cat', 1.0),
 'Parch': ('numeric', 1.0),
 'Pclass': ('num_cat', 1.0),
 'Sex': ('str_cat', 1.0),
 'SibSp': ('numeric', 1.0),
 'Survived': ('num_cat', 1.0),
 'Ticket': ('str_cat', 1.0)}

# Wine Reviews dataset

In [41]:
# DataFrame.from_csv is deprecated (and absent from later pandas releases);
# read_csv with index_col=0 and parse_dates=True is its documented equivalent.
wine_data_raw = pd.read_csv('../data/wine_reviews/winemag-data_first150k.csv', index_col=0, parse_dates=True)

In [42]:
# Preview the first rows of the raw wine-review table.
wine_data_raw.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [50]:
# Apply the classifier trained only on Titanic columns to a dataset it has not seen.
# In the output below the integer 'points' column is reported as 'str_cat'.
get_data_column_type_df(wine_data_raw, ova_clf)

100%|██████████| 10/10 [00:04<00:00,  2.50it/s]


{'country': ('str_cat', 1.0),
 'description': ('str_cat', 1.0),
 'designation': ('str_cat', 1.0),
 'points': ('str_cat', 1.0),
 'price': ('numeric', 1.0),
 'province': ('str_cat', 1.0),
 'region_1': ('str_cat', 1.0),
 'region_2': ('str_cat', 1.0),
 'variety': ('str_cat', 1.0),
 'winery': ('str_cat', 1.0)}

In [54]:
# Generate training rows from 'points' labelled 0 (numeric), the column the
# previous run reported as 'str_cat'.
wine_points = create_dataset_from_data_column(wine_data_raw['points'], 0)

100%|██████████| 2000/2000 [00:07<00:00, 267.56it/s]


In [55]:
# Append the 'points' samples and their labels to the training set.
# NOTE: this extends features/label in place, so re-running the cell appends
# the same rows a second time.
features = np.concatenate([features, wine_points[0]], axis=0)
label = np.concatenate([label, wine_points[1]], axis=0)

In [56]:
# Sanity check: the row count should have grown by the size of wine_points.
features.shape, label.shape

((24000, 600), (24000,))

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Re-split and refit from scratch on the training set that now includes the
# wine 'points' samples (same 30% hold-out and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=1113,
)

# One random forest per type code, combined one-vs-rest.
ova_clf = OneVsRestClassifier(
    estimator=RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113)
)
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [58]:
# Predict the held-out rows, then score the training and held-out rows.
# NOTE(review): y_pred is not read by any later cell visible in this notebook.
y_pred = ova_clf.predict(X_test)
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test) 

(1.0, 0.99986111111111109)

In [60]:
# Show the raw wine rows again, for comparison with the detected types below.
wine_data_raw.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [59]:
# Re-run detection with the refitted classifier; 'points' is now reported as 'numeric'.
get_data_column_type_df(wine_data_raw, ova_clf)

100%|██████████| 10/10 [00:04<00:00,  2.46it/s]


{'country': ('str_cat', 1.0),
 'description': ('str_cat', 1.0),
 'designation': ('str_cat', 1.0),
 'points': ('numeric', 1.0),
 'price': ('numeric', 1.0),
 'province': ('str_cat', 1.0),
 'region_1': ('str_cat', 1.0),
 'region_2': ('str_cat', 1.0),
 'variety': ('str_cat', 1.0),
 'winery': ('str_cat', 1.0)}

In [61]:
# DataFrame.from_csv is deprecated (and absent from later pandas releases);
# read_csv with index_col=0 and parse_dates=True is its documented equivalent.
# As before, the first CSV column ('comments') becomes the index and is
# therefore not one of the columns that get typed.
ted_data_raw = pd.read_csv('../data/ted/ted_main.csv', index_col=0, parse_dates=True)

In [62]:
# Preview the first rows of the raw TED talks table.
ted_data_raw.head()

Unnamed: 0_level_0,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
comments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


In [63]:
# Detect types on the TED table with the current classifier. In the output
# below film_date, published_date and views are all reported as 'str_cat'.
get_data_column_type_df(ted_data_raw, ova_clf)

100%|██████████| 16/16 [00:06<00:00,  2.55it/s]


{'description': ('str_cat', 1.0),
 'duration': ('numeric', 1.0),
 'event': ('str_cat', 1.0),
 'film_date': ('str_cat', 1.0),
 'languages': ('numeric', 1.0),
 'main_speaker': ('str_cat', 1.0),
 'name': ('str_cat', 1.0),
 'num_speaker': ('num_cat', 1.0),
 'published_date': ('str_cat', 1.0),
 'ratings': ('str_cat', 1.0),
 'related_talks': ('str_cat', 1.0),
 'speaker_occupation': ('str_cat', 1.0),
 'tags': ('str_cat', 1.0),
 'title': ('str_cat', 1.0),
 'url': ('str_cat', 1.0),
 'views': ('str_cat', 1.0)}

In [65]:
# detect_type = {'numeric': 0, 'num_cat': 1, 'str_cat': 2, 'time': 3}

# Generate labelled samples for the TED columns, in this order; the two date
# columns get code 3 (time), the rest code 0 (numeric).
film_date, languages, num_speaker, published_date, views = (
    create_dataset_from_data_column(ted_data_raw[column_name], type_code)
    for column_name, type_code in (
        ('film_date', 3),
        ('languages', 0),
        ('num_speaker', 0),
        ('published_date', 3),
        ('views', 0),
    )
)

100%|██████████| 2000/2000 [00:10<00:00, 197.79it/s]
100%|██████████| 2000/2000 [00:10<00:00, 182.77it/s]
100%|██████████| 2000/2000 [00:12<00:00, 165.19it/s]
100%|██████████| 2000/2000 [00:11<00:00, 169.15it/s]
100%|██████████| 2000/2000 [00:10<00:00, 190.44it/s]


In [68]:
# Append the five TED sample sets and their labels to the training set.
# NOTE: this extends features/label in place, so re-running the cell appends
# the same rows a second time.
features = np.concatenate(
    [features] + [X for X, _ in (film_date, languages, num_speaker, published_date, views)],
    axis=0,
)
label = np.concatenate(
    [label] + [y for _, y in (film_date, languages, num_speaker, published_date, views)],
    axis=0,
)

In [69]:
# Sanity check: the row count should have grown by the five TED sample sets.
features.shape, label.shape

((34000, 600), (34000,))

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Re-split and refit from scratch on the training set that now includes the
# TED samples (same 30% hold-out and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=1113,
)

# One random forest per type code, combined one-vs-rest.
ova_clf = OneVsRestClassifier(
    estimator=RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113)
)
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [71]:
# Predict the held-out rows, then score the training and held-out rows.
# NOTE(review): y_pred is not read by any later cell visible in this notebook.
y_pred = ova_clf.predict(X_test)
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test)

(1.0, 0.9999019607843137)

In [73]:
# Show the raw TED rows again, for comparison with the detected types below.
ted_data_raw.head()

Unnamed: 0_level_0,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
comments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


In [72]:
# Re-run detection with the refitted classifier; the date columns are now
# reported as 'time' and views/num_speaker as 'numeric'.
get_data_column_type_df(ted_data_raw, ova_clf)

100%|██████████| 16/16 [00:08<00:00,  1.87it/s]


{'description': ('str_cat', 1.0),
 'duration': ('numeric', 1.0),
 'event': ('str_cat', 1.0),
 'film_date': ('time', 1.0),
 'languages': ('numeric', 1.0),
 'main_speaker': ('str_cat', 1.0),
 'name': ('str_cat', 1.0),
 'num_speaker': ('numeric', 1.0),
 'published_date': ('time', 1.0),
 'ratings': ('str_cat', 1.0),
 'related_talks': ('str_cat', 1.0),
 'speaker_occupation': ('str_cat', 1.0),
 'tags': ('str_cat', 1.0),
 'title': ('str_cat', 1.0),
 'url': ('str_cat', 1.0),
 'views': ('numeric', 1.0)}

# UFO Sightings dataset

In [77]:
# Load the UFO sightings CSV; rows the parser rejects are dropped without a
# warning (error_bad_lines=False, warn_bad_lines=False).
# NOTE(review): later pandas releases replace these two flags with
# on_bad_lines='skip' - confirm against the installed pandas before upgrading.
ufo_data_raw = pd.read_csv('../data/ufo_sightings/ufo-sightings.csv', error_bad_lines=False, warn_bad_lines=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [78]:
# Preview the first rows of the raw UFO sightings table.
ufo_data_raw.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [79]:
# Detect types on the UFO table with the current classifier. In the output
# below latitude/longitude are reported as 'time' and the two date columns
# as 'str_cat'.
get_data_column_type_df(ufo_data_raw, ova_clf)

100%|██████████| 11/11 [00:05<00:00,  1.96it/s]


{'city': ('str_cat', 1.0),
 'comments': ('str_cat', 1.0),
 'country': ('str_cat', 1.0),
 'date posted': ('str_cat', 1.0),
 'datetime': ('str_cat', 1.0),
 'duration (hours/min)': ('str_cat', 1.0),
 'duration (seconds)': ('numeric', 1.0),
 'latitude': ('time', 1.0),
 'longitude': ('time', 1.0),
 'shape': ('str_cat', 1.0),
 'state': ('str_cat', 1.0)}

In [80]:
# Integer class code used as the training label for each detectable column type.
detect_type = dict(zip(('numeric', 'num_cat', 'str_cat', 'time'), range(4)))

# Labelled samples for the four UFO columns that were mistyped above:
# the date columns get code 3 (time), the coordinates code 0 (numeric).
ufo_data = [
    create_dataset_from_data_column(ufo_data_raw[column_name], type_code)
    for column_name, type_code in (
        ('datetime', 3),
        ('date posted', 3),
        ('latitude', 0),
        ('longitude', 0),
    )
]

100%|██████████| 2000/2000 [00:13<00:00, 151.22it/s]
100%|██████████| 2000/2000 [00:13<00:00, 151.17it/s]
100%|██████████| 2000/2000 [00:13<00:00, 150.34it/s]
100%|██████████| 2000/2000 [00:15<00:00, 131.10it/s]


In [81]:
# Append the four UFO sample sets and their labels to the training set.
# NOTE: this extends features/label in place, so re-running the cell appends
# the same rows a second time.
features = np.concatenate([features] + [X for X, _ in ufo_data], axis=0)
label = np.concatenate([label] + [y for _, y in ufo_data], axis=0)

In [82]:
# Sanity check: the row count should have grown by the four UFO sample sets.
features.shape, label.shape

((42000, 600), (42000,))

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Re-split and refit from scratch on the training set that now includes the
# UFO samples (same 30% hold-out and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=1113,
)

# One random forest per type code, combined one-vs-rest.
ova_clf = OneVsRestClassifier(
    estimator=RandomForestClassifier(n_estimators=128, n_jobs=-1, random_state=1113)
)
ova_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=1113, verbose=0,
            warm_start=False),
          n_jobs=1)

In [84]:
# Predict the held-out rows, then score the training and held-out rows.
# NOTE(review): y_pred is not read by any later cell visible in this notebook.
y_pred = ova_clf.predict(X_test)
ova_clf.score(X_train, y_train), ova_clf.score(X_test, y_test)

(1.0, 0.99976190476190474)

In [85]:
# Show the raw UFO rows again, for comparison with the detected types below.
ufo_data_raw.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [86]:
# Re-run detection with the refitted classifier; the date columns are now
# reported as 'time' and latitude/longitude as 'numeric'.
get_data_column_type_df(ufo_data_raw, ova_clf)

100%|██████████| 11/11 [00:05<00:00,  1.95it/s]


{'city': ('str_cat', 1.0),
 'comments': ('str_cat', 1.0),
 'country': ('str_cat', 1.0),
 'date posted': ('time', 1.0),
 'datetime': ('time', 1.0),
 'duration (hours/min)': ('str_cat', 1.0),
 'duration (seconds)': ('numeric', 1.0),
 'latitude': ('numeric', 1.0),
 'longitude': ('numeric', 1.0),
 'shape': ('str_cat', 1.0),
 'state': ('str_cat', 1.0)}