In [5]:
#! pip install keras
#! pip install tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/ed/11/037887c5cbac5af3124050fb6348e67caa038734cc9673b11c31c8939072/tensorflow-1.14.0-cp37-cp37m-macosx_10_11_x86_64.whl (105.8MB)
[K     |████████████████████████████████| 105.8MB 21.1MB/s eta 0:00:01
[?25hCollecting wrapt>=1.11.1 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/23/84/323c2415280bc4fc880ac5050dddfb3c8062c2552b34c2e512eb4aa68f79/wrapt-1.11.2.tar.gz
Collecting tensorboard<1.15.0,>=1.14.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.2MB 39.6MB/s eta 0:00:01
Collecting google-pasta>=0.1.6 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/d0/33/376510eb8d6246f3c30545f416b2263eee461e40940c2a4413c711bdf62d/google_pasta-0.1.7-py3-none-any.whl (52kB)
[K     |

In [159]:
import requests 
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.layers import Dense
#from keras.wrappers.scikit_learn import KerasClassifier
#from sklearn.model_selection import KFold
from keras.utils import to_categorical

## Get train dataset

In [160]:
def get_data(url):
    '''Get the datasets from the Driven Data website by specific URLs and return a dataframe.'''
    
    html = requests.get(url)
    text = html.text.splitlines()
    
    # split each line into the columns, seperated by commas
    split_text = []
    for i in text:
        split_text.append(i.split(','))
     
    # get column names from first line
    col_names = split_text[0]
    
    value_dict = dict()
    # for each line add number of line as key and data as values to value_dictionary
    for i in range(1,len(split_text)):
        value_dict[i] = split_text[i]
    
    # create dataframe using column names and dictionary of data
    df = pd.DataFrame.from_dict(value_dict, orient='index', columns = col_names)
    
    # convert dtype into int or category
    pattern = re.compile(r'_id$|count_|age|_percentage$|has_|_grade$')
    for i in df.columns:
        if re.search(pattern, i):
            df[i] = df[i].astype(int)
        else:
            df[i] = df[i].astype('category')
    
    return df

**Hot-Encoding**

In [177]:
def get_data_dict(url):
    '''Get the train dataset from the Driven Data website by specific URLs and return a dictionary.'''
    
    html = requests.get(url)
    text = html.text.splitlines()
    
    # split each line into the columns, seperated by commas
    split_text = []
    for i in text:
        split_text.append(i.split(','))
     
    # get column names from first line
    col_names = split_text[0]
    
    data = []
    # for each line add number of line as key and data as values to value_dictionary
    for i in range(1,len(split_text)):
        value_dict = dict()
        for j in range(1,len(col_names)):
            value_dict[col_names[j]] = split_text[i][j]
        data.append(value_dict)
    
    
    # convert dtype into int or category
    pattern = re.compile(r'[0-9]+')
    for i in data:
        for key,val in i.items():
            if re.search(pattern, val):
                i[key] = int(val)
    
     # get building id
    build_id = []
    for i in range(1,len(split_text)):
        build_id.append(int(split_text[i][0]))

    df = pd.DataFrame(build_id,columns=['building_id'])

    
    return data,df

In [192]:
# create dataframe of train values
train_X,building_ID = get_data_dict("https://s3.amazonaws.com/drivendata/data/57/public/train_values.csv")
y_labels = get_data("https://s3.amazonaws.com/drivendata/data/57/public/train_labels.csv")

In [181]:
vec = DictVectorizer(sparse=False,dtype=int)
X = vec.fit_transform(train_X)
X

array([[30,  6,  1, ...,  1,  0,  0],
       [10,  8,  1, ...,  1,  0,  0],
       [10,  5,  1, ...,  1,  0,  0],
       ...,
       [55,  6,  1, ...,  0,  1,  0],
       [10, 14,  1, ...,  0,  0,  1],
       [10,  7,  3, ...,  1,  0,  0]], dtype=int64)

In [162]:
# create dummy variables
train_labels = pd.get_dummies(train_labels, drop_first=True)
train_labels.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
1,802906,6,487,12198,2,30,6,5,1,1,...,0,0,0,0,0,0,0,0,1,0
2,28830,8,900,2812,2,10,8,7,0,1,...,0,0,0,0,0,0,0,0,1,0
3,94947,21,363,8973,2,10,5,5,0,1,...,0,0,0,0,0,0,0,0,1,0
4,590882,22,418,10694,2,10,6,5,0,1,...,0,0,0,0,0,0,0,0,1,0
5,201944,11,131,1488,3,30,8,9,1,0,...,0,0,0,0,0,0,0,0,1,0


**Train_test split the train dataset**

In [182]:
# just get the damage grade column for y
y_labels = y_labels[['damage_grade']]
# practice training models by splitting the train data
X_train,X_test,y_train,y_test = train_test_split(X,y_labels, test_size=0.2, random_state=2)
y_train.shape

(208480, 1)

In [183]:
train_labels = [{'damage_grade':i} for i in y_train.damage_grade]
vec_y = DictVectorizer(sparse=False,dtype=int)
y_train = vec_y.fit_transform(train_labels)

In [184]:
# y_train = to_categorical(y_train,3)
y_train.shape

(208480, 1)

In [185]:
def to_one_hot(labels, dimension=3):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label-1] = 1.
    return results

# Our vectorized training labels
one_hot_train_labels = to_one_hot(y_train)

## Build the neural network

In [189]:
# baseline model
model = Sequential()
model.add(Dense(120, input_dim=68, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [190]:
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

In [191]:
model.fit(X, one_hot_train_labels)

Epoch 1/1


<keras.callbacks.History at 0x155b81240>

In [194]:
y_labels = y_labels[['damage_grade']]
train_labels = [{'damage_grade':i} for i in y_labels.damage_grade]
vec_y = DictVectorizer(sparse=False,dtype=int)
y_train = vec_y.fit_transform(train_labels)

In [195]:
def to_one_hot(labels, dimension=3):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label-1] = 1.
    return results

# Our vectorized training labels
one_hot_train_labels = to_one_hot(y_train)

In [196]:
# baseline model
model = Sequential()
model.add(Dense(120, input_dim=68, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(X, one_hot_train_labels)

Epoch 1/1


<keras.callbacks.History at 0x15d56bdd8>

In [197]:
test_X,building_id = get_data_dict("https://s3.amazonaws.com/drivendata/data/57/public/test_values.csv")
vec_test = DictVectorizer(sparse=False,dtype=int)
X_test = vec_test.fit_transform(test_X)

In [198]:
predictions = model.predict(X_test)

In [200]:
def from_one_hot(labels, dimension=3):
    #
    for i enumerate(labels):
        print(labels)
    return labels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [199]:
# create the submission format with building_id and predicted damage_grade
submission_df = building_id
submission_df['damage_grade'] = predictions
submission_df.to_csv('submission_kerasHOTENCODING.csv',index=False)

ValueError: Wrong number of items passed 3, placement implies 1