# Kaggle Competition 
## Rebekah Griesenauer

### Run useful functions

In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64

# TensorFlow with Dropout for Regression
############################################
%matplotlib inline
from matplotlib.pyplot import figure, show
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy.stats import zscore
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.models import Sequential


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
# This function submits an assignment.  You can submit an assignment as much as you like, only the final
# submission counts.  The paramaters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data,key,no,source_file=None):
    if source_file is None and '__file__' not in globals(): raise Exception('Must specify a filename when a Jupyter notebook.')
    if source_file is None: source_file = __file__
    suffix = '_class{}'.format(no)
    if suffix not in source_file: raise Exception('{} must be part of the filename.'.format(suffix))
    with open(source_file, "rb") as image_file:
        encoded_python = base64.b64encode(image_file.read()).decode('ascii')
    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']: raise Exception("Source file is {} must be .py or .ipynb".format(ext))
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
        headers={'x-api-key':key}, json={'csv':base64.b64encode(data.to_csv(index=False).encode('ascii')).decode("ascii"),
        'assignment': no, 'ext':ext, 'py':encoded_python})
    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else: print("Failure: {}".format(r.text))

Using TensorFlow backend.


In [2]:
# Setup path and read in data
path = "./data/all/"

filename_read_train = os.path.join(path,"train.csv")
filename_read_test = os.path.join(path,"test.csv")
df_train = pd.read_csv(filename_read_train,na_values=['NA', '?'])
df_test = pd.read_csv(filename_read_test,na_values=['NA', '?'])

# Set the desired TensorFlow output level for this example
tf.logging.set_verbosity(tf.logging.ERROR)

ids = df_test['id']
df_test.drop('id',1,inplace=True)
df_train.drop('id',1,inplace=True)

df_train = df_train.reindex(np.random.permutation(df_train.index))
df_train.reset_index(inplace=True, drop=True)

density_gold = 19.32
density_platinum = 21.09
density_bronze = 9.29
density_tin =  7.31
density_silver = 10.49

# Feature Engineering
1. ID: Drop
2. shape: Only has 3 shapes - encode as dummy variable
3. metal: All elemental metals - get weight, specific gravity from periodic table
4. metal_cost: may or may not be useful
5. height: can use to calculate volume
6. width: can use to calculate volume
7. length: can use to calculate volume
8. led: 
9. gears:
10. motors:
11. led_vol: probably not that useful to feed into NN - use to calculate the volume of one LED - remove metal vol
12. motor_vol: probably not that useful to feed into NN - use to calculate the volume of one motor - remove metal vol
13. gear_vol: probably not that useful to feed into NN - use to calculate the volume of one gear - remove metal vol
14. volume_parts: 
15. cost: fill missing values with median
16. weight (target):


Add a column that is volume,
estimate weight (volume * density),
estimate volume of led, motor, and gears



In [3]:
#calculate and populate volume for each different shape 
def feature_engineering_calculations(df):

    df['volume']=0
    df['volume'] = df.apply(
        lambda row: row['length']*row['height']*row['width'] if row['shape']=='box' else row['volume'],axis=1)

    df['volume'] = df.apply(
        lambda row: row['height']*np.pi*(row['width']/2)**2 if row['shape']=='cylinder' else row['volume'],axis=1)

    df['volume'] = df.apply(
        lambda row: (4/3)*np.pi*(row['length']/2)**3 if row['shape']=='sphere' else row['volume'],axis=1)
   
    df['est_weight']=0
    df['est_weight'] = df.apply(
        lambda row: density_gold*row['volume'] if row['metal']=='gold' else row['est_weight'],axis=1)

    df['est_weight'] = df.apply(
        lambda row: density_platinum*row['volume'] if row['metal']=='platinum' else row['est_weight'],axis=1)

    df['est_weight'] = df.apply(
        lambda row: density_bronze*row['volume'] if row['metal']=='bronze' else row['est_weight'],axis=1)

    df['est_weight'] = df.apply(
        lambda row: density_tin*row['volume'] if row['metal']=='tin' else row['est_weight'],axis=1)

    df['est_weight'] = df.apply(
        lambda row: density_silver*row['volume'] if row['metal']=='silver' else row['est_weight'],axis=1)

    df['led_vol'] = df['led']*0.027
    
    missing_median(df,'cost')
    df['price_per_metal'] = df['cost']/df['metal_cost']
    
    df['motor_vol'] = (2*2*2) * df['motors']
    df['gear_vol'] = (1*2*2) * df['gears']
    df['volume_parts'] = df['led_vol'] + df['motor_vol'] + df['gear_vol']
    
    df['final_volume'] = df['volume']-df['volume_parts']
    return df
    

In [11]:
def feature_engineering_encode(df):
    encode_text_dummy(df,"shape")
    encode_text_dummy(df,"metal")
    df_train.drop('metal_cost',1,inplace=True)
    df_train.drop('height',1,inplace=True)
    df_train.drop('width',1,inplace=True)
    df_train.drop('length',1,inplace=True)
    return df

In [5]:
df_train = feature_engineering_calculations(df_train)
df_train = feature_engineering_encode(df_train)

In [6]:
df_train['weight']=df_train['weight'].astype('float')
df_train.dtypes

led                  int64
gears                int64
motors               int64
led_vol            float64
motor_vol            int64
gear_vol             int64
volume_parts       float64
cost               float64
weight             float64
volume             float64
est_weight         float64
price_per_metal    float64
final_volume       float64
shape-box            uint8
shape-cylinder       uint8
shape-sphere         uint8
metal-bronze         uint8
metal-gold           uint8
metal-platinum       uint8
metal-silver         uint8
metal-tin            uint8
dtype: object

In [7]:
x,y = to_xy(df_train,"weight")
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.20, random_state=45)



In [8]:
model = Sequential()
model.add(Dense(100, input_dim=x.shape[1]))
model.add(Dropout(0.01))
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=1, mode='auto')
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
pred = model.predict(x_test)
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Epoch 00036: early stopping
Final score (RMSE): 211.93797302246094


In [10]:
df_test = feature_engineering_calculations(df_test)
df_test = feature_engineering_encode(df_test)
x = df_test.as_matrix().astype(np.float32)

KeyError: ('shape', 'occurred at index 0')

In [12]:
pred = model.predict(x)

In [13]:
submit_df=pd.DataFrame(pred)
submit_df.insert(0,'id',ids)
submit_df.columns = ['id','weight']
submit_df=submit_df.set_index('id')
submit_df.to_csv('kaggle_submit_df2.csv')

In [14]:
submit_df

Unnamed: 0_level_0,weight
id,Unnamed: 1_level_1
0.0,8644.963867
1.0,2385.978271
2.0,1091.664551
3.0,939.879700
4.0,947.205139
5.0,7922.657715
6.0,3113.919434
7.0,1145.597900
8.0,4774.109863
9.0,1797.096924
