In [None]:
# Settings for this notebook

MODEL_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
model_dir = '/tmp/imagenet'

IMAGES_INDEX_URL = 'http://image-net.org/imagenet_data/urls/imagenet_fall11_urls.tgz'
images_read_limit = 1000L  # Increase this to read more images

# Number of images per batch.
# 1 batch corresponds to 1 RDD row.
image_batch_size = 3

num_top_predictions = 5

In [None]:
import numpy as np

import os
import os.path
import re
import sys
import tarfile
from subprocess import Popen, PIPE, STDOUT
try:
    import tensorflow as tf
    print "TensorFlow is already installed"
except ImportError:
    print "Installing TensorFlow"
    import subprocess
    subprocess.check_call(["/databricks/python/bin/pip", "install", "https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp27-none-linux_x86_64.whl"])
    from tensorflow.python.platform import gfile
    import tensorflow as tf
    print "TensorFlow has been installed on this cluster"

In [None]:
def maybe_download_and_extract():
    from six.moves import urllib
    dest_directory = model_dir
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = MODEL_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        filepath2, _ = urllib.request.urlretrieve(MODEL_URL, filepath)
        print("filepath2", filepath2)
        statinfo = os.stat(filepath)
        print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
        tarfile.open(filepath, 'r:gz').extractall(dest_directory)
    else:
        print('Data already downloaded:', filepath, os.stat(filepath))

maybe_download_and_extract()

In [None]:
model_path = os.path.join(model_dir, 'classify_image_graph_def.pb')
with tf.python.platform.gfile.FastGFile(model_path, 'rb') as f:
    model_data = f.read()

In [None]:
import math
import numpy
import random
import types
from itertools import izip, tee, imap
from operator import itemgetter
from cStringIO import StringIO
import tensorflow as tf

In [None]:
class ConvNet():
    IMAGE_PIXELS = 784
    NUM_CLASSES = 10
    def __init__(self):
        self.build(train=False)

    def build(self, keep_prob=1.0, lr = 1e-4):
        self.x = tf.placeholder(tf.float32, [None, self.IMAGE_PIXELS], name='x-input')
        self.y_ = tf.placeholder(tf.float32, [None, self.NUM_CLASSES], name='y-input')

        def weight_variable(shape):
            return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
        def bias_variable(shape):
            return tf.Variable(tf.constant(0.1, shape=shape))
        def conv2d(x, W):
            return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
        def max_pool_2x2(x):
            return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

        W_conv1 = weight_variable([5, 5, 1, 32])
        b_conv1 = bias_variable([32])
        x_image = tf.reshape(self.x, [-1,28,28,1])
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
        h_pool1 = max_pool_2x2(h_conv1)

        W_conv2 = weight_variable([5, 5, 32, 64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)

        W_fc1 = weight_variable([7 * 7 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

        W_fc2 = weight_variable([1024, self.NUM_CLASSES])
        b_fc2 = bias_variable([self.NUM_CLASSES])

        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
        y = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

        y = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

        # training
        self.loss = -tf.reduce_sum(self.y_*tf.log(y))
        self.train = tf.train.AdamOptimizer(lr).minimize(self.loss)

        # evaluation
        self.evaluate = tf.cast(tf.equal(tf.argmax(y,1), tf.argmax(self.y_,1)), tf.float32)

    def transform_batch(self, batch):
        x, y_ = iunzip(batch)
        return {
          self.x: list(x),
          self.y_: list(y_)
        }

class ConvNetEval(ConvNet):
    def __init__(self):
        self.build(keep_prob = 1.)

class ConvNetTrain(ConvNet):
    def __init__(self):
        self.build(
          keep_prob = .5,
          lr = 1e-4)

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

partitions = 8

train_data = sc\
  .parallelize(zip(mnist.train.images, mnist.train.labels), partitions)\
  .cache()     
test_data = sc\
  .parallelize(zip(mnist.test.images, mnist.test.labels), partitions)\
  .cache()

In [None]:
master_graph = tf.Graph()
with master_graph.as_default(), tf.Session() as session:
    dg = DistributedGraph(sc, session, partitions, 50, ConvNetEval)
      #dg.params = last_params
    for i in xrange(4):
        batched_train_data = dg.shuffle_and_batch(train_data, dg.partitions, dg.batch_size, 0.5)
        x = dg.train(batched_train_data, worker_epochs=2, graph_cls=ConvNetTrain)
        print 'Training loss: %.2f' % x
        if i == 0 or ((i+1) % 2 == 0):
            print 'Epoch %i Test Acc: %.3f' % (i, dg.evaluate(test_data, graph_cls=ConvNetEval))

In [None]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'


In [1]:
import math
import numpy
import random
import types
from itertools import izip, tee, imap
from operator import itemgetter
from cStringIO import StringIO
import tensorflow as tf

In [2]:
class ConvNet():
    IMAGE_PIXELS = 784
    NUM_CLASSES = 10
    def __init__(self):
        self.build(train=False)

    def build(self, keep_prob=1.0, lr = 1e-4):
        self.x = tf.placeholder(tf.float32, [None, self.IMAGE_PIXELS], name='x-input')
        self.y_ = tf.placeholder(tf.float32, [None, self.NUM_CLASSES], name='y-input')

        def weight_variable(shape):
            return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
        def bias_variable(shape):
            return tf.Variable(tf.constant(0.1, shape=shape))
        def conv2d(x, W):
            return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
        def max_pool_2x2(x):
            return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

        W_conv1 = weight_variable([5, 5, 1, 32])
        b_conv1 = bias_variable([32])
        x_image = tf.reshape(self.x, [-1,28,28,1])
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
        h_pool1 = max_pool_2x2(h_conv1)

        W_conv2 = weight_variable([5, 5, 32, 64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)

        W_fc1 = weight_variable([7 * 7 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

        W_fc2 = weight_variable([1024, self.NUM_CLASSES])
        b_fc2 = bias_variable([self.NUM_CLASSES])

        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
        y = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

        y = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

        # training
        self.loss = -tf.reduce_sum(self.y_*tf.log(y))
        self.train = tf.train.AdamOptimizer(lr).minimize(self.loss)

        # evaluation
        self.evaluate = tf.cast(tf.equal(tf.argmax(y,1), tf.argmax(self.y_,1)), tf.float32)

    def transform_batch(self, batch):
        x, y_ = iunzip(batch)
        return {
          self.x: list(x),
          self.y_: list(y_)
        }

class ConvNetEval(ConvNet):
    def __init__(self):
        self.build(keep_prob = 1.)

class ConvNetTrain(ConvNet):
    def __init__(self):
        self.build(
          keep_prob = .5,
          lr = 1e-4)

In [3]:
from tensorflow.examples.tutorials.mnist import input_data

In [4]:
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [12]:
mnist.train.num_examples

55000

In [5]:
import findspark
findspark.init("/home/canwill/spark2/")
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("TensorOnSpark") \
    .config("spark.executor.memory", "5g") \
    .getOrCreate()

In [6]:
partitions = 8

In [7]:
train_data = spark.sparkContext\
  .parallelize(zip(mnist.train.images, mnist.train.labels), partitions)\
  .cache()     

In [8]:
test_data = spark.sparkContext\
  .parallelize(zip(mnist.test.images, mnist.test.labels), partitions)\
  .cache()

In [15]:
train_data.first()

(array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [13]:
master_graph = tf.Graph()

In [14]:
with master_graph.as_default(), tf.Session() as session:
    dg = DistributedGraph(spark.sparkContext, session, partitions, 50, ConvNetEval)
    for i in xrange(4):
        batched_train_data = dg.shuffle_and_batch(train_data, dg.partitions, dg.batch_size, 0.5)
        x = dg.train(batched_train_data, worker_epochs=2, graph_cls=ConvNetTrain)
        print 'Training loss: %.2f' % x
        if i == 0 or ((i+1) % 2 == 0):
            print 'Epoch %i Test Acc: %.3f' % (i, dg.evaluate(test_data, graph_cls=ConvNetEval))

NameError: name 'DistributedGraph' is not defined

In [None]:
import findspark
findspark.init("/home/canwill/spark2/")
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("TensorOnSpark") \
    .config("spark.executor.memory", "5g") \
    .getOrCreate()

In [36]:
from pyspark.sql.functions import lit # Create columns of *literal* value
from pyspark.sql.functions import col

In [23]:
train = spark.read.csv('data/train.csv', header=True)

In [25]:
train.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [26]:
test = spark.read.csv('data/test.csv', header=True)

In [37]:
train = train.withColumn('Mark', lit('train'))
test = (test.withColumn('Survived',lit(0))
                .withColumn('Mark', lit('test')))

In [39]:
## Append Test data to Train data
df = train.unionAll(test)

In [40]:
df.createOrReplaceTempView("train")

In [41]:
df = df.withColumn("AgeTmp", df["Age"].cast("float")) \
    .drop("Age") \
    .withColumnRenamed("AgeTmp", "Age")

In [42]:
def to_anytype(df, colnames, typename):
    for colname in colnames:
        df = df.withColumn("tmp", df[colname].cast(typename)) \
        .drop(colname) \
        .withColumnRenamed("tmp", colname)
    return(df)

In [43]:
intCols = ['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Survived']
floatCols = ['Age', 'Fare']

df = to_anytype(df, intCols, "integer")
df = to_anytype(df, floatCols, "float")

In [44]:


age_hist = spark.sql(
    "SELECT Age AS age, \
            count(*) AS count \
    FROM train \
    GROUP BY Age \
    ORDER BY Age")
age_hist.show(n=age_hist.count())



+----+-----+
| age|count|
+----+-----+
|null|  177|
|   0|  283|
|0.42|    1|
|0.67|    1|
|0.75|    2|
|0.83|    2|
|0.92|    1|
|   1|  117|
|  10|    2|
|  11|    4|
|  12|    1|
|  13|    2|
|  14|    6|
|14.5|    1|
|  15|    5|
|  16|   17|
|  17|   13|
|  18|   26|
|  19|   25|
|   2|   24|
|  20|   15|
|20.5|    1|
|  21|   24|
|  22|   27|
|  23|   15|
|23.5|    1|
|  24|   30|
|24.5|    1|
|  25|   23|
|  26|   18|
|  27|   18|
|  28|   25|
|28.5|    2|
|  29|   20|
|   3|   10|
|  30|   25|
|30.5|    2|
|  31|   17|
|  32|   18|
|32.5|    2|
|  33|   15|
|  34|   15|
|34.5|    1|
|  35|   18|
|  36|   22|
|36.5|    1|
|  37|    6|
|  38|   11|
|  39|   14|
|   4|   14|
|  40|   13|
|40.5|    2|
|  41|    6|
|  42|   13|
|  43|    5|
|  44|    9|
|  45|   12|
|45.5|    2|
|  46|    3|
|  47|    9|
|  48|    9|
|  49|    6|
|   5|    5|
|  50|   10|
|  51|    7|
|  52|    6|
|  53|    1|
|  54|    8|
|  55|    2|
|55.5|    1|
|  56|    4|
|  57|    2|
|  58|    5|
|  59|    2|

In [45]:
age_hist = spark.sql(
    "SELECT bucket_floor, \
        CONCAT(bucket_floor, ' to ', bucket_ceiling) as bucket_name, \
        count(*) as count \
     FROM ( \
        SELECT floor(Age/5.00)*5 as bucket_floor, \
            floor(Age/5.00)*5 + 5 as bucket_ceiling \
        FROM train \
     ) a \
     GROUP BY 1, 2 \
     ORDER BY 1")

age_hist.show(n=age_hist.count())

+------------+-----------+-----+
|bucket_floor|bucket_name|count|
+------------+-----------+-----+
|        null|       null|  177|
|           0|     0 to 5|  455|
|           5|    5 to 10|   25|
|          10|   10 to 15|   16|
|          15|   15 to 20|   86|
|          20|   20 to 25|  114|
|          25|   25 to 30|  106|
|          30|   30 to 35|   95|
|          35|   35 to 40|   72|
|          40|   40 to 45|   48|
|          45|   45 to 50|   41|
|          50|   50 to 55|   32|
|          55|   55 to 60|   16|
|          60|   60 to 65|   15|
|          65|   65 to 70|    4|
|          70|   70 to 75|    6|
|          80|   80 to 85|    1|
+------------+-----------+-----+



In [47]:
numVars = ['Survived','Age','SibSp','Parch','Fare']
stringVars = ['Cabin', 'Embarked', 'Pclass', 'Sex']

def countNull(df,var):
    return df.where(df[var].isNull()).count()

def countEmptyString(df,var):
    return df[df[var].isin("")].count()

def countZero(df,var):
    return df[df[var].isin(0)].count()

In [48]:


missing = {var: countNull(df,var) for var in df.columns}
missing



{'Age': 177,
 'Cabin': 687,
 'Embarked': 2,
 'Fare': 418,
 'Mark': 0,
 'Name': 0,
 'Parch': 122,
 'PassengerId': 0,
 'Pclass': 418,
 'Sex': 86,
 'SibSp': 0,
 'Survived': 0,
 'Ticket': 1}

In [49]:


missing = {var: countEmptyString(df, var) for var in df.columns}
missing



{'Age': 0,
 'Cabin': 0,
 'Embarked': 0,
 'Fare': 0,
 'Mark': 0,
 'Name': 0,
 'Parch': 0,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 0,
 'Survived': 0,
 'Ticket': 0}

In [50]:
missing = {var: countZero(df, var) for var in df.columns}
missing

{'Age': 283,
 'Cabin': 0,
 'Embarked': 418,
 'Fare': 15,
 'Mark': 0,
 'Name': 0,
 'Parch': 678,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 932,
 'Survived': 549,
 'Ticket': 2}

In [51]:
age_mean = df.groupBy().mean('Age').first()
age_mean

Row(avg(Age)=18.897676678433644)

In [53]:

age_mean = df.groupBy().mean('Age').first()[0]
fare_mean = df.groupBy().mean('Fare').first()[0]
age_mean, fare_mean



(18.897676678433644, 32.20420804114722)

In [54]:
df = df.na.fill({'Age':age_mean,'Fare':fare_mean})

In [55]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
 
## create user defined function to extract title
getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
df = df.withColumn('Title', getTitle(df['Name']))
 
df.select('Name','Title').show(5)

+--------------------+---------------+
|                Name|          Title|
+--------------------+---------------+
|Braund, Mr. Owen ...|     Braund, Mr|
|Cumings, Mrs. Joh...|   Cumings, Mrs|
|Heikkinen, Miss. ...|Heikkinen, Miss|
|Futrelle, Mrs. Ja...|  Futrelle, Mrs|
|Allen, Mr. Willia...|      Allen, Mr|
+--------------------+---------------+
only showing top 5 rows



In [56]:
getTitle = udf(lambda name: name.split('.')[0].split(',')[1].strip(), StringType())
df = df.withColumn('Title', getTitle(df['Name']))
 
df.select('Name','Title').show(5)

+--------------------+-----+
|                Name|Title|
+--------------------+-----+
|Braund, Mr. Owen ...|   Mr|
|Cumings, Mrs. Joh...|  Mrs|
|Heikkinen, Miss. ...| Miss|
|Futrelle, Mrs. Ja...|  Mrs|
|Allen, Mr. Willia...|   Mr|
+--------------------+-----+
only showing top 5 rows



In [58]:
from pyspark.sql import DataFrameNaFunctions
from pyspark.sql.functions import lit # Create columns of *literal* value
from pyspark.sql.functions import col # Returns a Column based on the 
                                      # given column name
from pyspark.ml.feature import StringIndexer #label encoding
from pyspark.ml import Pipeline


In [59]:
catVars = ['Pclass','Sex','Embarked','Title']
 
## index Sex variable
si = StringIndexer(inputCol = 'Sex', outputCol = 'Sex_indexed')
df_indexed = si.fit(df).transform(df).drop('Sex').withColumnRenamed('Sex_indexed','Sex')

Py4JJavaError: An error occurred while calling o460.transform.
: java.lang.NullPointerException
	at org.apache.spark.sql.types.Metadata$.org$apache$spark$sql$types$Metadata$$hash(Metadata.scala:219)
	at org.apache.spark.sql.types.Metadata$$anonfun$org$apache$spark$sql$types$Metadata$$hash$2.apply(Metadata.scala:207)
	at org.apache.spark.sql.types.Metadata$$anonfun$org$apache$spark$sql$types$Metadata$$hash$2.apply(Metadata.scala:207)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.AbstractTraversable.map(Traversable.scala:104)
	at org.apache.spark.sql.types.Metadata$.org$apache$spark$sql$types$Metadata$$hash(Metadata.scala:207)
	at org.apache.spark.sql.types.Metadata$$anonfun$org$apache$spark$sql$types$Metadata$$hash$1.apply(Metadata.scala:204)
	at org.apache.spark.sql.types.Metadata$$anonfun$org$apache$spark$sql$types$Metadata$$hash$1.apply(Metadata.scala:204)
	at scala.collection.MapLike$MappedValues$$anonfun$foreach$3.apply(MapLike.scala:245)
	at scala.collection.MapLike$MappedValues$$anonfun$foreach$3.apply(MapLike.scala:245)
	at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
	at scala.collection.immutable.Map$Map3.foreach(Map.scala:161)
	at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
	at scala.collection.MapLike$MappedValues.foreach(MapLike.scala:245)
	at scala.util.hashing.MurmurHash3.unorderedHash(MurmurHash3.scala:91)
	at scala.util.hashing.MurmurHash3$.mapHash(MurmurHash3.scala:222)
	at scala.collection.GenMapLike$class.hashCode(GenMapLike.scala:35)
	at scala.collection.AbstractMap.hashCode(Map.scala:59)
	at scala.runtime.ScalaRunTime$.hash(ScalaRunTime.scala:206)
	at org.apache.spark.sql.types.Metadata$.org$apache$spark$sql$types$Metadata$$hash(Metadata.scala:204)
	at org.apache.spark.sql.types.Metadata$$anonfun$org$apache$spark$sql$types$Metadata$$hash$1.apply(Metadata.scala:204)
	at org.apache.spark.sql.types.Metadata$$anonfun$org$apache$spark$sql$types$Metadata$$hash$1.apply(Metadata.scala:204)
	at scala.collection.MapLike$MappedValues$$anonfun$foreach$3.apply(MapLike.scala:245)
	at scala.collection.MapLike$MappedValues$$anonfun$foreach$3.apply(MapLike.scala:245)
	at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
	at scala.collection.immutable.Map$Map1.foreach(Map.scala:116)
	at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
	at scala.collection.MapLike$MappedValues.foreach(MapLike.scala:245)
	at scala.util.hashing.MurmurHash3.unorderedHash(MurmurHash3.scala:91)
	at scala.util.hashing.MurmurHash3$.mapHash(MurmurHash3.scala:222)
	at scala.collection.GenMapLike$class.hashCode(GenMapLike.scala:35)
	at scala.collection.AbstractMap.hashCode(Map.scala:59)
	at scala.runtime.ScalaRunTime$.hash(ScalaRunTime.scala:206)
	at org.apache.spark.sql.types.Metadata$.org$apache$spark$sql$types$Metadata$$hash(Metadata.scala:204)
	at org.apache.spark.sql.types.Metadata._hashCode$lzycompute(Metadata.scala:107)
	at org.apache.spark.sql.types.Metadata._hashCode(Metadata.scala:107)
	at org.apache.spark.sql.types.Metadata.hashCode(Metadata.scala:108)
	at org.apache.spark.sql.catalyst.expressions.AttributeReference.hashCode(namedExpressions.scala:249)
	at scala.runtime.ScalaRunTime$.hash(ScalaRunTime.scala:206)
	at scala.collection.immutable.HashSet.elemHashCode(HashSet.scala:177)
	at scala.collection.immutable.HashSet.computeHash(HashSet.scala:186)
	at scala.collection.immutable.HashSet.$plus(HashSet.scala:84)
	at scala.collection.immutable.HashSet.$plus(HashSet.scala:35)
	at scala.collection.mutable.SetBuilder.$plus$eq(SetBuilder.scala:22)
	at scala.collection.mutable.SetBuilder.$plus$eq(SetBuilder.scala:20)
	at scala.collection.generic.Growable$class.loop$1(Growable.scala:53)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:57)
	at scala.collection.mutable.SetBuilder.$plus$plus$eq(SetBuilder.scala:20)
	at scala.collection.TraversableLike$class.to(TraversableLike.scala:590)
	at scala.collection.AbstractTraversable.to(Traversable.scala:104)
	at scala.collection.TraversableOnce$class.toSet(TraversableOnce.scala:304)
	at scala.collection.AbstractTraversable.toSet(Traversable.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.containsChild$lzycompute(TreeNode.scala:89)
	at org.apache.spark.sql.catalyst.trees.TreeNode.containsChild(TreeNode.scala:89)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5$$anonfun$apply$11.apply(TreeNode.scala:359)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.immutable.List.map(List.scala:285)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:358)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:295)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionDown$1(QueryPlan.scala:248)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:258)
	at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$6.apply(QueryPlan.scala:267)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsDown(QueryPlan.scala:267)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressions(QueryPlan.scala:236)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer$$anonfun$apply$32.applyOrElse(Analyzer.scala:2027)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer$$anonfun$apply$32.applyOrElse(Analyzer.scala:2023)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:61)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:61)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:60)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer$.apply(Analyzer.scala:2023)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer$.apply(Analyzer.scala:2022)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
	at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.resolveAndBind(ExpressionEncoder.scala:258)
	at org.apache.spark.sql.Dataset.<init>(Dataset.scala:209)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:2822)
	at org.apache.spark.sql.Dataset.select(Dataset.scala:1121)
	at org.apache.spark.ml.feature.StringIndexerModel.transform(StringIndexer.scala:185)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [30]:
from tensorflow.contrib import learn

In [31]:
import random

In [None]:
classifier = learn.LinearClassifier(n_classes=2, 
feature_columns=learn.infer_real_valued_columns_from_input(X_train), optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))