<a href="https://colab.research.google.com/github/pratyushgoyal2704/Data-Science-CB-OL-/blob/master/TensorFlow_with_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tensorflow with GPU

This notebook provides an introduction to computing on a [GPU](https://cloud.google.com/gpu) in Colab. In this notebook you will connect to a GPU, and then run some basic TensorFlow operations on both the CPU and a GPU, observing the speedup provided by using the GPU.


In [6]:
import numpy as np
import pandas as pd

import nltk
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.utils import class_weight
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
import tensorflow_hub as hub

from keras import backend as K
from keras.engine import Layer
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, LSTM, GRU, LeakyReLU, Dropout
from keras.layers import CuDNNLSTM, CuDNNGRU, Embedding, Bidirectional
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

import matplotlib.pyplot as plt

In [7]:
# df = pd.read_csv('C:/Users/pratyush/capstone/p_train.csv', low_memory=False)
from google.colab import files
uploaded = files.upload()

Saving p_train.csv to p_train.csv


In [8]:
import io
df = pd.read_csv(io.BytesIO(uploaded['p_train.csv']))

In [9]:
labels = df[['id', 'sentiment']]

In [10]:
classes = sorted(labels.sentiment.unique())

In [11]:
df.drop(['n', 'sentiment'], axis=1, inplace=True)

In [12]:
label_to_cat = dict()
for i in range(len(classes)):
    dummy = np.zeros((len(classes),), dtype='int8')
    dummy[i] = 1
    label_to_cat[classes[i]] = dummy

In [13]:
cat_to_label = dict()
for k, v in label_to_cat.items():
    cat_to_label[tuple(v)] = k

In [14]:
y = np.array([label_to_cat[label] for label in labels.sentiment])

In [15]:
df.response = df.response.apply(str.lower)

In [16]:
def tokenize(df):
    df['tokens'] = df['response'].map(lambda x: nltk.word_tokenize(x))

In [19]:
tokenize(df)

In [18]:
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
df_train, df_val, y_train, y_val = train_test_split(df, y, test_size=0.15, random_state=42)

In [21]:
t = Tokenizer()
t.fit_on_texts(df_train.response)
vocab_size = len(t.word_index) + 1

In [22]:
vocab_size

19142

In [23]:
encoded_train_set = t.texts_to_sequences(df_train.response)
len(encoded_train_set)

46172

In [24]:
df_train['tokens'] = encoded_train_set
df_train.drop(['response'], axis=1, inplace=True)
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,id,period,tokens
26164,33229,24h,"[1, 1087, 9, 154, 12, 2, 272, 39]"
39039,34026,24h,"[2, 39, 78, 108, 5, 235, 2887, 1515]"
43093,35623,24h,"[1, 251, 13, 1, 7, 28, 11, 397, 8, 297, 64, 56..."
3161,58661,3m,"[1, 636, 3, 27, 1538, 11303, 13, 133, 219, 10,..."
29640,59683,3m,"[1, 65, 33, 8604, 3849, 11, 1088]"


In [25]:
SEQ_LEN = 80
padded_train = pad_sequences(encoded_train_set, maxlen=SEQ_LEN, padding='post')

In [26]:
train_docs = [list(doc) for doc in padded_train]
df_train['tokens'] = train_docs
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,period,tokens
26164,33229,24h,"[1, 1087, 9, 154, 12, 2, 272, 39, 0, 0, 0, 0, ..."
39039,34026,24h,"[2, 39, 78, 108, 5, 235, 2887, 1515, 0, 0, 0, ..."
43093,35623,24h,"[1, 251, 13, 1, 7, 28, 11, 397, 8, 297, 64, 56..."
3161,58661,3m,"[1, 636, 3, 27, 1538, 11303, 13, 133, 219, 10,..."
29640,59683,3m,"[1, 65, 33, 8604, 3849, 11, 1088, 0, 0, 0, 0, ..."


In [29]:
# embedding_index = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/pratyush/capstone/glove.6B.300d.txt.word2vec', binary=False)

In [27]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth 
from oauth2client.client import GoogleCredentials

In [28]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [29]:
downloaded = drive.CreateFile({'id':"1IdCkfTI8QenwqkWivzG_wtTRX6kHm8zi"})   # replace the id with id of file you want to access
downloaded.GetContentFile('glove.6B.300d.txt.word2vec')

In [30]:
embedding_index = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.300d.txt.word2vec', binary=False) 

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
embedding_matrix = np.zeros((vocab_size, 300))
count = 0

for word, i in t.word_index.items():
    try:
        embedding_vector = embedding_index[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        count += 1
        
count

1877

In [36]:
def recall(y_true, y_pred):
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    _recall = true_pos / (possible_pos + K.epsilon())
    return _recall
    
def precision(y_true, y_pred):
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    _precision = true_pos / (predicted_pos + K.epsilon())
    return _precision

def f1(y_true, y_pred):    
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

Num GPUs Available:  1


In [37]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x = Dense(64, activation='relu')(x) 
output_tensor = Dense(7, activation='softmax')(x)
model = Model(input_tensor, output_tensor)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [38]:
model.compile(optimizer=Adam(lr=1e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy', f1])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 80, 300)           5742600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 80, 256)           440320    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               164864    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 455       
Total params: 6,356,495
Trainable params: 613,895
Non-trainable params: 5,742,600
___________________________________________

In [39]:
x_train = np.array([np.array(token) for token in df_train.tokens])
x_train.shape

(46172, 80)

In [40]:

model.fit(x_train, y_train, epochs=10, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fcb804cd4e0>

In [41]:
encoded_val_set = t.texts_to_sequences(df_val.response)
len(encoded_val_set)

8149

In [42]:
df_val['tokens'] = encoded_val_set
padded_val = pad_sequences(encoded_val_set, maxlen=SEQ_LEN, padding='post')
val_vectors = [list(doc) for doc in padded_val]
df_val.tokens = val_vectors
df_val.head()

Unnamed: 0,id,period,response,tokens
3419,37301,24h,my son got to play in a big soccer game after ...,"[2, 73, 17, 5, 190, 9, 3, 173, 737, 107, 43, 1..."
18762,85992,3m,i got to go to sleep early yesterday.,"[1, 17, 5, 82, 5, 270, 262, 68, 0, 0, 0, 0, 0,..."
16492,50424,24h,i saw my dad were waiting for me with a glass ...,"[1, 131, 2, 281, 87, 344, 8, 10, 12, 3, 1414, ..."
21122,45067,24h,i found a really good deal at the grocery outl...,"[1, 65, 3, 46, 40, 509, 20, 6, 569, 5654, 254,..."
32679,56965,24h,"today i saw a tv show about the band kiss, i w...","[51, 1, 131, 3, 301, 237, 54, 6, 718, 892, 1, ..."


In [43]:
x_val = np.array([np.array(token) for token in df_val.tokens])
print(x_val.shape, y_val.shape) 

(8149, 80) (8149, 7)


In [44]:
score = model.evaluate(x_val, y_val, verbose=1)
score



[0.53795284054399, 0.8905386924743652, 0.8904886245727539]

In [1]:
tensorflow_version 1.x

TensorFlow 1.x selected.


In [5]:
pip show tensorflow

Name: tensorflow
Version: 1.15.2
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /tensorflow-1.15.2/python3.6
Requires: termcolor, gast, astor, tensorflow-estimator, absl-py, grpcio, keras-preprocessing, google-pasta, protobuf, opt-einsum, keras-applications, tensorboard, numpy, six, wheel, wrapt
Required-by: stable-baselines, magenta, fancyimpute
