# Regression and Classification Algorithms

Dataset Source - 1. Provided in this course by Coursera (For regression, classification and gradient descent without                                 Tensorflow)
 
              2. Link - https://www.tensorflow.org/datasets/catalog/imdb_reviews (Classification using Tensorflow)

Dataset Format - JSON, TFRECORD

Analysis by - RAITI LIKHIT ADARSH

Comments - 1. Predicting labels by deploying ML Algorithms by implementing both traditional as well as libraries

           2. Text Classification - To predict whether a movie has positive or negative reviews (1 & 0 respectively) by it's                   user reviews

# Regression

## 1. Linear Regression

In [1]:
import urllib.request # read and open URLs
from sklearn import linear_model
import numpy

In [2]:
def parseData(filename):
  for line in urllib.request.urlopen(filename):
    yield eval(line)

# Processing the data
print ("Reading data...")
data = list(parseData("file:/C:/Users/KIIT/Desktop/AI & Data Science/Coursera Data Science 2/Final_Course2/datasets/business.json"))
print ("Done!")

# To see how our features look like
data[0]

Reading data...
Done!


{'business_id': '1SWheh84yJXfytovILXOAQ',
 'name': 'Arizona Biltmore Golf Club',
 'address': '2818 E Camino Acequia Drive',
 'city': 'Phoenix',
 'state': 'AZ',
 'postal_code': '85016',
 'latitude': 33.5221425,
 'longitude': -112.0184807,
 'stars': 3.0,
 'review_count': 5,
 'is_open': 0,
 'attributes': {'GoodForKids': 'False'},
 'categories': 'Golf, Active Life',
 'hours': ''}

In [3]:
# creating a feature vector
def feature(datum):
    feat = [1, datum['stars'], datum['review_count']]
    return feat

In [4]:
X = [feature(d) for d in data]
y = [d['is_open'] for d in data]

# printing first 10 rows of labels and features
print("Label: ", y[:10], "\nFeatures:", X[:10])

Label:  [0, 1, 1, 1, 1, 1, 1, 1, 0, 1] 
Features: [[1, 3.0, 5], [1, 2.5, 128], [1, 4.0, 170], [1, 5.0, 3], [1, 4.0, 4], [1, 2.5, 3], [1, 3.5, 7], [1, 3.5, 3], [1, 5.0, 8], [1, 4.5, 8]]


In [5]:
# using numpy to compute the inverse of the matrix of features and multiply to label matrix to get the values of parameters

theta,residuals,rank,s = numpy.linalg.lstsq(X, y, rcond=None)
theta

array([7.45643273e-01, 1.97216675e-02, 1.16052841e-04])

## 2. Autoregression

In [6]:
# setting up a group of features with size of 24

def feature(data, index, windowSize):
    feat = [1]
    previousValues_stars = [d['stars'] for d in data[index - windowSize:index]]
    previousValues_review_count = [d['review_count'] for d in data[index - windowSize:index]]
    return feat + previousValues_stars + previousValues_review_count

windowSize = 24;      # Window = one day
N = len(data)      # defines limit of features

In [7]:
X1 = [feature(data, index, windowSize) for index in range(windowSize, N)]
y1 = [d['is_open'] for d in data[windowSize:]]

# checking how our features and labels look like.
print("Features: ", X1[0], "\nLabels: ", y1[0])

Features:  [1, 3.0, 2.5, 4.0, 5.0, 4.0, 2.5, 3.5, 3.5, 5.0, 4.5, 2.0, 3.0, 3.5, 4.0, 3.0, 4.0, 5.0, 4.0, 5.0, 4.0, 4.5, 3.5, 4.0, 4.0, 5, 128, 170, 3, 4, 3, 7, 3, 8, 8, 5, 18, 9, 16, 7, 4, 5, 40, 21, 23, 38, 3, 107, 35] 
Labels:  1


In [8]:
theta,residuals,rank,s = numpy.linalg.lstsq(X1, y1, rcond=None)
theta

array([ 9.65076608e-01, -5.87902764e-03, -1.06559261e-03,  1.08438561e-03,
       -2.69895991e-03,  6.89204031e-03,  5.61208730e-03, -4.65594763e-03,
        3.74048971e-03, -4.76569972e-03, -2.19269326e-03, -5.55376891e-04,
        2.60470267e-03,  2.26551687e-03, -5.08398160e-03, -5.76443526e-03,
        2.93587467e-03, -5.76692329e-03, -8.26645402e-03, -4.80394296e-03,
        1.22147445e-03, -5.70325858e-03,  3.72575297e-03, -5.98411304e-03,
       -8.83806763e-03,  3.15459134e-05,  1.71887675e-06,  2.93341974e-05,
       -4.26314962e-05,  2.78894595e-05,  6.53801408e-06,  1.73375147e-05,
        2.31356361e-05, -8.62298825e-06,  2.96233449e-05,  1.37953101e-05,
       -2.30419474e-05,  4.02874776e-06, -1.15050697e-05, -2.92215176e-05,
        4.38447794e-06,  1.35651053e-05,  2.20669073e-05,  2.03442396e-05,
        7.63518335e-05, -2.26207444e-05,  2.08075308e-05,  1.67948802e-05,
       -2.83917584e-05])

# Classification

## 1. Classification using sklearn module (Logistic Regression)

In [9]:
model = linear_model.LogisticRegression()

In [10]:
model.fit(X,y)

LogisticRegression()

In [11]:
predictions = model.predict(X) # at this stage predictions will be made on the basis of provided features
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [12]:
correctPredictions = predictions == y # here our predictions are verified with the actual labels
correctPredictions

array([False,  True,  True, ...,  True,  True, False])

In [13]:
sum(correctPredictions)/len(correctPredictions) # here we compute our efficiency, which is well over 83%

0.8204

## 2. Nearest neighbours (randomized dataset initialisation with training and testing sets)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
import random

In [15]:
def feature_nn(datum):
    feat = [1, datum['stars'], datum['review_count'], datum['is_open']] #keeping both features and labels together so that shuffling can be done conveniently
    return feat

dataset = [feature_nn(d) for d in data]
dataset[0]

[1, 3.0, 5, 0]

In [16]:
random.shuffle(dataset)

In [17]:
X_nn = [values[:-1] for values in dataset]
y_nn = [values[-1] for values in dataset]

In [18]:
# creating training and testing datasets with half of the total datasets
N = len(X_nn)
X_nn_train = X[:N//2]
X_nn_test = X[N//2:]
y_nn_train = y[:N//2]
y_nn_test = y[N//2:]

len(X_nn), len(X_nn_train), len(X_nn_test)

(10000, 5000, 5000)

In [19]:
neigh = KNeighborsClassifier(n_neighbors=19) # we can modify the n_neighbors values to get different accuracy. For this example, 19 suits the best
neigh.fit(X_nn_train, y_nn_train) 

KNeighborsClassifier(n_neighbors=19)

In [20]:
# Model's predictions
predictionsTrain_nn = neigh.predict(X_nn_train)
predictionsTest_nn = neigh.predict(X_nn_test)

# Whether model prediction was correct
correctPredictionsTrain_nn = predictionsTrain_nn == y_nn_train
correctPredictionsTest_nn = predictionsTest_nn == y_nn_test

print("Training Accuracy: ", sum(correctPredictionsTrain_nn) / len(correctPredictionsTrain_nn)) # Training accuracy
print("Testing Accuracy: ", sum(correctPredictionsTest_nn) / len(correctPredictionsTest_nn)) # Test accuracy

Training Accuracy:  0.8154
Testing Accuracy:  0.825


# Gradient Descent

## 1. without Tensorflow

In [21]:
K = len(X[0])
theta = [0.0]*K
theta[0] = sum(y)/len(y)
theta[0]

0.8204

In [22]:
#Defining the inner product of a vector
def inner(x,y):
    return sum([a*b for (a,b) in zip(x,y)])

#Defining the 2-norm of a vector
def norm(x):
    return sum([a*a for a in x])

In [23]:
def derivative(X, y, theta):
    dtheta = [0.0]*len(theta) # Initialize the derivative vector to be a vector of zeros
    K = len(theta)
    N = len(X)
    MSE = 0 # Compute the MSE as we go
    for i in range(N):
        error = inner(X[i],theta) - y[i]
        for k in range(K):
            dtheta[k] += 2*X[i][k]*error/N
        MSE += error*error/N
    return dtheta, MSE

In [24]:
learningRate = 0.01

In [25]:
while (True):
    dtheta,MSE = derivative(X, y, theta)
    m = norm(dtheta)
    print("norm(dtheta) = " + str(m) + " MSE = " + str(MSE))
    for k in range(K):
        theta[k] -= learningRate * dtheta[k]
    if m < 0.01 or str(m) == 'inf': break

norm(dtheta) = 7.2223843458204335 MSE = 0.14734383999997866
norm(dtheta) = 409184.07184693025 MSE = 8.706558902734299
norm(dtheta) = 23183086660.908123 MSE = 484947.12356932246
norm(dtheta) = 1313481008522558.8 MSE = 27475575316.451622
norm(dtheta) = 7.441771602650993e+19 MSE = 1556679958284007.8
norm(dtheta) = 4.2162744818300776e+24 MSE = 8.819660606287831e+19
norm(dtheta) = 2.3888089362753165e+29 MSE = 4.996943192861162e+24
norm(dtheta) = 1.3534242513432216e+34 MSE = 2.831111353069549e+29
norm(dtheta) = 7.668077493799638e+38 MSE = 1.6040189340014617e+34
norm(dtheta) = 4.3444923048009444e+43 MSE = 9.087868401381243e+38
norm(dtheta) = 2.4614531349920208e+48 MSE = 5.148901320933365e+43
norm(dtheta) = 1.394582176855688e+53 MSE = 2.9172060643703755e+48
norm(dtheta) = 7.90126539626263e+57 MSE = 1.6527974982548564e+53
norm(dtheta) = 4.476609259623338e+62 MSE = 9.364232453790814e+57
norm(dtheta) = 2.5363064595736454e+67 MSE = 5.305480528692776e+62
norm(dtheta) = 1.4369917238244556e+72 MSE = 

In [26]:
theta

[-2.184756638873396e+150, -8.133107410175173e+150, -8.094618232776827e+152]

## 2. with Tensorflow (Classification with Tensorflow)

In [27]:
!pip install -q tensorflow-hub
!pip install -q tensorflow-datasets

In [28]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import os

In [29]:
# Split the training set into 60% and 40%, so we'll end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews", 
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

In [30]:
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_examples_batch

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell 

In [31]:
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0], dtype=int64)>

#### Here the input to our model is a text. To process text is to convert it into an embedding vector as the first input layer to our model

In [32]:
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2" #this contains pre-trained texts as inputs for this model
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(train_examples_batch[:3])

<tf.Tensor: shape=(3, 50), dtype=float32, numpy=
array([[ 0.5423195 , -0.0119017 ,  0.06337538,  0.06862972, -0.16776837,
        -0.10581174,  0.16865303, -0.04998824, -0.31148055,  0.07910346,
         0.15442263,  0.01488662,  0.03930153,  0.19772711, -0.12215476,
        -0.04120981, -0.2704109 , -0.21922152,  0.26517662, -0.80739075,
         0.25833532, -0.3100421 ,  0.28683215,  0.1943387 , -0.29036492,
         0.03862849, -0.7844411 , -0.0479324 ,  0.4110299 , -0.36388892,
        -0.58034706,  0.30269456,  0.3630897 , -0.15227164, -0.44391504,
         0.19462997,  0.19528408,  0.05666234,  0.2890704 , -0.28468323,
        -0.00531206,  0.0571938 , -0.3201318 , -0.04418665, -0.08550783,
        -0.55847436, -0.23336391, -0.20782952, -0.03543064, -0.17533456],
       [ 0.56338924, -0.12339553, -0.10862679,  0.7753425 , -0.07667089,
        -0.15752277,  0.01872335, -0.08169781, -0.3521876 ,  0.4637341 ,
        -0.08492756,  0.07166859, -0.00670817,  0.12686075, -0.19326553,
 

In [33]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu')) # using rectified linear unit (RELU) as our activation function
model.add(tf.keras.layers.Dense(1))

model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 50)                48190600  
_________________________________________________________________
dense (Dense)                (None, 16)                816       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 48,191,433
Trainable params: 48,191,433
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [35]:
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=10,
                    validation_data=validation_data.batch(512), # there are 10 epochs with 512 samples tested for each epoch
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

49/49 - 2s - loss: 0.3553 - accuracy: 0.8546
loss: 0.355
accuracy: 0.855


#### hence we have achieved an accuracy of fairly 86% of predicting whether a movie is a hit or miss by training the data with all the texts provided by users and to use that pre-trained text to predict whether the next movie is a hit or not with the information we gathered so far.