# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint


### Objective

- To understand several techniques in Text representation

In [None]:
#@title Experiment Walkthrough Video
from IPython.display import HTML

HTML("""<video width="900" height="400" controls>
  <source src="https://cdn.exec.talentsprint.com/content/Text_representation.mp4" type="video/mp4">
</video>
""")

### Dataset
   Here we will be using the Movies_review data, which contains 50000 reviews. The training and testing data are split evenly: 25k reviews under reviews_train and 25k under reviews_test.
In each file, the first 12500 reviews are positive and the remaining 12500 are negative.



### Setup Steps:

In [None]:
#@title Please enter your registration id to start: { run: "auto", display-mode: "form" }
Id = "" #@param {type:"string"}

In [None]:
#@title Please enter your password (normally your phone number) to continue: { run: "auto", display-mode: "form" }
password = "" #@param {type:"string"}

In [None]:
#@title Run this cell to complete the setup for this Notebook
from IPython import get_ipython
import re
ipython = get_ipython()

notebook= "U1W4_17_Text_representation_using_Sckit_learn_B" #name of the notebook

def setup():
    """Download the movie-review archive and record the session.

    Runs the IPython ``sx`` line magic to fetch ``movie_data.tar.gz`` with
    ``wget``, then displays a ``<script>`` tag whose URL carries the value
    returned by ``getId()`` and the module-level ``submission_id``.
    """
    from IPython.display import HTML, display

    # run_line_magic() is the supported replacement for the deprecated
    # InteractiveShell.magic("name args") call.
    ipython.run_line_magic(
        "sx", "wget -qq https://cdn.talentsprint.com/aiml/movie_data.tar.gz"
    )
    display(HTML('<script src="https://dashboard.talentsprint.com/aiml/record_ip.html?traineeId={0}&recordId={1}"></script>'.format(getId(),submission_id)))
    print("Setup completed successfully")

def submit_notebook():
    """Register a notebook attempt, or submit the answers for grading.

    While the module-level ``submission_id`` is falsy, the registration id
    and password are posted and the ``record_id`` from the response is
    returned. Once ``submission_id`` is set and every feedback getter returns
    a truthy value, the exported notebook and the answers are posted instead.

    Returns:
        The record/submission id on success. ``None`` when the response
        contains an error, the response is not recognised, or a feedback
        answer is still missing.
    """
    # run_line_magic() is the supported replacement for the deprecated
    # InteractiveShell.magic("name args") call.
    ipython.run_line_magic("notebook", "-e " + notebook + ".ipynb")

    import base64
    import json
    import requests

    url = "https://dashboard.talentsprint.com/xp/app/save_notebook_attempts"

    if not submission_id:
        data = {"id" : getId(), "notebook" : notebook, "mobile" : getPassword()}
        r = json.loads(requests.post(url, data = data).text)

        # .get() because an error response may carry "err" without "status".
        if r.get("status") == "Success":
            return r["record_id"]
        if "err" in r:
            print(r["err"])
        else:
            print ("Something is wrong, the notebook will not be submitted for grading")
        return None

    all_answered = (getAnswer() and getComplexity() and getAdditional()
                    and getConcepts() and getWalkthrough() and getComments()
                    and getMentorSupport())
    if not all_answered:
        # The first getter that failed has already printed which answer is
        # missing; nothing is submitted.
        return None

    # Context manager so the notebook file is closed after it is read.
    with open(notebook + ".ipynb", "rb") as f:
        file_hash = base64.b64encode(f.read())

    data = {"complexity" : Complexity, "additional" :Additional,
            "concepts" : Concepts, "record_id" : submission_id,
            "answer" : Answer, "id" : Id, "file_hash" : file_hash,
            "notebook" : notebook, "feedback_walkthrough":Walkthrough ,
            "feedback_experiments_input" : Comments,
            "feedback_inclass_mentor": Mentor_support}

    r = json.loads(requests.post(url, data = data).text)
    if "err" in r:
        print(r["err"])
        return None

    print("Your submission is successful.")
    print("Ref Id:", submission_id)
    print("Date of submission: ", r["date"])
    print("Time of submission: ", r["time"])
    print("View your submissions: https://learn-iiith.talentsprint.com/notebook_submissions")
    return submission_id


def getAdditional():
  """Return the global ``Additional`` answer, or None (with a prompt) if unset or empty."""
  try:
    answer = Additional
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Additional Question")
  return None

def getComplexity():
  """Return the global ``Complexity`` answer, or None (with a prompt) if unset or empty."""
  try:
    answer = Complexity
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Complexity Question")
  return None

def getConcepts():
  """Return the global ``Concepts`` answer, or None (with a prompt) if unset or empty."""
  try:
    answer = Concepts
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Concepts Question")
  return None


def getWalkthrough():
  """Return the global ``Walkthrough`` answer, or None (with a prompt) if unset or empty."""
  try:
    answer = Walkthrough
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Walkthrough Question")
  return None

def getComments():
  """Return the global ``Comments`` answer, or None (with a prompt) if unset or empty."""
  try:
    answer = Comments
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Comments Question")
  return None


def getMentorSupport():
  """Return the global ``Mentor_support`` answer, or None (with a prompt) if unset or empty."""
  try:
    answer = Mentor_support
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Mentor support Question")
  return None

def getAnswer():
  """Return the global ``Answer`` value, or None (with a prompt) if unset or empty."""
  try:
    answer = Answer
  except NameError:
    answer = None
  if answer:
    return answer
  print ("Please answer Question")
  return None


def getId():
  """Return the global ``Id`` when it is defined and truthy, otherwise None."""
  try:
    registration_id = Id
  except NameError:
    return None
  return registration_id or None

def getPassword():
  """Return the global ``password`` when it is defined and truthy, otherwise None."""
  try:
    secret = password
  except NameError:
    return None
  return secret or None

# Reset on every run of this cell so submit_notebook() takes its
# registration branch (the one guarded by `if not submission_id`).
submission_id = None
### Setup
# Only attempt registration when both the Id and password cells hold values.
if getPassword() and getId():
  submission_id = submit_notebook()
  # A truthy id means registration succeeded; only then run setup().
  if submission_id:
    setup()
else:
  print ("Please complete Id and Password cells before running setup")



### Extract data

In [None]:
# Extract the files from the downloaded archive
import tarfile

# The context manager closes the archive even if extraction raises.
with tarfile.open("movie_data.tar.gz") as my_tar:  # Open the tarfile
    my_tar.extractall("/content/")  # Extract all members under /content/

### Importing required packages


In [None]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def read_reviews(path):
    """Return every line of the text file at *path*, stripped of surrounding whitespace.

    The file is opened in a ``with`` block so it is closed once it has been read.
    """
    with open(path, "r") as review_file:
        return [line.strip() for line in review_file]


# Read each line of the train and test files into a list
reviews_train = read_reviews("/content/movie_data/full_train.txt")
reviews_test = read_reviews("/content/movie_data/full_test.txt")

In [None]:
# Read the 20000th review from train text file
reviews_train[19999]

In [None]:
# Raw strings keep regex escapes such as \d and \s from being read as
# (invalid) Python string escapes.
# Punctuation characters and runs of digits: removed from the text.
Replace_without_space = re.compile(r"""[.;:!'?,"()\[\]]|\d+""")
# A doubled HTML line break, hyphens and slashes: replaced by a single space.
Replace_with_space = re.compile(r"<br\s*/><br\s*/>|[-/]")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case and clean each review.

    Each review is lower-cased, then everything matched by
    ``Replace_without_space`` is deleted, then everything matched by
    ``Replace_with_space`` is replaced by a single space.

    Args:
        reviews: iterable of review strings.

    Returns:
        A numpy array holding the cleaned reviews in the input order.
    """
    cleaned = [
        Replace_with_space.sub(
            SPACE, Replace_without_space.sub(NO_SPACE, line.lower())
        )
        for line in reviews
    ]
    return np.array(cleaned)

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [None]:
# Verify the 20000th review from train text file
reviews_train_clean[19999]

Give labels for the movie reviews, where first 12500 reviews are positive and remaining 12500 are negative reviews.

In [None]:
# Label vector: 12500 ones (positive reviews) followed by 12500 zeros (negative reviews)
target = np.repeat([1, 0], 12500)
print(target.shape, target[345], target[20000])

### CountVectorizer


Using n-grams, get sequences of consecutive words from the given text, and use CountVectorizer to build the feature vectors from them.

In [None]:
"""To get binary values (1 for present or 0 for absent) instead of counts of terms/tokens, give binary=True.
N-Gram range basically lets you decide the length of the sequence of consecutive words in the given text. Suppose the n-gram range = (1, 3).
Then it will pick the unigram(only single word), bigram (group of 2 consecutive words), and the trigram (group of 3 consecutive words)."""

# ngram_range=(1, 2) selects unigrams and bigrams; binary=True records presence/absence instead of counts
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)                         # Tokenize and build vocab
train_vec = # YOUR CODE HERE: To transform and get feature vector for train data
test_vec = # YOUR CODE HERE: To transform and get feature vector for test data

#### Split the review_train data into train and test sets

Hint: Refer to [Train-Test split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:
# Split the train and test sets
X_train,X_test, y_train,y_test = # YOUR CODE HERE: To split the train and test data with 75-25%

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Apply the Decision Tree Classifier to the split review_train data
Note: The code cell below takes some time to run

In [None]:
# Create an object for the DecisionTreeClassifier
decisiontree = DecisionTreeClassifier()

# Fit the model on the X_train / y_train split
decisiontree.fit(X_train,y_train)

# Predict labels for the X_test split
predict = decisiontree.predict(X_test)

# Calculate the accuracy of those predictions against y_test
accuracy_score(y_test, predict)


In [None]:
# Use the trained model to get the predictions on the review_test data
predict = decisiontree.predict(test_vec)
accuracy_score(target, predict)

### TF IDF
 tf-idf aims to represent the number of times a given word appears in a document (a movie review in our case) relative to the number of documents in the corpus that the word appears in — where words that appear in many documents have a value closer to zero and words that appear in fewer documents have values closer to 1.

We have seen how to get the consecutive words using n-grams, similarly you can try without using n-grams


In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X_train_tfidf = # YOUR CODE HERE: To transform and get feature vector for train data
X_test_tfidf = # YOUR CODE HERE: To transform and get feature vector for test data

#### Split the review_train data into train and test sets

Hint: Refer to [Train-Test split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:
# Split the train and test sets
X1_train, X1_test, y1_train, y1_test = # YOUR CODE HERE: To split the train and test data with 75-25%


#### Apply the Decision Tree Classifier
Note: The code cell below takes some time to run

In [None]:
# Create an object of DecisionTreeClassifier
decisiontree = DecisionTreeClassifier()

# Fit the model on the X1_train / y1_train split
decisiontree.fit(X1_train,y1_train)

# Predict labels for the X1_test split
predict = decisiontree.predict(X1_test)

# Calculate the accuracy of those predictions against y1_test
accuracy_score(y1_test, predict)


In [None]:
# Use the trained model to get the predictions on the review_test data
predict = decisiontree.predict(X_test_tfidf)
accuracy_score(target, predict)

### Please answer the questions below to complete the experiment:




In [None]:
#@title Using N-grams for: Sentence representation, Word prediction (auto-complete), Ambiguity resolution (Speech recognition, OCR), Machine Translation (choosing one sentence over another). { run: "auto", form-width: "500px", display-mode: "form" }
Answer = "" #@param ["","TRUE", "FALSE"]


In [None]:
#@title How was the experiment? { run: "auto", form-width: "500px", display-mode: "form" }
Complexity = "" #@param ["","Too Simple, I am wasting time", "Good, But Not Challenging for me", "Good and Challenging for me", "Was Tough, but I did it", "Too Difficult for me"]


In [None]:
#@title If it was too easy, what more would you have liked to be added? If it was very difficult, what would you have liked to have been removed? { run: "auto", display-mode: "form" }
Additional = "" #@param {type:"string"}


In [None]:
#@title Can you identify the concepts from the lecture which this experiment covered? { run: "auto", vertical-output: true, display-mode: "form" }
Concepts = "" #@param ["","Yes", "No"]


In [None]:
#@title  Experiment walkthrough video? { run: "auto", vertical-output: true, display-mode: "form" }
Walkthrough = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title  Text and image description/explanation and code comments within the experiment: { run: "auto", vertical-output: true, display-mode: "form" }
Comments = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title Mentor Support: { run: "auto", vertical-output: true, display-mode: "form" }
Mentor_support = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title Run this cell to submit your notebook for grading { vertical-output: true }
# A NameError here means submission_id or submit_notebook is not defined.
try:
  if not submission_id:
      print("Please complete the setup first.")
  else:
      return_id = submit_notebook()
      # Keep the previous id unless the submission returned a new truthy one.
      if return_id:
          submission_id = return_id
except NameError:
  print ("Please complete the setup first.")