# Text Data Processing

In this assignment we write the following two functions for text data processing:

<ul>
    <li>
        <b>preprocess():</b> Function that takes a pandas.Series() containing a corpus of text data as its argument. It should output an indexed vocabulary and the preprocessed tokens.
    </li>
    <li>
        <b>encode():</b> Function that takes two arguments: 1) a pandas.Series() (or the preprocessed tokens output by the preprocess() function), and 2) the name of an encoding method. The supported encoding methods must include Bag-of-Words, TF-IDF, and Word2Vec.
    </li>
</ul>

In [None]:
!pip install --upgrade pip
!pip install nltk
!pip install contractions
!pip install inflect
!pip install scikit-learn 
!pip install gensim
!pip uninstall -y tensorflow
!pip install torch
!pip install transformers

In [None]:
from platform import python_version

# Record which Python interpreter the kernel is running, for reproducibility.
kernel_python = python_version()
print(kernel_python)

In [None]:
from transformers import pipeline

# Identifier of the pretrained sentiment model to load
model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Build the sentiment-analysis pipeline once so that later cells can reuse it
sentiment_pipe = pipeline(task="sentiment-analysis", model=model_id)

# Smoke test: classify one short sentence and show the raw prediction
smoke_test_prediction = sentiment_pipe('I hate you')
print(smoke_test_prediction)

In [None]:
import pandas as pd
import numpy as np
import sklearn
from IPython.display import display, HTML

# Display Properties
# (`display` and `HTML` are already imported at the top of this cell, so the
# duplicate import that used to sit here has been removed.)
pd.set_option('display.max_rows', None)       # no limit on rendered rows
pd.set_option('display.max_columns', None)    # no limit on rendered columns
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
# NOTE(review): the float_format set below formats floats with 3 decimals and
# appears to take precedence over display.precision=2 -- confirm which is intended.
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
import nltk
import string
import re
import inflect
import contractions
from data_pipeline import Text_Pipeline

# Download the NLTK resources requested by this notebook, in a fixed order
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Create the pipeline object whose preprocess()/encode() methods are used in
# the cells below.
# NOTE(review): the meaning of the 'CONVERT' argument is defined in
# data_pipeline.Text_Pipeline -- confirm against that module.
text_pipeline = Text_Pipeline('CONVERT')

In [None]:
import pandas as pd 

# Small sample corpus: one document per string
CORPUS = [
    "The quick brown fox jumps over the lazy dog",
    "A king's strength also includes his allies",
    "History is written by the victors",
    "An apple a day keeps the doctor away",
    "Nothing happens until something moves",
    "The 10,000,303 striped bats    aren't hanging on their feet for best."
    ]

# Wrap the corpus in a pandas Series, the input type passed to preprocess()
corpus_series = pd.Series(CORPUS)

# Run the pipeline's preprocessing step and show what it returns
preprocessed_series = text_pipeline.preprocess(corpus_series)
print(preprocessed_series)

In [None]:
import pandas as pd

# Encode the preprocessed documents with the Bag-of-Words method
bow_matrix, bow_columns = text_pipeline.encode(preprocessed_series, 'BOW')

# Convert the encoded matrix to a dense array and label it: one row per
# preprocessed document, one column per name returned by encode().
# NOTE(review): .toarray() suggests a SciPy sparse matrix -- confirm.
bow_frame = pd.DataFrame(
    bow_matrix.toarray(),
    index=preprocessed_series.values,
    columns=bow_columns,
)

bow_frame.head()

In [None]:
# Encode the preprocessed documents with the TF-IDF method
tfidf_matrix, tfidf_columns = text_pipeline.encode(preprocessed_series, 'TFIDF')

# Convert the encoded matrix to a dense array and label it: one row per
# preprocessed document, one column per name returned by encode().
# NOTE(review): .toarray() suggests a SciPy sparse matrix -- confirm.
tfidf_frame = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=preprocessed_series.values,
    columns=tfidf_columns,
)

tfidf_frame.head()

In [None]:
# Encode the preprocessed documents with the Word-to-Vector method.
# Unlike the other two methods, this call returns a single object.
# NOTE(review): the .vectors / .key_to_index attributes look like gensim
# KeyedVectors -- confirm against Text_Pipeline.encode.
word_vectors = text_pipeline.encode(preprocessed_series, 'WordToVec')

# One row per vocabulary key, holding that key's embedding vector
w2v_frame = pd.DataFrame(
    word_vectors.vectors,
    index=word_vectors.key_to_index.keys(),
)

w2v_frame.head()

We will now run sentiment analysis on sample text using a pretrained Large Language Model.

In [None]:
def analyze_sentiment(text, classifier=None):
    """Return the sentiment label predicted for ``text``.

    Parameters
    ----------
    text : str
        The text to classify.
    classifier : callable, optional
        Callable that accepts ``text`` and returns a sequence whose first
        item is a mapping containing a ``'label'`` key. When omitted, the
        notebook-level ``sentiment_pipe`` is used, which keeps existing
        ``analyze_sentiment(text)`` calls working unchanged.

    Returns
    -------
    The value stored under ``'label'`` in the first prediction.
    """
    if classifier is None:
        # Looked up at call time, so the pipeline cell only has to have run
        # before this function is called, not before it is defined.
        classifier = sentiment_pipe
    predictions = classifier(text)
    # Only the first prediction is used; any further entries are ignored.
    return predictions[0]['label']

In [None]:

# Sample review texts to classify
amazon_reviews = [
    "My kiddos liked it!",
    "Amazon, please buy the show! I'm hooked!",
]

# Label every review with analyze_sentiment, keeping the input order
sentiments = list(map(analyze_sentiment, amazon_reviews))

In [None]:
# Show the sentiment labels computed for the sample reviews
print(sentiments)

In [None]:
# Debug check: show the Python type of the sample-review container
print(type(amazon_reviews))

In [None]:
# Debug check: show the Python type returned by text_pipeline.preprocess()
print(type(preprocessed_series))