In [54]:
import pandas as pd
import os
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.probability import FreqDist
import re
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Shreeya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Shreeya/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Filtering Input
1. **Tokenization**: The word_tokenize function is applied to the pitch text, splitting it into individual characters.

2. **Filtering and Processing**: The code removes punctuation and stopwords from the tokenized pitch using regular expressions (punctuation.sub) and a list of stopwords from the NLTK library (stopwords.words('english')).

3. **Lemmatization**: The code utilizes the WordNetLemmatizer from NLTK (WordNetLemmatizer()) to lemmatize the list of words with no punctuation. Lemmatization reduces words to their root form. The resulting lemmas are stored in the filtered_pitch list.

In [42]:
pitch = "Hi, my name is shreeya kantamsetty and I am rising sophomore. I am interested in machine learning and data science. I have expertise in web development such as angular, css, and html html html html html. "

def filter_pitch (pitch): 
    pitch_tokens = word_tokenize(pitch)

    punctuation=re.compile(r'[-,.?!,:;()|0-9]')

    all_stopwords = stopwords.words('english')

    filtered_pitch_tokens = [word for word in pitch_tokens if word not in all_stopwords]

    no_punctuation = []
    for words in filtered_pitch_tokens:
        word=punctuation.sub("",words)
        if len(word)>0:
            no_punctuation.append(word)

    lemmatizer = WordNetLemmatizer()

    filtered_pitch = [lemmatizer.lemmatize(word.lower()) for word in no_punctuation]

    return filtered_pitch


filter_pitch(pitch)




['hi',
 'name',
 'shreeya',
 'kantamsetty',
 'i',
 'rising',
 'sophomore',
 'i',
 'interested',
 'machine',
 'learning',
 'data',
 'science',
 'i',
 'expertise',
 'web',
 'development',
 'angular',
 'cs',
 'html',
 'html',
 'html',
 'html',
 'html']

# Frequency Distribution

In [40]:
def find_frequency(pitch):
    fdist = FreqDist()
    for word in pitch:
        fdist[word]+=1
    overusedWords = []
    for word, frequency in fdist.items():
        if frequency > 3:
            overusedWords.append(word)
    if len(overusedWords) > 0:
        print("Here are your overused words:")
        return overusedWords
    else:
        print("You have no overused words. Great job!")


find_frequency(filter_pitch(pitch))

Here are your overused words:


['html']

# Lexical richness

##### Calculates lexical richness in terms of **total number of distinct words out of total number of words**

* Type-Token Ratio(TTR): Our ratio calculates the # of unique words in proportion to the total number of words. Generally, a TTR between 0.2 and 0.4 is considered average, while a TTR above 0.4 is often seen as more diverse and rich. 

In [47]:
def lexical_richness(pitch):
    richness = len(set(pitch))/len(pitch)
    print("Here is your lexical richness score:", richness)
    if (richness > 0.4):
        print("Your text has has lexical richness!")
    else: 
        print("You have low lexical richness. Try diversifying the ewords you are using.")

lexical_richness(pitch)

Here is your lexical richness score: 0.1323529411764706
You have low lexical richness. Try diversifying the ewords you are using.


# Sentiment Analysis

In [24]:
sia = SentimentIntensityAnalyzer()

sentiment_scores = []
for word in filtered_pitch:
    score = sia.polarity_scores(word)
    sentiment_scores.append(score['compound'])



# Roberta Pretrained Model

##### 

In [56]:
MODEL=f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [57]:
def polarity_scores_roberta(example):
    encoded_text = tokenizer(pitch, return_tensors = 'pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'Here is your roberta_neg score' : scores[0],
        'Here is your roberta_neu score' : scores[1],
        'Here is your roberta_pos score' : scores[2]
    }
    if (scores[0] > 0.2):
        print("Your text is too negative. Try adding some positive words: ")
    return scores_dict

polarity_scores_roberta(pitch)

ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.