<a href="https://colab.research.google.com/github/roberthouston14/GNN-Class/blob/main/Data_Cleaning/POS_Tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [13]:
import pandas as pd
import os
import re
import string
import nltk
import transformers

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Download Transformers model
tokenizer = transformers.AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = transformers.pipeline('feature-extraction', model='nlptown/bert-base-multilingual-uncased-sentiment', tokenizer=tokenizer)

# Prompt user for input file path
file_path = input("Enter file location: ")

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# # Randomly sample 10 rows from the dataset
# df = df.sample(n=10, random_state=42)

# Define a function to get the POS features for a given text
def get_pos_features(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Get the POS tags
    pos_tags = nltk.pos_tag(tokens)
    # Initialize feature counters
    num_nouns = 0
    num_adjectives = 0
    num_verbs = 0
    # Loop over the POS tags and count the number of each type of POS
    for tag in pos_tags:
        if re.match('NN.*', tag[1]):
            num_nouns += 1
        elif re.match('JJ.*', tag[1]):
            num_adjectives += 1
        elif re.match('VB.*', tag[1]):
            num_verbs += 1
    # Return the POS features as a dictionary
    return {'num_nouns': num_nouns, 'num_adjectives': num_adjectives, 'num_verbs': num_verbs}

# Apply the get_pos_features function to the 'NLP_ready_text' column
pos_features = []
for text in df['NLP_ready_text']:
    if isinstance(text, str):
        pos_features.append(get_pos_features(text))
    else:
        pos_features.append({'num_nouns': 0, 'num_adjectives': 0, 'num_verbs': 0})
pos_features = pd.DataFrame(pos_features)

# Concatenate the original DataFrame and the POS features DataFrame
df_with_pos_features = pd.concat([df.reset_index(drop=True), pos_features], axis=1)

# Save the appended file to the original directory of the source file in a CSV format with the original file name appended with "_POS_Features"
output_file_path = os.path.join(os.path.dirname(file_path), os.path.splitext(os.path.basename(file_path))[0] + '_POS_Features.csv')
df_with_pos_features.to_csv(output_file_path, index=False)

# Display the first 5 rows of the appended DataFrame
df_with_pos_features.head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Some weights of the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enter file location: /content/drive/MyDrive/Production Datasets/Working_Gold_Data.csv


Unnamed: 0,subject,text,label,NLP_ready_text,num_nouns,num_adjectives,num_verbs
0,undeliverable mail,user mailbox exceeds allowed size : info @ are...,1,user mailbox exceeds allowed size info areejal...,59,18,11
1,Interactive Brokers: account investigation war...,"end: 0x7334, 0x595, 0x9, 0x6, 0x20, 0x68170885...",1,end 0x7334 0x595 0x9 0x6 0x20 0x68170885 0x466...,12,2,1
2,start date : 12 / 22 / 01 ; hourahead hour : 9 ;,start date : 12 / 22 / 01 ; hourahead hour : 9...,0,start date 12 22 01 hourahead hour 9 ancillary...,30,23,10
3,lose up to 19 % weight . a new weightloss is h...,"hello , i have a special offer for you . . . w...",1,hello special offer . . . want lose weight ? p...,49,20,17
4,=?iso-8859-1?Q?Today's_WeatherDirect_Forecast_...,"WeatherDirect Waterloo, Ontario, Canada subscr...",0,weatherdirect waterloo ontario canada subscrib...,104,40,16


In [None]:
!pip install spellchecker

In [None]:
!pip install pyspellchecker


In [17]:
import pandas as pd
import os
from spellchecker import SpellChecker

# Prompt user for input file path
file_path = input("Enter file location: ")

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Randomly sample 10 rows from the dataset
df = df.sample(n=10, random_state=42)

# Drop rows with NaN values in 'NLP_ready_text'
df.dropna(subset=['NLP_ready_text'], inplace=True)

# Create a SpellChecker object
spell = SpellChecker()

# Define a function to get the mispelled ratio feature for a given text
def get_spellcheck_features(text):
    # Split the text into individual words
    words = text.split()
    # Count the number of misspelled words
    num_misspelled = len(spell.unknown(words))
    # Compute the mispelled ratio
    if len(words) == 0:
        return 0
    else:
        return num_misspelled / len(words)

# Apply the get_spellcheck_features function to the 'NLP_ready_text' column
df['mispelled_ratio'] = df['NLP_ready_text'].apply(get_spellcheck_features)

# Save the appended file to the original directory of the source file in a CSV format with the original file name appended with "_Spellcheck_Features"
output_file_path = os.path.join(os.path.dirname(file_path), os.path.splitext(os.path.basename(file_path))[0] + '_Spellcheck_Features.csv')
df.to_csv(output_file_path, index=False)

# Display the first 5 rows of the appended DataFrame
df.head()


Enter file location: /content/drive/MyDrive/Production Datasets/Working_Gold_Data_POS_Features.csv


Unnamed: 0,subject,text,label,NLP_ready_text,num_nouns,num_adjectives,num_verbs,mispelled_ratio
6735,hi there friend - are you fat,"hey im finished v , i . o . x . x 25 m . g 3 o...",1,hey im finished v . . x . x 25 . g 3 pill 72 ....,58,19,12,0.13253
22453,l . a times : laser toner market in 2005,attention investors and traders : stock alert ...,1,attention investor trader stock alert updated ...,164,54,54,0.039326
34343,"ebay , be debt free by 2003 ! 8119",ebay auction news * * recommended resource spe...,1,ebay auction news recommended resource special...,191,82,46,0.009281
38771,these ebay secrets will make you money !,need financial help ? click here to find out h...,1,need financial help ? click find make thousand...,20,7,5,0.173913
35345,great investors portfoiio,mnei the best smail cap stock in 2 oo 5 just k...,1,mnei best smail cap stock 2 oo 5 keep reading ...,388,123,128,0.119289
