In [18]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import re
import nltk

In [19]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
def preprocess_and_embed(text: str) -> pd.DataFrame:
    # Step 1: Load the list of stop words and define essential ones
    stop_words = set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}
    
    # Create a DataFrame with the input text in a column
    df = pd.DataFrame({'user_text': [text]})

    # Step 3: Convert text to lowercase
    df['text'] = df['user_text'].str.lower()

    # Step 4: Remove leading and trailing whitespace
    df['text'] = df['text'].str.strip()

    # Step 5: Replace newline characters with a space
    df['text'] = df['text'].str.replace('\n', ' ', regex=True)

    # Step 6: Calculate word count
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))

    # Step 7: Create a new column 'num_stop_words' by counting the number of stopwords in each text
    df['num_stop_words'] = df['text'].apply(lambda x: len([word for word in x.split() if word in stop_words]))

    # Step 8: Calculate the number of characters in the text
    df['num_chars'] = df['text'].apply(len)

    # Step 9: Create a new column 'num_punctuation_chars' to count punctuation characters in each text
    df['num_punctuation_chars'] = df['text'].apply(
        lambda x: sum([1 for char in x if char in '.,!?;:"\'()[]{}-'])
    )

    # Step 10: Remove non-English characters from the text column
    df['text'] = df['text'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s!?.,]', '', str(x)))

    # Step 11: Remove stop words from 'text' column, retaining essential ones
    df['text'] = df['text'].apply(
        lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])
    )

    # Step 12: Define the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Step 13: Apply lemmatization to the 'text' column that contains no stopwords
    df['text'] = df['text'].apply(
        lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
    )

    # Step 14: Remove URLs from the 'text' column
    df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

    # Step 15: Remove email addresses from the 'text' column
    df['text'] = df['text'].apply(lambda x: re.sub(r'\S+@\S+', '', x))

    # Step 16: Remove HTML tags from the 'text' column
    df['text'] = df['text'].apply(lambda x: re.sub(r'<.*?>', '', x))

    # Step 17: Remove special characters and numbers from the 'text' column
    df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

    # Step 18: Calculate word count after preprocessing
    df['word_count_after_preprocessing'] = df['text'].apply(lambda x: len(x.split()))

    # Step 19: Calculate the number of characters after preprocessing
    df['num_chars_after_preprocessing'] = df['text'].apply(len)

    # Step 20: Tokenize text and remove unnecessary characters
    df['tokenized_text'] = df['text'].apply(
        lambda text: word_tokenize(text) if isinstance(text, str) else []
    )

    # Step 21: Convert the tokenized text to a list of sentences
    sentences = df['tokenized_text'].to_list()

    # Step 22: Initialize and train the Word2Vec model
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

    # Step 23: Convert text data to Word2Vec embeddings
    df['word2vec_embedded_vector'] = df['tokenized_text'].apply(
        lambda tokens: (
            [sum(x) / len(x) if len(x) > 0 else 0 for x in zip(*[model.wv[word] for word in tokens if word in model.wv])] 
            ) if len(tokens) > 0 else [0] * model.vector_size
        )
    

    # Step 24: Drop the 'text' column
    df = df.drop(columns=['text'])

    # Step 26: Create a DataFrame from the list of vectors
    expanded_df = pd.DataFrame(df['word2vec_embedded_vector'].tolist())

    # Step 27: Rename the columns
    expanded_df.columns = [f'vector_dim_{i+1}' for i in range(expanded_df.shape[1])]

    # Step 28: Concatenate the new DataFrame with the original DataFrame
    final_df = pd.concat([df, expanded_df], axis=1)

    # Remove intermediate columns
    final_df.drop(columns=['user_text', 'tokenized_text', 'word2vec_embedded_vector'], inplace=True)

    return final_df


In [21]:
# Example usage
text_input = '''Hello Sahil !!!

I’m excited to invite you to the DataDrooler Community! 🌱 Join community here

After receiving an overwhelming response from over 1000 aspiring data professionals, I realized the need for a collaborative platform where we can connect and work on impactful data projects.

This is why I'm launching a Free Upcoming Build Cohort starting September 29th, where you can collaborate with other brilliant minds in the field!

If you’re interested, just join the community and you'll find more details there.

Looking forward to seeing you in the community!
--
Warm Regards,
Sunjana Ramana Chintala
Linkedin: linkedin.com/in/sunjana-ramana 
DataDrooler Website : www.datadrooler.com'''

In [22]:
result_df = preprocess_and_embed(text_input)

In [23]:
result_df

Unnamed: 0,word_count,num_stop_words,num_chars,num_punctuation_chars,word_count_after_preprocessing,num_chars_after_preprocessing,vector_dim_1,vector_dim_2,vector_dim_3,vector_dim_4,...,vector_dim_91,vector_dim_92,vector_dim_93,vector_dim_94,vector_dim_95,vector_dim_96,vector_dim_97,vector_dim_98,vector_dim_99,vector_dim_100
0,102,39,692,22,58,480,-0.00048,0.000438,-0.00045,0.000315,...,0.001111,0.000132,0.000522,-0.00034,0.00154,0.000667,-0.000319,1.6e-05,-0.000348,-0.000372


In [24]:
import pandas as pd
import pickle
model_file_path = 'random_forest__baseline_model.pkl'  # Change this to your actual file path
with open(model_file_path, 'rb') as model_file:
    model = pickle.load(model_file)


In [25]:
if model.predict(result_df)==1:
    print("spam")
else:
    print("ham")

spam
