In [1]:
# import necessary packages
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import words
from nltk.corpus import wordnet

In [3]:
# function to clean a list of lemmatized strings
def clean_lemmatized_strings(lemmatized_list):
    cleaned_list = []
    for text in lemmatized_list:
        cleaned_text = re.sub(r'\bx200b\b', '', text)  # remove 'x200b'
        cleaned_text = re.sub(r'\b\d+\b', '', cleaned_text)  # remove strings that are only numbers
        cleaned_text = re.sub(r'\b\d\w*\b', '', cleaned_text)  # remove strings that start with numbers
        cleaned_text = re.sub(r'\b(?:https?|www)\S*\b', '', cleaned_text)  # remove URLs starting with http, https, www
        cleaned_text = re.sub(r'\b[a-zA-Z]\b', '', cleaned_text)  # remove singular letters
        
        # only append non-empty, clean strings to the list, removing brackets and extra spaces
        cleaned_text = cleaned_text.strip().strip('[]')
        if cleaned_text:  # check if the string is non-empty
            cleaned_list.append(cleaned_text)
    
    return cleaned_list

In [5]:
# function to check if word exists in WordNet
nltk.download('wordnet')
def is_real_word(word):
    return bool(wordnet.synsets(word))  # returns True if the word exists

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sophiachung/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# fetch the NLTK 'words' corpus (NLTK reports "already up-to-date" when a local copy exists)
nltk.download('words')
# materialise the corpus word list as a set; it is passed to find_neologisms,
# which uses it for dictionary membership filtering
nltk_words = set(words.words())

[nltk_data] Downloading package words to
[nltk_data]     /Users/sophiachung/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
def clean_neologisms(array):
    cleaned_neo = []
    for lemma in array:
        # remove lemmas that are 2 characters or less long
        if len(lemma) > 2 and lemma not in string.punctuation:
            cleaned_neo.append(lemma)
    return np.array(cleaned_neo)

In [11]:
# regular expression to match strings that have a mix of letters and numbers
pattern = r'\b(?=.*[0-9])(?=.*[a-zA-Z])[a-zA-Z0-9]{10,}\b'

# function to remove matching patterns
def remove_url_parts(text):
    return re.sub(pattern, '', text)

In [13]:
# load contractions from the txt file
# read() + split() breaks on any whitespace, so `contractions` is a flat list of
# whitespace-delimited entries; find_neologisms uses it to discard matching tokens
# NOTE(review): no encoding is passed, so the platform default applies — confirm the file's encoding
# NOTE(review): find_neologisms removes "'" from tokens before comparing against this
# list — confirm the entries in the file are stored in the matching form
with open('contractions.txt', 'r') as file:
    contractions = file.read().split()

In [15]:
def find_neologisms(subreddit_name, nltk_words):
    path = 'tpp/r_' + subreddit_name + '_tpp.csv'
    df = pd.read_csv(path)
    df = df[['msg_lemmatized']]
    
    # ensure 'clean_lem' column can hold lists
    df_clean = pd.DataFrame()
    df_clean['clean_lem'] = df['msg_lemmatized'].apply(lambda lem: 
                                                       clean_lemmatized_strings([x.strip().replace("'", "") for x in lem.split(',')]))
    
    # ensure the clean_lem column has lists and is not empty before exploding
    df_clean = df_clean[df_clean['clean_lem'].apply(lambda x: isinstance(x, list) and len(x) > 2)]
    
    # explode the clean_lem column into separate rows
    exploded_data = df_clean.explode('clean_lem')
    # remove remaining brackets or unwanted characters after exploding
    exploded_data['clean_lem'] = exploded_data['clean_lem'].str.strip('[]').str.strip()
    exploded_data.reset_index(drop=True, inplace=True)
    
    # subtract dictionary from all tokens -> left with neologisms
    # 1st pass: filter out words that are in the NLTK dictionary
    df_neologisms = exploded_data[~exploded_data['clean_lem'].isin(nltk_words)]

    # 2nd pass: filter out words that are in NLTK WordNet
    words = df_neologisms['clean_lem'].unique()
    neologisms = [word for word in words if not is_real_word(word)]
    
    # create a new DataFrame containing only the rows where 'lemma' is in the neologisms list
    df_neologisms = df_neologisms[df_neologisms['clean_lem'].isin(neologisms)]
    df_neologisms = df_neologisms['clean_lem'].unique()

    neologisms_array = clean_neologisms(df_neologisms)

    # apply the function to each element in the numpy array
    cleaned_array = np.vectorize(remove_url_parts)(neologisms_array)
    # remove extra spaces from each string
    cleaned_array = np.char.strip(cleaned_array)

    # filter out strings that are in the contractions list
    filtered_array = [word for word in cleaned_array if word not in contractions]
    # convert the filtered list back to a numpy array if needed
    filtered_array = np.array(filtered_array)
    filtered_array = np.unique(filtered_array)

    # remove empty strings if needed
    mask = (filtered_array != '')
    final_array = filtered_array[mask]
    
    # save new DataFrame of neologisms
    output_file_path = 'neologisms/r_' + subreddit_name + '.csv'
    df_final = pd.DataFrame(final_array)
    df_final.to_csv(output_file_path, index=True)
    print("\nneologisms dataframe saved to:", output_file_path) 
    print('\nnumber of neologisms:', len(final_array))

In [17]:
# each entry is lower-cased by the driver loop and handed to find_neologisms,
# which uses it to build the input and output CSV paths
subreddits = ['adops'] # create list of subreddits for lexical analysis

In [19]:
# run the neologism extraction once per configured subreddit; every call
# writes its own CSV and prints a short summary
for subreddit in subreddits:
    # lower-cased because the name is spliced into file paths inside
    # find_neologisms — assumes the files on disk use lower-case names; verify
    name = subreddit.lower()
    find_neologisms(name, nltk_words)


neologisms dataframe saved to: neologisms/r_adops.csv

number of neologisms: 900
