In [1]:
import pandas as pd

In [2]:
# Tiny one-column frame for the quick type checks that follow.
df = pd.DataFrame([1, 2, 3], columns=["col"])
df

Unnamed: 0,col
0,1
1,2
2,3


In [3]:
# Sanity check: selecting a single column from the DataFrame yields a Series.
type(df['col'])

pandas.core.series.Series

In [4]:
l = []
# Lists have no .items(); catch the error so the cell still shows the message
# without aborting a "Restart & Run All".
try:
    l.items()
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'list' object has no attribute 'items'

In [28]:
import nltk
from nltk.collocations import *
from nltk import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

# Fetch the NLTK resources used in this cell.
nltk.download('stopwords')
nltk.download('punkt')

# English stopwords: `stops` is the mutable list that the helper functions
# read and extend; `stopset` holds the same words as a set.
stops = list(stopwords.words('english'))
stopset = set(stops)

# Association-measure objects used to score candidate n-grams.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

def _stopwords(extend_stopwords=None) -> list:
    """Extend the module-level ``stops`` list with extra stopwords.

    The shared ``stops`` list is mutated in place, so additions persist for
    later calls. Words that are already present are skipped, which keeps
    repeated calls with the same words from growing the list.

    Args:
        extend_stopwords: a single word (str), a list/tuple/set of words,
            or None to leave the list unchanged. Any other type is ignored.

    Returns:
        list: the shared ``stops`` list, after any additions.
    """
    if isinstance(extend_stopwords, str):
        new_words = [extend_stopwords]
    elif isinstance(extend_stopwords, (list, tuple, set, frozenset)):
        new_words = extend_stopwords
    else:
        new_words = []

    for word in new_words:
        # Skip duplicates so re-running a cell does not bloat the list.
        if word not in stops:
            stops.append(word)

    return stops
    
def _collocations(text:str) -> str:
    """Return the top-ranked bigram in ``text``, or ``text`` itself if none is found.

    The text is tokenized, tokens whose lowercase form is in the module-level
    ``stops`` list are dropped, and bigrams are formed from the tokens that
    remain (so words separated only by stopwords become neighbours). Bigrams
    containing a token shorter than three characters, or a token listed in
    ``stops``, are filtered out; the rest are ranked by pointwise mutual
    information (PMI).

        Args:
            text: string that needs to be turned into a bigram

        Returns:
            str: the top-ranked bigram joined with an underscore
                (e.g. "name_prat"), or the unchanged input text when no
                bigram survives the filtering
    """
    # Snapshot the shared stopword list as a set once per call, so every
    # membership test below is a hash lookup instead of a list scan.
    stop_set = set(stops)

    def is_filtered(w):
        return len(w) < 3 or w in stop_set

    kept_words = [word for word in word_tokenize(text) if word.lower() not in stop_set]
    finder = BigramCollocationFinder.from_words(kept_words)
    finder.apply_word_filter(is_filtered)
    best = finder.nbest(bigram_measures.pmi, 1)

    if not best:
        return text

    first, second = best[0]
    return f"{first}_{second}"

def _create_bigrams(col: pd.Series) -> list:
    """ Helper function to take the column of text and return the bigrams 
    based on the input text
    
    Args:
        df: column of text such as df['input_text']
    
    Returns:
        bigrams: list of bigrams and in some cases unigrams (when bigrams don't exist)
    """
    
    if type(col) != pd.core.series.Series:
        raise TypeError("The parameter passed must be a Series object.Pass the column you wish to create n-grams out of.")
    try:
        bigrams = []
        for index, value in col.items():
            bigram = _collocations(value)
            bigrams.append(bigram)

        return bigrams
    except Exception as e:
        print(e)
    
def runner(col: pd.Series, stopwords=None):
    """Entry point: register any extra stopwords, then build the bigrams for ``col``.

    Args:
        col: column of text, e.g. df['input_text']
        stopwords: optional extra stopword(s), forwarded to ``_stopwords``

    Returns:
        whatever ``_create_bigrams`` returns for ``col``
    """
    # Called for its effect on the shared stopword list; its return value is not needed here.
    _stopwords(stopwords)
    bigrams = _create_bigrams(col)
    return bigrams

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pratyushsingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pratyushsingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
# Two short sample sentences to exercise the bigram helpers end to end.
sample_texts = ["my name is prat", "hello my balance is low"]
df = pd.DataFrame({"input_text": sample_texts})
runner(df["input_text"])

['name_prat', 'balance_low']

In [11]:
# Confirm the column handed to runner is a pandas Series, the type _create_bigrams checks for.
type(df['input_text'])

pandas.core.series.Series