In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk import ngrams

# Show full cell contents so long tweet texts and token lists are not truncated
pd.set_option('display.max_colwidth', None)

# Load only the columns used in this notebook from the hosted CSV
file_url = 'https://archive.org/download/nltk-corpora/tweets.20150430-223406.tweet.tokenized.stemmed.lemmatized.csv'
df_initial = pd.read_csv(file_url, usecols=['id', 'text', 'posttoken'], encoding='utf-8')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the summary.
df_initial.info()

# Work on a copy so the frame as loaded stays available unchanged
df = df_initial.copy()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         10416 non-null  int64 
 1   text       10416 non-null  object
 2   posttoken  10416 non-null  object
dtypes: int64(1), object(2)
memory usage: 244.2+ KB
None


Unnamed: 0,id,text,posttoken
0,593891099434983425,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,"['indirect', 'cost', 'of', 'the', 'uk', 'be', 'in', 'the', 'eu', 'be', 'estimate', 'to', 'be', 'cost', 'britain', 'billion', 'per', 'year', '#betteroffout', '#ukip']"
1,593891099548094465,VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY,"['sturgeon', 'on', 'post-election', 'deal']"
2,593891100429045760,RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,"['the', 'ukip', 'east', 'lothian', 'candidate', 'look', 'about', 'and', 'still', 'have', 'an', 'msn', 'addy']"
3,593891101154619392,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,"['ed', 'milliband', 'be', 'an', 'embarrassment', 'would', 'you', 'want', 'him', 'represent', 'the', 'uk', '#bbcqt', 'vote']"
4,593891101838340096,RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…,"['ed', 'miliband', 'prove', 'tonight', 'why', 'he', 'be', 'not', 'up', 'to', 'the', 'job', 'tbf', 'you', 'have', 'spend', 'year', 'do', 'that', 'you', 'salivate', 'do']"


In [2]:
def generate_ngrams(tokens, n):
    """Build the n-grams of a token sequence.

    Args:
        tokens: Iterable of tokens (here, the whitespace-split words of a tweet).
        n: Size of each n-gram (1 = unigrams, 2 = bigrams, 3 = trigrams).

    Returns:
        A list of tuples, one tuple per n-gram, in order of occurrence.
    """
    # nltk's ngrams() is lazy; materialise it so the result can be stored in a cell
    return [gram for gram in ngrams(tokens, n)]

# Tokenise the raw tweet text on whitespace; missing texts are treated as empty strings
corpus = df['text'].fillna('')
corpus_tokens = corpus.apply(lambda text: text.split())

# Add one column per n-gram size (unigrams, bigrams, trigrams)
for column, size in (('unigrams', 1), ('bigrams', 2), ('trigrams', 3)):
    df[column] = corpus_tokens.apply(generate_ngrams, n=size)

# Preview the original text next to its n-grams for the first five tweets
df[['text', 'unigrams', 'bigrams', 'trigrams']].head()

Unnamed: 0,text,unigrams,bigrams,trigrams
0,RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP,"[(RT,), (@KirkKus:,), (Indirect,), (cost,), (of,), (the,), (UK,), (being,), (in,), (the,), (EU,), (is,), (estimated,), (to,), (be,), (costing,), (Britain,), (£170,), (billion,), (per,), (year!,), (#BetterOffOut,), (#UKIP,)]","[(RT, @KirkKus:), (@KirkKus:, Indirect), (Indirect, cost), (cost, of), (of, the), (the, UK), (UK, being), (being, in), (in, the), (the, EU), (EU, is), (is, estimated), (estimated, to), (to, be), (be, costing), (costing, Britain), (Britain, £170), (£170, billion), (billion, per), (per, year!), (year!, #BetterOffOut), (#BetterOffOut, #UKIP)]","[(RT, @KirkKus:, Indirect), (@KirkKus:, Indirect, cost), (Indirect, cost, of), (cost, of, the), (of, the, UK), (the, UK, being), (UK, being, in), (being, in, the), (in, the, EU), (the, EU, is), (EU, is, estimated), (is, estimated, to), (estimated, to, be), (to, be, costing), (be, costing, Britain), (costing, Britain, £170), (Britain, £170, billion), (£170, billion, per), (billion, per, year!), (per, year!, #BetterOffOut), (year!, #BetterOffOut, #UKIP)]"
1,VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY,"[(VIDEO:,), (Sturgeon,), (on,), (post-election,), (deals,), (http://t.co/BTJwrpbmOY,)]","[(VIDEO:, Sturgeon), (Sturgeon, on), (on, post-election), (post-election, deals), (deals, http://t.co/BTJwrpbmOY)]","[(VIDEO:, Sturgeon, on), (Sturgeon, on, post-election), (on, post-election, deals), (post-election, deals, http://t.co/BTJwrpbmOY)]"
2,RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1,"[(RT,), (@GregLauder:,), (the,), (UKIP,), (east,), (lothian,), (candidate,), (looks,), (about,), (16,), (and,), (still,), (has,), (an,), (msn,), (addy,), (http://t.co/7eIU0c5Fm1,)]","[(RT, @GregLauder:), (@GregLauder:, the), (the, UKIP), (UKIP, east), (east, lothian), (lothian, candidate), (candidate, looks), (looks, about), (about, 16), (16, and), (and, still), (still, has), (has, an), (an, msn), (msn, addy), (addy, http://t.co/7eIU0c5Fm1)]","[(RT, @GregLauder:, the), (@GregLauder:, the, UKIP), (the, UKIP, east), (UKIP, east, lothian), (east, lothian, candidate), (lothian, candidate, looks), (candidate, looks, about), (looks, about, 16), (about, 16, and), (16, and, still), (and, still, has), (still, has, an), (has, an, msn), (an, msn, addy), (msn, addy, http://t.co/7eIU0c5Fm1)]"
3,RT @joannetallis: Ed Milliband is an embarrassment. Would you want him representing the UK?! #bbcqt vote @Conservatives,"[(RT,), (@joannetallis:,), (Ed,), (Milliband,), (is,), (an,), (embarrassment.,), (Would,), (you,), (want,), (him,), (representing,), (the,), (UK?!,), (#bbcqt,), (vote,), (@Conservatives,)]","[(RT, @joannetallis:), (@joannetallis:, Ed), (Ed, Milliband), (Milliband, is), (is, an), (an, embarrassment.), (embarrassment., Would), (Would, you), (you, want), (want, him), (him, representing), (representing, the), (the, UK?!), (UK?!, #bbcqt), (#bbcqt, vote), (vote, @Conservatives)]","[(RT, @joannetallis:, Ed), (@joannetallis:, Ed, Milliband), (Ed, Milliband, is), (Milliband, is, an), (is, an, embarrassment.), (an, embarrassment., Would), (embarrassment., Would, you), (Would, you, want), (you, want, him), (want, him, representing), (him, representing, the), (representing, the, UK?!), (the, UK?!, #bbcqt), (UK?!, #bbcqt, vote), (#bbcqt, vote, @Conservatives)]"
4,RT @NivenJ1: “@George_Osborne: Ed Miliband proved tonight why he's not up to the job” Tbf you've spent 5 years doing that you salivating do…,"[(RT,), (@NivenJ1:,), (“@George_Osborne:,), (Ed,), (Miliband,), (proved,), (tonight,), (why,), (he's,), (not,), (up,), (to,), (the,), (job”,), (Tbf,), (you've,), (spent,), (5,), (years,), (doing,), (that,), (you,), (salivating,), (do…,)]","[(RT, @NivenJ1:), (@NivenJ1:, “@George_Osborne:), (“@George_Osborne:, Ed), (Ed, Miliband), (Miliband, proved), (proved, tonight), (tonight, why), (why, he's), (he's, not), (not, up), (up, to), (to, the), (the, job”), (job”, Tbf), (Tbf, you've), (you've, spent), (spent, 5), (5, years), (years, doing), (doing, that), (that, you), (you, salivating), (salivating, do…)]","[(RT, @NivenJ1:, “@George_Osborne:), (@NivenJ1:, “@George_Osborne:, Ed), (“@George_Osborne:, Ed, Miliband), (Ed, Miliband, proved), (Miliband, proved, tonight), (proved, tonight, why), (tonight, why, he's), (why, he's, not), (he's, not, up), (not, up, to), (up, to, the), (to, the, job”), (the, job”, Tbf), (job”, Tbf, you've), (Tbf, you've, spent), (you've, spent, 5), (spent, 5, years), (5, years, doing), (years, doing, that), (doing, that, you), (that, you, salivating), (you, salivating, do…)]"
