<a href="https://colab.research.google.com/github/pradeep-016/NLP/blob/main/NLP_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [4]:
!unzip /content/drive/MyDrive/Assignment/archive.zip

Archive:  /content/drive/MyDrive/Assignment/archive.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [5]:
data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', header=None, encoding='ISO-8859-1',
                   names=['target', 'id', 'date', 'flag', 'user', 'text'])

In [6]:
data

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [7]:
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def preprocess_tweet(text):
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # Remove mentions
    text = re.sub(r'#', '', text) # Remove hashtags
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    text = ' '.join([lemmatizer.lemmatize(word.lower()) for word in text.split() if word.lower() not in stop_words]) # Tokenize, lemmatize, and remove stop words
    return text

In [9]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

unzip:  cannot find or open /usr/share/nltk_data/corpora/wordnet.zip, /usr/share/nltk_data/corpora/wordnet.zip.zip or /usr/share/nltk_data/corpora/wordnet.zip.ZIP.


In [10]:
data['text'] = data['text'].apply(preprocess_tweet)

In [11]:
data['text']

0          awww thats bummer shoulda got david carr third...
1          upset cant update facebook texting might cry r...
2          dived many time ball managed save 50 rest go b...
3                            whole body feel itchy like fire
4                                   behaving im mad cant see
                                 ...                        
1599995                        woke school best feeling ever
1599996           thewdbcom cool hear old walt interview â«
1599997                       ready mojo makeover ask detail
1599998    happy 38th birthday boo alll time tupac amaru ...
1599999                                 happy charitytuesday
Name: text, Length: 1600000, dtype: object

In [12]:
train_size = int(len(data) * 0.8)
train_data = data[:train_size]
test_data = data[train_size:]

In [13]:
vectorizer = TfidfVectorizer(max_features=5000)
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])

In [14]:
clf = MultinomialNB()
clf.fit(train_features, train_data['target'])

In [15]:
def generate_tweet(seed_sentence, n=10):
    current_sentence = seed_sentence
    perplexity = 0
    for i in range(n):
        vectorized_sentence = vectorizer.transform([current_sentence])
        prediction = clf.predict(vectorized_sentence)[0]
        if prediction == 0:
            next_word = np.random.choice(train_data[train_data['target'] == 0]['text'])
        else:
            next_word = np.random.choice(train_data[train_data['target'] == 4]['text'])
        current_sentence += ' ' + next_word
        
        prob = clf.predict_proba(vectorized_sentence)
        perplexity += math.log(prob[0][prediction])
    
    avg_perplexity = math.exp(-perplexity/n)
    return current_sentence, avg_perplexity

In [16]:
import math
# Example usage
seed_sentence = "I am feeling"
generated_tweet, perplexity = generate_tweet(seed_sentence)
print("Generated tweet: ", generated_tweet)
print("Perplexity score: ", perplexity)

Generated tweet:  I am feeling 21 huh well dont feel day 90 miss love im exhausted couldnt go sleep like 2 cant believe theyre done thought id watch secret millionaire watch apprentice repeat ive watched one x shave closely thought say athletic tape eye extreamly red dont know javascript javabans style amazon api call possible need quotsecretquot api key sign request gah college 1045 aimee ist break humph oh dear im tight schedule
Perplexity score:  1.2427476941453446
