# Data Science (NLP) internship Kudosware

## Task: Text Generation

In [1]:
# importing required liraries
import pandas as pd
import numpy as np
import re
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
from string import punctuation
import emoji

In [2]:
# Read data from the CSV file
data = pd.read_csv('data.csv', header=None, encoding='ISO-8859-1',names=['target', 'id', 'date', 'flag', 'user', 'text'])

In [3]:
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Creating a set of stop words for English
stop_words = set(stopwords.words('english'))

# Initializing the lemmatizer
lemmatizer = WordNetLemmatizer()

In [5]:
# pre-processing data
def preprocess(text):
    # Removing URLs
    text = re.sub(r'http\S+', '', text)
    
    # Removing mentions
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    
    # Removing hashtags
    text = text.replace('#', '')
    
    # Removing emojis and converting them to textual representation
    text = emoji.demojize(text)
    
    # Removing punctuation
    text = ''.join(char for char in text if char not in punctuation)
    
    # lemmatize, remove stop words, converting to lower case
    words = [lemmatizer.lemmatize(word.lower()) for word in text.split() if word.lower() not in stop_words]
    
    text = ' '.join(words)
    
    return text

In [6]:
# applying preprocess function
data['text'] = data['text'].apply(preprocess)

In [7]:
data['text']

0          awww thats bummer shoulda got david carr third...
1          upset cant update facebook texting might cry r...
2          dived many time ball managed save 50 rest go b...
3                            whole body feel itchy like fire
4                                   behaving im mad cant see
                                 ...                        
1599995                        woke school best feeling ever
1599996           thewdbcom cool hear old walt interview â«
1599997                       ready mojo makeover ask detail
1599998    happy 38th birthday boo alll time tupac amaru ...
1599999                                 happy charitytuesday
Name: text, Length: 1600000, dtype: object

In [8]:
# train test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [9]:
# applying the TF-IDF vectorizer to convert text to vectors
vectorizer = TfidfVectorizer(max_features=5000)
train_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])


In [13]:
# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(train_features, train_data['target'])

MultinomialNB()

In [14]:
# Text generation
def generate_tweet(seed_sentence, n=10):
    current_sentence = seed_sentence
    perplexity = 0
    for i in range(n):
        vectorized_sentence = vectorizer.transform([current_sentence])
        prediction = clf.predict(vectorized_sentence)[0]
        if prediction == 0:
            next_word = np.random.choice(train_data[train_data['target'] == 0]['text'])
        else:
            next_word = np.random.choice(train_data[train_data['target'] == 4]['text'])
        current_sentence += ' ' + next_word
        
        # Calculate perplexity
        prob = clf.predict_proba(vectorized_sentence)
        perplexity += math.log(prob[0][prediction])
    
    # Calculate average perplexity
    avg_perplexity = math.exp(-perplexity/n)
    return current_sentence, avg_perplexity

In [15]:
# Implementation
seed_sentence = "do you have"
generated_tweet, perplexity = generate_tweet(seed_sentence)
print("Regenerated tweet: ", generated_tweet)

Regenerated tweet:  do you have goddamnit technology hate couldnt tweet anymore dd nighttttt noooooo lol nah made plan wen annie told late studying portwoods exam tomorrow finished typing 30 recipe 1st hour made 17 page poor tree updated myspace pgi need find friend going long nightagain wont camera july 2 owww didnt think would much pain im already fucked frustrated annoyed make smile careful dnt know heat index sa supposed another triple digit day 101 dont think going make tonight girl best would proud rode river circuit asthma hit hard wet mow amazed amused wpac cr limit raised celebrated cole shop etc


In [16]:
print("Perplexity score: ", perplexity)

Perplexity score:  1.225898095534193
