# General Imports and Downloads

In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import nltk
from nltk.corpus import twitter_samples

In [85]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re

# Load the data

In [5]:
def load_tweet():
    '''
        Load the positive and negative tweets
    '''
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    
    return positive_tweets, negative_tweets

In [6]:
positive_tweets, negative_tweets = load_tweet()

print(f'Positive Tweets length: {len(positive_tweets)}')
print(f'Negative Tweets length: {len(negative_tweets)}')

Positive Tweets length: 5000
Negative Tweets length: 5000


In [27]:
## splitting the positive and negative tweets in 80:20 split

def split_pos_neg_tweets(pos_tweets, neg_tweets, split=0.8):
    '''
        Splits the positive and negative tweets and returns training and val_test datasets
    '''
    
    max_train_rows = int(len(pos_tweets) * split)
    
    print(f'Splitting the dataset in the ratio: {split}')
    
    train_pos = pos_tweets[:max_train_rows]
    val_pos = pos_tweets[max_train_rows:]
    
    train_neg = neg_tweets[:max_train_rows]
    val_neg = neg_tweets[max_train_rows:]
    
    train_label = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
    val_label = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))
    
    print(f'Total Training Rows (pos+neg) : {len(train_pos + train_neg)}')
    print(f'Total Validation Rows (pos+neg): {len(val_pos + val_neg)}')
    
    return train_pos + train_neg , val_pos + val_neg, train_label, val_label

In [28]:
train_data, val_data, train_label, val_label = split_pos_neg_tweets(positive_tweets, negative_tweets)

Splitting the dataset in the ratio: 0.8
Total Training Rows (pos+neg) : 8000
Total Validation Rows (pos+neg): 2000


In [38]:
print(f'Sample training data   : {train_data[6000]}')
print(f'Sample training label  : {train_label[6000]}')

Sample training data   : @stormieraae what the heck :( you don't follow her?
Sample training label  : 0.0


In [37]:
print(f'Sample training data  : {train_data[0]}')
print(f'Sample training label : {train_label[0]}')

Sample training data  : #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Sample training label : 1.0


# Tweet Transformation

In [77]:
stemmer = PorterStemmer()

def tweet_transform(tweets):
    '''
        Tokenize, remove stopword, remove hashtags and usernames, stem the words from tweets
    '''
    
    stop_words = stopwords.words('english')
    
    tweet = re.sub(r'#','',tweets)
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet) ## remove any hyperlinks
    tweet = re.sub(r'^RT[\s]+','',tweet) ## remove any Retweets (RT)
    
    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False)
    tweet_tokenise = tokenizer.tokenize(tweet)
    
    cleaned_tweets = []
    
    for t in tweet_tokenise:
        if t not in stop_words and t[0] != '@': ## ignore stopwords and usernames
            stemmed_word = stemmer.stem(t) ## stem the words
            cleaned_tweets.append(stemmed_word)
    
    return cleaned_tweets
    

In [78]:
print(f'Original Tweet: \n {train_data[0]} \n')
print(f'Transformed Tweet: \n {tweet_transform(train_data[0])}')

Original Tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :) 

Transformed Tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [79]:
print(f'Original Tweet: \n {train_data[6000]} \n')
print(f'Transformed Tweet: \n {tweet_transform(train_data[6000])}')

Original Tweet: 
 @stormieraae what the heck :( you don't follow her? 

Transformed Tweet: 
 ['heck', ':(', 'follow', '?']


# Creating word vocabulary

In [96]:
def tweet_vocab(tweets):
    '''
        The vocabulary of the tweet.
    '''
    
    vocab = {'__PAD__':0, '__</e>__':1, '__UNK__':2}
    
    for tweet in tweets:
        
        processed_tweet = tweet_transform(tweet)
        
        for word in processed_tweet:
            if word not in vocab:
                vocab[word] = len(vocab)
    
    return vocab

In [97]:
vocab = tweet_vocab(train_data)

print(f'Total vocabulary : {len(vocab)}')

Total vocabulary : 9422


In [130]:
train_df = pd.DataFrame({"data":train_data,"label":train_label})
#val_df = pd.DataFrame({"data":tweet_transform(val_data), "label":val_label})

In [131]:
train_df2 = train_df.copy()

In [132]:
train_df2['transformed_data'] = train_df2['data'].apply(lambda x: tweet_transform(x))

In [145]:
train_df2['transformed_data'].tolist()

list

# Convert to Tensor + Generator

In [114]:
def tweet_to_tensor(tweet, vocab, unknown_token = '__UNK__', verbose=False):
    '''
        Converts a tweet to tensors
    '''
    
    tensor = []
    processed_tweet = tweet_transform(tweet)
    UNK_id = vocab.get(unknown_token)
    
    if verbose:
        print(f'List of Processed Tweets')
        print(processed_tweet)
    
    for word in processed_tweet:
        tensor.append(vocab.get(word,UNK_id))
        
    return tensor

In [115]:
print(f'Actual Tweet : \n {val_data[0]}')
print(f'Tensor: {tweet_to_tensor(val_data[0],vocab,verbose=False)}')

Actual Tweet : 
 Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan
Tensor: [1146, 204, 402, 527, 2478, 808, 8447, 72, 1208, 808, 62, 2, 2803, 1905, 204, 856, 2, 2, 72, 386, 654, 2, 3656, 1096, 650, 4761, 9, 1146, 204, 177, 2, 2]


In [148]:
train_df2['transformed_data_tensor'] = train_df2['data'].apply(lambda x: tweet_to_tensor(x,vocab))

In [169]:
torch.tensor(train_df2['transformed_data_tensor'].values[0])

tensor([3, 4, 5, 6, 7, 8, 9])

In [138]:
def data_generator(pos_data, neg_data, vocab, batch_size=10, shuffle=False):
    '''
        Generates the processed tensor tweets
        Inputs:
            - pos_data : list of positive tweets
            - neg_data : list of negative tweets
            - vocab    : vocabulary generated above
            - batch_size: number of items to be generated
            - shuffle  : whether the items needs to be shuffled.
        Output:
            - generated output
    '''
    
    len_pos_data = len(pos_data) ## total number of positive tweets
    len_neg_data = len(neg_data) ## total number of negative tweets
    
    pos_index = 0
    neg_index = 0
    
    pass

In [171]:
class TweetDataset(Dataset):
    
    def __init__(self, data, labels, transform = None, train=True):
        
        self.data = data
        self.transform = transform
        self.train = train
        self.X_label = label[0]
        self.y_label = label[1]
        
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        X = self.data[self.X_label]
        
        pass