In [1]:
import torch
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("../datasets/sentiment-analysis/tweets/tweets.csv", encoding="latin-1")
data = data.drop(columns=["ItemID", "SentimentSource"], axis=1);
train, test = train_test_split(data, test_size=0.2, random_state=42);
train.reset_index(drop=True), test.reset_index(drop=True)

(      Sentiment                                      SentimentText
 0           pos  @amyrenea omg so am I lol I fell asleep when i...
 1           neg               @Adrienne_Bailon I want a shout out 
 2           neg  @Anonymousboy03 Plans for school stuff &amp; a...
 3           neg  ... has hit a writer's block .. am loosing my ...
 4           neg  ... trying to find people I know! I`m bored, i...
 ...         ...                                                ...
 39995       pos   #robotpickuplines are so funny. check them out. 
 39996       pos  @annyo84 awh thankss.  yeah, i understand what...
 39997       pos  @AmbiguityX ohh you're in twin cities?  i luv ...
 39998       neg   Dinara lost again in Roland Garros. Why the S...
 39999       pos  *yawn* fucking time zones shit. I'm really sic...
 
 [40000 rows x 2 columns],
      Sentiment                                      SentimentText
 0          pos  @aimeesays aww i hope it does fly by because J...
 1          neg  #don

In [3]:
train.to_csv("../datasets/sentiment-analysis/tweets/tweets_train.csv", index=False);
test.to_csv("../datasets/sentiment-analysis/tweets/tweets_test.csv", index=False);

In [4]:
import spacy
import re
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner']);

def clean_tweets(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text);
    text = re.sub(r'https?:/\/\/S+', ' ', text);
    return text.strip();

def tokenizer(text):
    return [w.text.lower() for w in nlp(clean_tweets(text))];

In [5]:
TEXT = Field(tokenize = tokenizer);
LABEL = LabelField(dtype = torch.float);

datafields = [("Sentiment",LABEL), ("SentimentText",TEXT)];
trn, tst = TabularDataset.splits(path="../datasets/sentiment-analysis/tweets/",
                                train = "tweets_train.csv",
                                test = "tweets_test.csv",
                                format = "csv",
                                skip_header = True,
                                fields = datafields);



In [6]:
TEXT.build_vocab(trn, max_size=25000,
                vectors='glove.6B.100d',
                unk_init=torch.Tensor.normal_);
vocab_size = len(TEXT.vocab);
LABEL.build_vocab(trn);

In [7]:
print(TEXT.vocab.freqs.most_common(50));

[('i', 25644), ('the', 12219), ('to', 12111), ('you', 10723), ('a', 9197), ('it', 8440), ('and', 6889), ('my', 6208), ('quot', 5582), ('s', 5565), ('that', 5306), ('is', 5203), ('for', 4971), ('in', 4852), ('t', 4844), ('m', 4683), ('me', 4588), ('of', 4331), ('on', 3918), ('have', 3752), ('so', 3612), ('but', 3506), ('be', 2932), ('not', 2887), ('was', 2775), ('just', 2724), ('can', 2523), ('do', 2418), ('are', 2351), ('your', 2320), ('with', 2269), ('good', 2203), ('like', 2173), ('at', 2131), ('no', 2119), ('this', 2094), ('all', 2069), ('up', 2066), ('now', 2063), ('get', 2044), ('we', 1988), ('u', 1890), ('love', 1885), ('lol', 1864), ('too', 1826), ('what', 1760), ('out', 1742), ('know', 1664), ('nt', 1608), ('amp', 1539)]


In [8]:
print(TEXT.vocab.itos[0:10]);

['<unk>', '<pad>', 'i', 'the', 'to', 'you', 'a', 'it', 'and', 'my']


In [9]:
batch_size = 64;
train_itr, test_itr = BucketIterator.splits((trn, tst),
                                            batch_size = batch_size,
                                            sort_key = lambda x: len(x.SentimentText),
                                            #sort=False,
                                            #shuffle=True,
                                            sort_within_batch=False);

In [11]:
import torch.nn as nn

class SentimentAnalysisModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_dim, 
                     output_dim, n_layers, bidirectional, drpout):
        super().__init__();
        
        
    def forward(self, sms_text):
        pass

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 9)