In [14]:
import pandas as pd
import numpy as np
import nltk

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

In [15]:
# Pretrained Google News word2vec: the actual load call below is commented out,
# so this cell only imports the gensim downloader and fetches NLTK data.
import gensim.downloader as api
# w2v = api.load('word2vec-google-news-300')
# Download the WordNet corpus via NLTK (presumably required by the
# WordNetLemmatizer used in the lemmatizing cell — confirm).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Read the CSV at the relative path Data/sentiment140.csv into a DataFrame.
dataset = pd.read_csv('Data/sentiment140.csv')
# Bare expression: the notebook renders the frame as this cell's output.
dataset

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
...,...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,4,NO_QUERY
1599996,TheWDB.com - Very cool to hear old Walt interv...,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,4,NO_QUERY
1599997,Are you ready for your MoJo Makeover? Ask me f...,Tue Jun 16 08:40:49 PDT 2009,bpbabe,4,NO_QUERY
1599998,Happy 38th Birthday to my boo of alll time!!! ...,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,4,NO_QUERY


In [17]:
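# Lowercase the text, replace web addresses and @mentions with placeholders,
# then strip punctuation, bracketed spans, newlines and tokens containing digits.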

import re
import string

# Patterns are written as raw strings (the previous plain-string forms relied on
# invalid escape sequences such as '\S', which newer Python versions warn about)
# and compiled once instead of being looked up on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_MENTION_RE = re.compile(r'@\S+')
_BRACKETED_RE = re.compile(r'\[.*?\]')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')

# Characters deleted outright at the end of cleaning: every ASCII punctuation
# mark plus the Arabic comma (U+060C, code point 1548) and the Arabic question
# mark (U+061F, code point 1567).
_SIGN_DELETE_TABLE = str.maketrans('', '', string.punctuation + '\u060c\u061f')


def clean_en_text(text):
    """Normalize one raw tweet into lowercase, punctuation-free text.

    Steps, in order:
      1. Lowercase the text.
      2. Replace web addresses with ``<URL>`` and ``@``-mentions with ``<MENTION>``.
      3. Replace ``[...]`` spans, ASCII punctuation, newlines and any token that
         contains a digit with a single space each.
      4. Delete the remaining sign characters (in practice only the Arabic
         comma and question mark, since ASCII punctuation is already gone).

    Because the punctuation step runs after the placeholders are inserted, the
    angle brackets are stripped and the placeholders end up as the bare
    upper-case tokens ``URL`` and ``MENTION``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where characters were removed.
    """
    text = text.lower()
    text = _URL_RE.sub('<URL>', text)
    text = _MENTION_RE.sub('<MENTION>', text)
    text = _BRACKETED_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = text.replace('\n', ' ')
    text = _DIGIT_TOKEN_RE.sub(' ', text)

    # str.translate removes the sign characters in a single pass instead of
    # scanning a 32-element list for every character of every tweet.
    return text.translate(_SIGN_DELETE_TABLE)

# Clean every tweet in place: the 'text' column now holds the normalized strings.
dataset['text'] = dataset['text'].map(clean_en_text)

In [18]:
# Tokenizing
# Each cleaned tweet is split into its runs of word characters (\w+),
# turning the 'text' column from strings into lists of tokens.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
dataset['text'] = dataset['text'].apply(tokenizer.tokenize)

Unnamed: 0,text,date,user,sentiment,query
0,"[MENTION, URL, awww, that, s, a, bummer, you, ...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,"[is, upset, that, he, can, t, update, his, fac...",Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,"[MENTION, i, dived, many, times, for, the, bal...",Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,"[my, whole, body, feels, itchy, and, like, its...",Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"[MENTION, no, it, s, not, behaving, at, all, i...",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
...,...,...,...,...,...
1599995,"[just, woke, up, having, no, school, is, the, ...",Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,4,NO_QUERY
1599996,"[thewdb, com, very, cool, to, hear, old, walt,...",Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,4,NO_QUERY
1599997,"[are, you, ready, for, your, mojo, makeover, a...",Tue Jun 16 08:40:49 PDT 2009,bpbabe,4,NO_QUERY
1599998,"[happy, birthday, to, my, boo, of, alll, time,...",Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,4,NO_QUERY


In [19]:
#Lemmatizing
from nltk.stem import WordNetLemmatizer

def lemmatizer(text_list):
    """Return a new list with every token of ``text_list`` lemmatized.

    A fresh ``WordNetLemmatizer`` is created on each call and its
    ``lemmatize`` method is applied to each token with default arguments.
    The input list is not modified.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in text_list]


# Lemmatize the token list of every tweet, overwriting the 'text' column.
dataset['text'] = dataset['text'].apply(lemmatizer)

Unnamed: 0,text,date,user,sentiment,query
0,"[MENTION, URL, awww, that, s, a, bummer, you, ...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,"[is, upset, that, he, can, t, update, his, fac...",Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,"[MENTION, i, dived, many, time, for, the, ball...",Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,"[my, whole, body, feel, itchy, and, like, it, ...",Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"[MENTION, no, it, s, not, behaving, at, all, i...",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
...,...,...,...,...,...
1599995,"[just, woke, up, having, no, school, is, the, ...",Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,4,NO_QUERY
1599996,"[thewdb, com, very, cool, to, hear, old, walt,...",Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,4,NO_QUERY
1599997,"[are, you, ready, for, your, mojo, makeover, a...",Tue Jun 16 08:40:49 PDT 2009,bpbabe,4,NO_QUERY
1599998,"[happy, birthday, to, my, boo, of, alll, time,...",Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,4,NO_QUERY


In [20]:
# Show the first 20 tweets (as lemmatized token lists)
dataset['text'].head(20)

0     [MENTION, URL, awww, that, s, a, bummer, you, ...
1     [is, upset, that, he, can, t, update, his, fac...
2     [MENTION, i, dived, many, time, for, the, ball...
3     [my, whole, body, feel, itchy, and, like, it, ...
4     [MENTION, no, it, s, not, behaving, at, all, i...
5                      [MENTION, not, the, whole, crew]
6                                        [need, a, hug]
7     [MENTION, hey, long, time, no, see, yes, rain,...
8              [MENTION, nope, they, didn, t, have, it]
9                             [MENTION, que, me, muera]
10     [spring, break, in, plain, city, it, s, snowing]
11                      [i, just, re, pierced, my, ear]
12    [MENTION, i, couldn, t, bear, to, watch, it, a...
13    [MENTION, it, it, count, idk, why, i, did, eit...
14    [MENTION, i, would, ve, been, the, first, but,...
15    [MENTION, i, wish, i, got, to, watch, it, with...
16    [hollis, death, scene, will, hurt, me, severel...
17                               [about, to, fil

In [21]:
#train, test separation
from sklearn.model_selection import train_test_split

# Drop the metadata columns that are not used as features or labels.
# errors='ignore' keeps this cell re-runnable: `dataset` is reassigned here, so
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
dataset = dataset.drop(columns=['query', 'user', 'date'], errors='ignore')

# 80% of the rows go to training and the rest to testing; random_state pins
# the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['sentiment'],
    train_size=0.8,
    random_state=42,
)

In [34]:
def create_vocabulary(all_text):
    """Build a token -> integer id mapping from an iterable of token lists.

    Ids 0, 1 and 2 are reserved for the special entries 'PAD', 'END' and
    'UNK'. Every other token receives the next free id the first time it is
    encountered, so ids follow first-appearance order. A token equal to one
    of the special entries keeps the reserved id.
    """
    vocabulary = {'PAD': 0, 'END': 1, 'UNK': 2}

    for tokens in all_text:
        for token in tokens:
            # setdefault only inserts when the token is new; the id is the
            # vocabulary size before insertion.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [23]:
#training Word2Vec with x_train
from gensim.models import Word2Vec
# Configure the model: 128-dimensional vectors and a context window of 5.
# No corpus is passed on this line, so vocabulary building and training
# presumably happen in the code that follows — confirm.
# NOTE(review): sg=0 presumably selects the CBOW architecture — confirm against the gensim docs.
model = Word2Vec(vector_size=128, window=5, sg=0)