# Data Cleaning

In [1]:
import re
import sys
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
# Suppress every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Add the parent directory to the module search path
# (presumably so the `personal_library` imports below resolve -- confirm the repo layout).
sys.path.append("../") 
from personal_library.NLP.preprocess import (
    remove_pattern,
    rm_pun_num_esp_cha,
    rm_length_word,
    tokenize,
    stemmer,
    join_tokenize,
    hashtag_extract,
    count_caps,
    hashtag_rm
)

from personal_library.NLP.data_analysis import(
    plot_labels_wordcloud,
    plot_hashtag_hist,
)

from personal_library.NLP.core.model_preprocessors import (
    corpus2vec,
    standard_word2vec_size
)

from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

# Load IPython's autoreload extension and use mode 2, which re-imports modules
# before each cell runs so edits to imported .py files are picked up.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Paths to the train and test CSV files
train_path = '../data/train_E6oV3lV.csv'
test_path = '../data/test_tweets_anuFYb8.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Stack both frames into one so the cleaning steps below run on train and test together.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat with the
# same ignore_index/sort arguments is the supported equivalent. A column that exists in
# only one of the frames is NaN in the rows coming from the other.
all_data = pd.concat([train, test], ignore_index=True, sort=True)

train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# Build the `tidy_tweet` column step by step from the raw `tweet` text.
# The regex patterns are raw strings: in a normal string literal "\w" is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+). The pattern
# text handed to remove_pattern is unchanged.

# Apply remove_pattern to every tweet with the @-mention pattern.
all_data['tidy_tweet'] = np.vectorize(remove_pattern)(all_data['tweet'], r"@[\w]*")
# Project helpers; judging by their names they strip punctuation/numbers/special
# characters and filter words by length -- confirm in personal_library.NLP.preprocess.
all_data['tidy_tweet'] = rm_pun_num_esp_cha(all_data['tidy_tweet'])
all_data['tidy_tweet'] = rm_length_word(all_data['tidy_tweet'])
tokenized_tweet = tokenize(all_data['tidy_tweet'])
# tokenized_tweet = stemmer(tokenized_tweet)
all_data['tidy_tweet'] = join_tokenize(tokenized_tweet)
# Store the hashtags in their own column first, then apply the #-hashtag pattern
# to the tidy text.
all_data['hashtag'] = hashtag_extract(all_data['tidy_tweet'], flatten=False)
all_data['tidy_tweet'] = np.vectorize(remove_pattern)(all_data['tidy_tweet'], r"#[\w]*")

In [4]:
# Re-tokenize the final tidy text; `tokenized_tweet` is what the Word2Vec step consumes.
tokenized_tweet = tokenize(all_data['tidy_tweet'])

# Replace missing labels with the 'test' sentinel that later cells use to split the
# data (`label == 'test'`). Only the label column is filled: a frame-wide fillna would
# also turn a missing value in any other column into the word 'test'.
all_data['label'] = all_data['label'].fillna('test')
all_data.to_csv('../data/pandas_data_frame.csv')

# Character count of each cleaned tweet. It is added after the CSV is written,
# so the saved file does not contain this column.
all_data["Name Length"] = all_data['tidy_tweet'].str.len()
all_data.head()

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,Name Length
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...,[run],62
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks credit cause they offer wheelchair vans,"[lyft, disapointed, getthanked]",49
2,3,0,bihday your majesty,bihday your majesty,[],19
3,4,0,#model i love u take with u all the time in ...,love take with time,[model],20
4,5,0,factsguide: society now #motivation,factsguide society,[motivation],19


In [5]:
# Show the last five rows of the combined frame
all_data.tail(n=5)

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,Name Length
49154,49155,test,thought factory: left-right polarisation! #tru...,thought factory left right polarisation,"[trump, uselections, leadership, politics, bre...",45
49155,49156,test,feeling like a mermaid ð #hairflip #neverre...,feeling like mermaid,"[hairflip, neverready, formal, wedding, gown, ...",27
49156,49157,test,#hillary #campaigned today in #ohio((omg)) &am...,today used words like assets liability neve...,"[hillary, campaigned, ohio, clinton, radicaliz...",64
49157,49158,test,"happy, at work conference: right mindset leads...",happy work conference right mindset leads cult...,"[work, mindset]",77
49158,49159,test,"my song ""so glad"" free download! #shoegaze ...",song glad free download,"[shoegaze, newmusic, newsong]",26


In [6]:
# Summary statistics of the cleaned-tweet character counts
tweet_lengths = all_data["Name Length"]
length_summary = (
    ("Median: ", tweet_lengths.median()),
    ("Media: ", tweet_lengths.mean()),
    ("Std: ", tweet_lengths.std()),
    ("MAx: ", tweet_lengths.max()),
)
for caption, value in length_summary:
    print(caption, value)

Median:  33.0
Media:  37.46933420126528
Std:  22.043034003692348
MAx:  119


In [10]:
# Number of rows whose label equals 0
label_is_zero = all_data['label'] == 0.0
np.count_nonzero(label_is_zero)

29720

# Word2Vec Corpus Transformation

In [14]:
# Load the word vectors from their word2vec-format file
vectors_path = '../data/vectors/cc.en.300.vec'
en_model = KeyedVectors.load_word2vec_format(vectors_path)

In [None]:
import timeit

# timeit.timeit() called without arguments benchmarks a no-op statement and returns
# how long that took, so subtracting two such calls does not measure the code in
# between. default_timer() returns a clock reading, which is what is needed here.
start = timeit.default_timer()

# Convert the tokenized tweets to vectors with the loaded model. The result gets its own
# name instead of `test`, so it no longer overwrites the test DataFrame loaded earlier.
vectorized_corpus = corpus2vec(en_model, tokenized_tweet, use_next=True, debug=True)

elapsed_seconds = timeit.default_timer() - start
print(elapsed_seconds)

# np.save appends the ".npy" suffix to the file name.
np.save('../data/numpy_vectorize_data', vectorized_corpus)
print(vectorized_corpus.shape)

### Clean the data and make every sample the same length (use the maximum length)

In [329]:
%reload_ext autoreload

data = np.load('../data/numpy_vectorize_data.npy')
clean_data = standard_word2vec_size(data, 300)

# Split and save the train and test data

In [330]:
# Split the standardised vectors by label: rows tagged 'test' are the unlabeled set,
# rows labeled 0 or 1 are the training set. Each mask is computed once and converted
# to a plain boolean array before indexing clean_data.
labels = all_data['label']
is_test = (labels == 'test').to_numpy()
is_labeled = labels.isin([0, 1]).to_numpy()

test = clean_data[is_test]
X = clean_data[is_labeled]
# The label column mixes numbers with the 'test' string, so it has object dtype.
# Saving an object array makes np.save fall back to pickle, and np.load then refuses
# the file unless allow_pickle=True is passed. Cast the 0/1 labels to int so y.npy
# is a plain numeric array.
y = labels[is_labeled].astype(int).to_numpy()

np.save('../data/dataset/X', X)
np.save('../data/dataset/y', y)
np.save('../data/dataset/test', test)

print(y.shape)
print(X.shape)

(31962,)
(31962, 19, 300)
