In [None]:
import numpy as np
import pandas as pd

# Show the full contents of text cells instead of truncating them.
# None means "no limit"; the old -1 sentinel is deprecated and is
# rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
df = pd.read_csv('graph-datav4.csv')

In [None]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [None]:
df['user_id'].nunique()

In [None]:
df[df['user_id'] <= 2]

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Print the number of missing values in each column of interest, one per line.
for column in ('user_id', 'text', 'favorites_count', 'verified', 'followers_count'):
    print(df[column].isna().sum())

In [None]:
# Replace missing tweets with a placeholder string. The filled column is
# assigned back explicitly: calling fillna(inplace=True) on `df.text` is a
# chained in-place operation that may act on a temporary object and leave
# `df` unchanged (pandas warns about it, and it has no effect under
# copy-on-write).
df['text'] = df['text'].fillna('nothing to say')

In [None]:
print(sum(df.user_id.isna()))
print(sum(df.text.isna()))
print(sum(df.favorites_count.isna()))
print(sum(df.verified.isna()))
print(sum(df.followers_count.isna()))

In [None]:
df['user_id'].nunique()

## Word Embeddings using gensim

In [None]:
import gensim

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Loop-invariant helpers: built once here rather than once per document.
punct = str.maketrans('', '', string.punctuation)
stop = set(stopwords.words('english'))


def clean_tokens(document):
    """Turn one document into a list of cleaned tokens.

    Steps, in order: tokenize with nltk's word_tokenize, lowercase,
    strip punctuation characters, keep purely alphabetic tokens, and
    drop English stopwords.

    Parameters
    ----------
    document : str
        Raw text of a single row.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    stripped = (t.lower().translate(punct) for t in word_tokenize(document))
    return [t for t in stripped if t.isalpha() and t not in stop]


# One token list per row of df['text'], in row order. The per-document
# debug print was removed: it echoed every tweet into the cell output.
text = [clean_tokens(document) for document in df['text'].values.tolist()]

In [None]:
# Number of tokenized documents collected in `text`.
len(text)

In [None]:
# Train word vectors on the token lists built above (100 dimensions,
# context window of 2, 3 worker threads, min_count of 1).
# No seed is passed, so results may differ between runs.
# NOTE(review): `size` is the keyword used by gensim 3.x; gensim 4 renamed
# it to `vector_size` — confirm against the installed gensim version.
model = gensim.models.Word2Vec(sentences = text, size = 100, window = 2, workers = 3, min_count = 1)

In [None]:
# Collect the vocabulary words and report how many there are.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 exposes
# `wv.key_to_index` instead — confirm against the installed gensim version.
words = list(model.wv.vocab)
print('Vocab size', len(words))

In [None]:
# Displays the complete vocabulary list as the cell output.
words

## Most Similar Words

In [None]:
# Words closest to 'tweet' in the trained embedding space.
# NOTE(review): presumably fails if 'tweet' is not in the vocabulary — verify.
model.wv.most_similar('tweet')

## Save vectors

In [None]:
max_length = max([len(s.split()) for s in df.text])

In [None]:
max_length

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
file = 'twitter_charlie_manual_word2vec_graph.txt'

In [None]:
model.wv.save_word2vec_format(file,binary = False)

In [None]:
import os
emb = {}
f = open(os.path.join('', 'twitter_charlie_manual_word2vec_graph.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    value = np.asarray(values[1:])
    emb[word] = value
f.close()

In [None]:
token = Tokenizer()
token.fit_on_texts(text)
seq = token.texts_to_sequences(text)
index = token.word_index
print('unique no of tokens ', index)
padding = pad_sequences(seq, maxlen = max_length)
rumour = df['label'].values

In [None]:
padding[13]

In [None]:
# Build the id -> vector lookup table: one row per word id plus one extra
# row, so ids up to len(index) are valid row positions. The width of 100
# must match the vector size used when training the Word2Vec model.
n = len(index) + 1
embed_mat = np.zeros((n, 100))
for word, word_id in index.items():
    # Rows are 0 .. n-1, so an id equal to n would already be out of range
    # (the original `> n` guard let that value through).
    if word_id >= n:
        continue
    embed_vec = emb.get(word)
    if embed_vec is not None:
        # Vectors read back from the text file may still be strings;
        # convert explicitly rather than relying on implicit casting.
        embed_mat[word_id] = np.asarray(embed_vec, dtype = float)
    # Words without a stored vector keep their all-zero row.

In [None]:
# (number of rows, vector width) of the embedding lookup table.
embed_mat.shape

In [None]:
# Number of distinct words known to the tokenizer.
len(index)

In [None]:
# Number of encoded documents.
len(padding)

In [None]:
# Vocabulary words in sorted order; used below as the feature column names.
sorted_words = sorted(index)
#print(sorted_words)

In [None]:
# Displays every vocabulary word as the cell output.
index.keys()

## Map words with their vectors

In [None]:
features = {}

In [None]:
d = np.arange(0, len(padding))

In [None]:
data = np.zeros((0, len(df)))

In [None]:
feat_df = pd.DataFrame(index = d, columns = sorted_words)
#print(feat_df)

In [None]:
feat_df.shape

In [None]:
feat_df.fillna(0.0, inplace = True)

In [None]:
feat_df.head()

In [None]:
# Combine features: for every document, each occurrence of a word adds the
# mean of that word's embedding vector to the word's column.
#
# This yields the same values as scanning the whole vocabulary for every
# token and updating the DataFrame cell by cell, but it resolves ids through
# dictionaries and accumulates in a NumPy array, so the cost no longer grows
# with (documents x sequence length x vocabulary size). The per-row debug
# print was dropped as well.

# Mean of the embedding row of every word id.
mean_of_id = {word_id: embed_mat[word_id].mean() for word_id in index.values()}
# Column position of every word id inside feat_df.
position_of_word = {name: position for position, name in enumerate(feat_df.columns)}
column_of_id = {word_id: position_of_word[word] for word, word_id in index.items()}

totals = feat_df.values.astype(float)
for row, sequence in enumerate(padding):
    for number in sequence.tolist():
        column = column_of_id.get(number)
        if column is None:
            # Id with no vocabulary entry (e.g. the zeros added by padding).
            continue
        totals[row, column] += mean_of_id[number]

feat_df = pd.DataFrame(totals, index = feat_df.index, columns = feat_df.columns)

In [None]:
feat_df.to_csv('features of word2vec embeddings.csv')

In [None]:
feat_df = pd.read_csv('features of word2vec embeddings.csv')
feat_df.head()

In [None]:
feat_df.info()

In [None]:
feat_df=feat_df.fillna(0)

In [None]:
feat_df.shape

## Combining features into one DataFrame

In [None]:
df.drop('text', axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
X_new = pd.concat([feat_df, df], axis=1)

In [None]:
X_new.shape

In [None]:
X_new.fillna(0)

In [None]:
X_new.to_csv('charlie_combined_word2vec_features_with_original_features.csv')

In [None]:
import pandas as pd
# Read the exported table back to check what was written.
checking = pd.read_csv('charlie_combined_word2vec_features_with_original_features.csv')

In [None]:
# Number of distinct user ids in the exported file.
checking['user_id'].nunique()

In [None]:
checking.shape

In [None]:
# Rows whose user_id equals 0.
checking[checking['user_id'] == 0]