In [13]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

### Data Import

In [8]:
df = pd.read_csv('./data/btc_tweets.csv', lineterminator='\n')
df.head()

Unnamed: 0.1,Unnamed: 0,date,screen_name,tweet,tweet_id
0,0.0,2020-10-31 23:58:52,Sarah85381260,Yworld Finance Airdrop is now Live🚀💰🏆\n\nClick...,1.3226895270187622e+18
1,1.0,2020-10-31 23:59:40,TraderKoz,What are you doing when #Bitcoin makes a new ATH?,1.3226897280182272e+18
2,2.0,2020-10-31 23:57:38,Solipsist18,#Bitcoin: BTC-USD ( $BTC-USD) UP 2.1721%! Last...,1.3226892163425935e+18
3,3.0,2020-10-31 23:58:01,Yayaya18948871,Royal Defi Airdrop is now Live🚀💰🏆\n\nClick on ...,1.3226893124213391e+18
4,4.0,2020-10-31 23:58:35,k_filippop,@BitcoinWhaIe #bitcoin,1.322689456642687e+18


In [9]:
df = df.rename(columns={'tweet':'original_tweets'})
df = df.drop(['Unnamed: 0','tweet_id'], axis=1)
df

Unnamed: 0,date,screen_name,original_tweets
0,2020-10-31 23:58:52,Sarah85381260,Yworld Finance Airdrop is now Live🚀💰🏆\n\nClick...
1,2020-10-31 23:59:40,TraderKoz,What are you doing when #Bitcoin makes a new ATH?
2,2020-10-31 23:57:38,Solipsist18,#Bitcoin: BTC-USD ( $BTC-USD) UP 2.1721%! Last...
3,2020-10-31 23:58:01,Yayaya18948871,Royal Defi Airdrop is now Live🚀💰🏆\n\nClick on ...
4,2020-10-31 23:58:35,k_filippop,@BitcoinWhaIe #bitcoin
...,...,...,...
15265,2020-10-31 10:53:04,D40five,@ModernInvest Happy #Bitcoin Birthday TMI.!!
15266,2020-10-31 10:52:01,dabildor,Ücretsiz bitcoin kazanmak ister misiniz? Telef...
15267,2020-10-31 10:54:36,arcbtc,Happy #bitcoin whitepaper day all!\n🤩🚀🌑\nhttps...
15268,2020-10-31 10:54:42,didar_bekbau,Happy Whitepaper Day all my #Bitcoin fam! 🎉🎉🎉


### Data Cleaning

In [10]:
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

In [11]:
df.head()

Unnamed: 0,date,screen_name,original_tweets,emoji_free_tweets,url_free_tweets
0,2020-10-31 23:58:52,Sarah85381260,Yworld Finance Airdrop is now Live🚀💰🏆\n\nClick...,Yworld Finance Airdrop is now Click on below l...,Yworld Finance Airdrop is now Click on below l...
1,2020-10-31 23:59:40,TraderKoz,What are you doing when #Bitcoin makes a new ATH?,What are you doing when #Bitcoin makes a new ATH?,What are you doing when #Bitcoin makes a new ATH?
2,2020-10-31 23:57:38,Solipsist18,#Bitcoin: BTC-USD ( $BTC-USD) UP 2.1721%! Last...,#Bitcoin: BTC-USD ( $BTC-USD) UP 2.1721%! Last...,#Bitcoin: BTC-USD ( $BTC-USD) UP 2.1721%! Last...
3,2020-10-31 23:58:01,Yayaya18948871,Royal Defi Airdrop is now Live🚀💰🏆\n\nClick on ...,Royal Defi Airdrop is now Click on below link ...,Royal Defi Airdrop is now Click on below link ...
4,2020-10-31 23:58:35,k_filippop,@BitcoinWhaIe #bitcoin,@BitcoinWhaIe #bitcoin,@BitcoinWhaIe #bitcoin


### Tokenization

In [16]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
import en_core_web_lg
nlp = en_core_web_lg.load()
# nlp = spacy.load('en_core_web_lg')

ModuleNotFoundError: No module named 'en_core_web_lg'

In [None]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens