# Libraries

In [1]:
import spacy
from text_processing import *
from datetime import datetime
import pickle
import numpy as np
import emoji
from spacymoji import Emoji
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import re
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from nltk.stem.snowball import SnowballStemmer
import warnings
warnings.filterwarnings('ignore')
import joblib
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from collections import Counter

In [2]:
RUNTIME = str(datetime.today())
RUNTIME = RUNTIME.replace('-', '').replace(' ', '').replace(':', '')[0:-13]

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Spacy Settings

In [4]:
nlp = spacy.load("en")
emoji_pipeline = Emoji(nlp)
nlp.add_pipe(emoji_pipeline, first=True)
stemmer = SnowballStemmer(language='english')

# Read In Data

## Functions to Read Data

In [5]:
SCOPES = ['https://spreadsheets.google.com/feeds',
          'https://www.googleapis.com/auth/drive']
LABELED_NEWS = '1ZSZ1m4qlBRSkXT2Hjhv2lYNFHlSbfjw5r4cRIbDi1Rk'
CREDENTIALS = 'google-credentials.json'
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, SCOPES)
client = gspread.authorize(creds)

In [6]:
def read_sheet(sheet_id, sheet_number):
    sheet = client.open_by_key(sheet_id).get_worksheet(sheet_number)
    df = pd.DataFrame(sheet.get_all_records())
    df.to_csv('labeled_data.csv')
    df = pd.read_csv('labeled_data.csv')
    os.remove('labeled_data.csv')
    df = df.drop('Unnamed: 0', axis=1)
#     df.dropna(subset=['title'], inplace=True)
    
    return df[['title', 'label']]

In [7]:
df_train = read_sheet(LABELED_NEWS, 0)
# df_train.dropna(inplace=True)

In [8]:
df_test = read_sheet(LABELED_NEWS, 1)

## Create Train / Test Sets

In [9]:
VOCAB_SIZE = 500

In [10]:
encoder = LabelEncoder()

In [11]:
X = df_train.loc[:, df_train.columns != 'sentiment']
y = df_train['label']

# Custom Transformers

## Common Word Adjustments

In [12]:
class CommonWordAdjustments(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def common_word_adjustments(self, title):
        title = title.lower()
        
        # Words for increases
        for word in ['increases', 'jumps', 'soars', 'climb', 'climbs', 'rallies', 'surges',
                    'passes', 'rises', 'emerging']:
            title = title.replace(word, ' __INCREASES__ ')
        
        # Words for decreases
        for word in ['falls', 'declines', 'decreases']:
            title = title.replace(word, ' __DECREAES__ ')

        return title
    
    def transform(self, X, y=None):
        X_ = X.copy()
        X_['cleaned_title'] = X_['title'].apply(self.common_word_adjustments)
        
        
        
        return X_

## Remove URLs

In [13]:
class RemoveURLs(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def remove_urls(self, title):
        cleaned_title = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' URL ', title)
        cleaned_title = cleaned_title.lower()
        return cleaned_title

    def transform(self, X, y=None):
        X_ = X.copy()
        X_['cleaned_title'] = X_['cleaned_title'].apply(self.remove_urls)
        
        return X_

## Remove Stopwords

In [14]:
class RemoveStopwords(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def remove_stopwords(self, title):
        title = title.replace('\n', '')
        try:
            text = nlp(title)
        except:
            return title
        cleaned_title = [token.orth_.lower() for token in text if not token.is_stop]
        return ' '.join(cleaned_title)

    def transform(self, X, y=None):
        X_ = X.copy()
        X_['cleaned_title'] = X_['cleaned_title'].apply(self.remove_stopwords)
        
        return X_

## Remove Punctuation

In [15]:
class RemovePunctuation(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def remove_punctuation(self, title):
        title = title.replace('\n', '')
        title = title.replace('$', '')
        try:
            text = nlp(title)
        except:
            return title
        cleaned_title = [token.orth_.lower() for token in text if not token.is_punct]
        return ' '.join(cleaned_title)

    def transform(self, X, y=None):
        X_ = X.copy()
        X_['cleaned_title'] = X_['cleaned_title'].apply(self.remove_punctuation)

        return X_

## Remove Number

In [16]:
class RemoveNumbers(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def remove_numbers(self, title):
        
        # Other numbers
        numbers = re.findall(r'\d+(?:,\d+)?', title)
        for number in numbers:
            title = title.replace(number, 'NUMBER ')

        return title

    def transform(self, X, y=None):

        X_ = X.copy()
        X_['cleaned_title'] = X_['cleaned_title'].apply(self.remove_numbers)
        
        return X_
    


## Lemmatize

In [17]:
class Lemmatize(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def lemmatize(self, title):
        try:
            text = nlp(title)
        except:
            return title
        return ' '.join([token.lemma_ for token in text])

    def transform(self, X, y=None):

        X_ = X.copy()
        X_['cleaned_title'] = X_['cleaned_title'].apply(self.lemmatize)
        
        return X_

## Stem

In [18]:
class Stem(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def stem(self, title):
        text = title.split()
        cleaned_title = []
        for token in text:
            cleaned_title.append(stemmer.stem(token))

        return ' '.join(cleaned_title)


    def transform(self, X, y=None):

        X_ = X.copy()
        X_['cleaned_title'] = X_['cleaned_title'].apply(self.stem)
        
        return X_

## Feature Selector

In [19]:
class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.feature_names]

## Export

In [20]:
class Export(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X.to_csv('processed_text.csv')
        return X

## Tokenize

In [21]:
class TokenizeText(BaseEstimator, TransformerMixin):

    def __init__(self, mode='binary', vocab_size=200):
        self.mode = mode
        self.vocab_size = vocab_size

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Copy the df
        X_ = X.copy()
        
        # Load tokenizer
        with open(f'tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        
        X_ = tokenizer.transform(X_, self.mode)
        
        return X_

## SKLearn

In [22]:
class TokenizeText(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Copy the df
        X_ = X.copy()
        
        # Load tokenizer
        with open(f'tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        
        X_ = tokenizer.transform(X_)
        
        return X_

# Pipeline

## Preprocessing Pipeline for Tokenizer

In [23]:
preprocessing_pipeline = Pipeline(
    steps=[
        ('change_common_words', CommonWordAdjustments()),
        ('remove_urls', RemoveURLs()),
        ('remove_numbers', RemoveNumbers()),
        ('remove_punctuation', RemovePunctuation()),
        ('remove_stopwords', RemoveStopwords()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('export', Export()),
        ('drop_unneeded_features', FeatureSelector('cleaned_title')),
    ]
)

## Full Pipeline

In [24]:
full_pipeline = Pipeline(
    steps=[
        ('change_common_words', CommonWordAdjustments()),
        ('remove_urls', RemoveURLs()),
        ('remove_numbers', RemoveNumbers()),
        ('remove_punctuation', RemovePunctuation()),
        ('remove_stopwords', RemoveStopwords()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('drop_unneeded_features', FeatureSelector('cleaned_title')),
        ('tokenize', TokenizeText()),
        ('model', LogisticRegression())
    ]
)

## Preprocess Text

In [25]:
preprocessing_pipeline.fit(X, y)

Pipeline(steps=[('change_common_words', CommonWordAdjustments()),
                ('remove_urls', RemoveURLs()),
                ('remove_numbers', RemoveNumbers()),
                ('remove_punctuation', RemovePunctuation()),
                ('remove_stopwords', RemoveStopwords()), ('stem', Stem()),
                ('lemmatize', Lemmatize()), ('export', Export()),
                ('drop_unneeded_features',
                 FeatureSelector(feature_names='cleaned_title'))])

## Tokenize Text

In [26]:
processed_text = pd.read_csv('processed_text.csv')

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
processed_text = pd.read_csv('processed_text.csv')
text = processed_text['cleaned_title']
vectorizer = CountVectorizer(ngram_range=(1,1))
vectorizer.fit_transform(text)

with open(f'tokenizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
# text = processed_text['cleaned_title']
# tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True, oov_token=True)
# tokenizer.fit_on_texts(text)
# text = tokenizer.texts_to_matrix(text, 'count')

In [29]:
# with open(f'tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Full Pipeline

In [30]:
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [1, 3]
}

In [31]:
full_pipeline.fit(X, y)
# full_pipeline = GridSearchCV(full_pipeline, param_grid)
# full_pipeline.fit(X, y)

Pipeline(steps=[('change_common_words', CommonWordAdjustments()),
                ('remove_urls', RemoveURLs()),
                ('remove_numbers', RemoveNumbers()),
                ('remove_punctuation', RemovePunctuation()),
                ('remove_stopwords', RemoveStopwords()), ('stem', Stem()),
                ('lemmatize', Lemmatize()),
                ('drop_unneeded_features',
                 FeatureSelector(feature_names='cleaned_title')),
                ('tokenize', TokenizeText()), ('model', LogisticRegression())])

# Predict - Evaluation Set

In [32]:
label_maps = {0: -1, 1: 0, 2: 1}
eval_preds = full_pipeline.predict(X)

In [33]:
Counter(eval_preds)

Counter({-1: 470, 0: 1191, 1: 645})

In [34]:
print (classification_report(eval_preds, y))

              precision    recall  f1-score   support

          -1       0.95      0.98      0.97       470
           0       0.98      0.97      0.97      1191
           1       0.96      0.96      0.96       645

    accuracy                           0.97      2306
   macro avg       0.96      0.97      0.96      2306
weighted avg       0.97      0.97      0.97      2306



In [35]:
df_train['label'].value_counts()

 0    1179
 1     646
-1     481
Name: label, dtype: int64

In [36]:
df_train['preds'] = eval_preds
df_train[df_train['preds'] != df_train['label']]

Unnamed: 0,title,label,preds
36,Bitcoin Bulls Falter Despite Gaining 14% In July,-1,0
45,PayPal started recruiting employees to work with cryptocurrency,1,0
53,Square isn't lighting it up on bitcoin this year,-1,0
60,Dolphin Stock Booms on Deal with FTX Exchange,1,0
118,5 Ultra-Popular Stocks to Avoid Like the Plague in August,-1,0
170,"Hive Blockchain Has Purchased Over 10,000 Bitcoin Miners This Year",1,0
230,How Bitcoin Solves The Store Of Value Problem,1,0
246,"Top 5 cryptocurrencies to watch this week: BTC, UNI, LINK, SOL, XMR",1,0
320,Why Creator Coins May Be The Next Big Thing For Entrepreneurs,1,0
365,How does fake news influence blockchain technology?,-1,0


# Export Model

In [37]:
joblib.dump(full_pipeline, f'news_bow_model_{RUNTIME}.joblib')

['news_bow_model_20210828.joblib']

# Test Set

In [38]:
loaded_model = joblib.load(f'news_bow_model_{RUNTIME}.joblib')

In [39]:
def predict_post(post, return_option='prediction'):
    df = pd.DataFrame(data={'title': [post]})
    
    probabilities = loaded_model.predict_proba(df)[0]
    confidence = np.max(probabilities, axis=0)
    prediction = label_maps[np.argmax(probabilities, axis=0)]
    
    return prediction

In [40]:
# df_train['prediction'] = df_train['title'].apply(predict_post)

In [41]:
df_test.dropna(inplace=True)

In [42]:
df_test['prediction'] = df_test['title'].apply(predict_post)

In [43]:
print(classification_report(df_test['prediction'], df_test['label']))

              precision    recall  f1-score   support

          -1       0.47      0.65      0.55        43
           0       0.81      0.73      0.76       241
           1       0.60      0.64      0.62       113

    accuracy                           0.69       397
   macro avg       0.63      0.67      0.64       397
weighted avg       0.71      0.69      0.70       397



In [44]:
df_test[df_test['prediction'] != df_test['label']]

Unnamed: 0,title,label,prediction
1,Immutable’s Guild of the Guardians NFT game teams up with NRG Esports,0,1
24,CFTC Commissioner Stresses: Ethereum Is Under Our Jurisdiction,-1,0
28,Ukraine’s Security Service Closes Illegal Cryptocurrency Exchanges,-1,0
30,"Infrastructure bill passes, Coinbase posts $1.6 billion in Q2 profit, $600 million stolen in DeFi hack: Hodler’s Digest, Aug.8-14",0,1
31,"NFTs are unlocking a new future for sports fans, even if not everyone’s sold",1,0
35,"Bitcoin’s Blockchain Is The Timechain, Let’s Call It That",0,1
53,"Wealth managers gain exposure to Bitcoin via Grayscale, according to new SEC filings",1,0
60,Crypto Tug Of War: President And Central Bank of Argentina Don’t See Eye To Eye,0,-1
61,Bitcoin's Surge Lacks Extreme Leverage That Supported Its Past Rallies,-1,1
62,How to mine ether for maximum profits: The CEO of a company that operates 7 mining farms breaks down how to pick the right equipment and manage electricity costs for optimal gains,1,0


# Examples

In [45]:
predict_post("""
Story from Markets Cardano Price Hits All-Time High, Overtakes Binance Coin as Third-Most Valuable Crypto
""")

1

In [46]:
predict_post("""
Binance Suspends Futures in Brazil Citing Regulatory Requirements
""")

-1

In [47]:
predict_post("""
What’s Really Going on With OnlyFans and Payment Censorship
""")

0

In [48]:
predict_post("""
Sentinel Network Reports Theft of 40M DVPN Coins in HitBTC Breach
""")

-1

In [49]:
predict_post("""
Market Wrap: Bitcoin Rallies Ahead of $50K Resistance
""")

1

In [50]:
predict_post("""
SEC Secures Judgments Against 3 in Bitconnect Scam
""")

0

In [51]:
predict_post("""
Why Bitcoin, Ethereum, and Dogecoin Are All Soaring Today
""")

1

In [52]:
predict_post("""
Ethereum Co-Founder Not Sold on Bitcoin-Fueled DeFi
""")

-1

In [53]:
predict_post("""
AMP price prediction 2021: Can the cryptocurrency reach $0.1?
""")

1