# Libraries

In [41]:
import spacy
from text_processing import *
from datetime import datetime
import pickle
import numpy as np
import emoji
from spacymoji import Emoji
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import re
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from nltk.stem.snowball import SnowballStemmer
import warnings
warnings.filterwarnings('ignore')
import joblib
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from collections import Counter

In [42]:
RUNTIME = str(datetime.today())
RUNTIME = RUNTIME.replace('-', '').replace(' ', '').replace(':', '')[0:-13]

In [43]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Spacy Settings

In [44]:
nlp = spacy.load("en")
emoji_pipeline = Emoji(nlp)
nlp.add_pipe(emoji_pipeline, first=True)
stemmer = SnowballStemmer(language='english')

# Read In Data

## Functions to Read Data

In [45]:
SCOPES = ['https://spreadsheets.google.com/feeds',
          'https://www.googleapis.com/auth/drive']
LABELED_NEWS = '1ZSZ1m4qlBRSkXT2Hjhv2lYNFHlSbfjw5r4cRIbDi1Rk'
CREDENTIALS = 'google-credentials.json'
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, SCOPES)
client = gspread.authorize(creds)

In [46]:
def read_sheet(sheet_id, sheet_number):
    sheet = client.open_by_key(sheet_id).get_worksheet(sheet_number)
    df = pd.DataFrame(sheet.get_all_records())
    df.to_csv('labeled_data.csv')
    df = pd.read_csv('labeled_data.csv')
    os.remove('labeled_data.csv')
    df = df.drop('Unnamed: 0', axis=1)
#     df.dropna(subset=['title'], inplace=True)
    
    return df[['title', 'label']]

In [47]:
df_train = read_sheet(LABELED_NEWS, 0)
# df_train.dropna(inplace=True)

In [48]:
df_test = read_sheet(LABELED_NEWS, 1)

## Create Train / Test Sets

In [49]:
VOCAB_SIZE = 500

In [50]:
encoder = LabelEncoder()

In [51]:
X = df_train.loc[:, df_train.columns != 'sentiment']
y = df_train['label']

# Pipeline

## Preprocessing Pipeline for Tokenizer

In [52]:
preprocessing_pipeline = Pipeline(
    steps=[
        ('change_common_words', CommonWordAdjustments()),
        ('remove_urls', RemoveURLs()),
        ('remove_numbers', RemoveNumbers()),
        ('remove_punctuation', RemovePunctuation()),
        ('remove_stopwords', RemoveStopwords()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('export', Export()),
        ('drop_unneeded_features', FeatureSelector('cleaned_title')),
    ]
)

## Full Pipeline

In [53]:
full_pipeline = Pipeline(
    steps=[
        ('change_common_words', CommonWordAdjustments()),
        ('remove_urls', RemoveURLs()),
        ('remove_numbers', RemoveNumbers()),
        ('remove_punctuation', RemovePunctuation()),
        ('remove_stopwords', RemoveStopwords()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('drop_unneeded_features', FeatureSelector('cleaned_title')),
        ('tokenize', TokenizeText()),
        ('model', LogisticRegression())
    ]
)

## Preprocess Text

In [54]:
preprocessing_pipeline.fit(X, y)

Pipeline(steps=[('change_common_words', CommonWordAdjustments()),
                ('remove_urls', RemoveURLs()),
                ('remove_numbers', RemoveNumbers()),
                ('remove_punctuation', RemovePunctuation()),
                ('remove_stopwords', RemoveStopwords()), ('stem', Stem()),
                ('lemmatize', Lemmatize()), ('export', Export()),
                ('drop_unneeded_features',
                 FeatureSelector(feature_names='cleaned_title'))])

## Tokenize Text

In [55]:
processed_text = pd.read_csv('processed_text.csv')

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
processed_text = pd.read_csv('processed_text.csv')
text = processed_text['cleaned_title']
vectorizer = CountVectorizer(ngram_range=(1,1))
vectorizer.fit(text)

with open(f'tokenizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Full Pipeline

In [57]:
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [1, 3]
}

In [58]:
full_pipeline.fit(X, y)
# full_pipeline = GridSearchCV(full_pipeline, param_grid)
# full_pipeline.fit(X, y)

Pipeline(steps=[('change_common_words', CommonWordAdjustments()),
                ('remove_urls', RemoveURLs()),
                ('remove_numbers', RemoveNumbers()),
                ('remove_punctuation', RemovePunctuation()),
                ('remove_stopwords', RemoveStopwords()), ('stem', Stem()),
                ('lemmatize', Lemmatize()),
                ('drop_unneeded_features',
                 FeatureSelector(feature_names='cleaned_title')),
                ('tokenize', TokenizeText()), ('model', LogisticRegression())])

# Predict - Evaluation Set

In [59]:
label_maps = {0: -1, 1: 0, 2: 1}
eval_preds = full_pipeline.predict(X)

In [60]:
Counter(eval_preds)

Counter({-1: 472, 0: 1189, 1: 645})

In [61]:
print (classification_report(eval_preds, y))

              precision    recall  f1-score   support

          -1       0.96      0.98      0.97       472
           0       0.99      0.98      0.98      1189
           1       0.98      0.98      0.98       645

    accuracy                           0.98      2306
   macro avg       0.98      0.98      0.98      2306
weighted avg       0.98      0.98      0.98      2306



In [62]:
df_train['label'].value_counts()

 0    1179
 1     646
-1     481
Name: label, dtype: int64

In [63]:
df_train['preds'] = eval_preds
df_train[df_train['preds'] != df_train['label']].head()

Unnamed: 0,title,label,preds
36,Bitcoin Bulls Falter Despite Gaining 14% In July,-1,0
53,Square isn't lighting it up on bitcoin this year,-1,0
118,5 Ultra-Popular Stocks to Avoid Like the Plague in August,-1,0
214,Digital payment platform Square to buy Afterpay for $29 bn,1,0
230,How Bitcoin Solves The Store Of Value Problem,1,0


# Export Model

In [64]:
joblib.dump(full_pipeline, f'news_bow_model_{RUNTIME}.joblib')

['news_bow_model_20210829.joblib']

# Test Set

In [65]:
loaded_model = joblib.load(f'news_bow_model_{RUNTIME}.joblib')

In [66]:
def predict_post(post, return_option='prediction'):
    df = pd.DataFrame(data={'title': [post]})
    
    probabilities = loaded_model.predict_proba(df)[0]
    confidence = np.max(probabilities, axis=0)
    prediction = label_maps[np.argmax(probabilities, axis=0)]
    
    return prediction

In [67]:
# df_train['prediction'] = df_train['title'].apply(predict_post)

In [68]:
df_test.dropna(inplace=True)

In [69]:
df_test['prediction'] = df_test['title'].apply(predict_post)

In [70]:
print(classification_report(df_test['prediction'], df_test['label']))

              precision    recall  f1-score   support

          -1       0.41      0.57      0.48        51
           0       0.77      0.71      0.74       303
           1       0.58      0.61      0.59       143

    accuracy                           0.66       497
   macro avg       0.59      0.63      0.60       497
weighted avg       0.68      0.66      0.67       497



In [71]:
df_test[df_test['prediction'] != df_test['label']]

Unnamed: 0,title,label,prediction
1,Immutable’s Guild of the Guardians NFT game teams up with NRG Esports,0,1
11,Small Traders Pile Back Into Cryptocurrencies,0,1
19,Bitcoin’s off-chain data points to more upward momentum for BTC price,1,0
24,CFTC Commissioner Stresses: Ethereum Is Under Our Jurisdiction,-1,0
28,Ukraine’s Security Service Closes Illegal Cryptocurrency Exchanges,-1,0
30,"Infrastructure bill passes, Coinbase posts $1.6 billion in Q2 profit, $600 million stolen in DeFi hack: Hodler’s Digest, Aug.8-14",0,-1
31,"NFTs are unlocking a new future for sports fans, even if not everyone’s sold",1,0
32,Polygon Hermez: The First Full-Blown Merger of Two Blockchain Networks,0,1
35,"Bitcoin’s Blockchain Is The Timechain, Let’s Call It That",0,1
38,"Bitcoin Rises Over 7 Percent to Breach $47,500",1,0


# Examples

In [72]:
predict_post("""
Story from Markets Cardano Price Hits All-Time High, Overtakes Binance Coin as Third-Most Valuable Crypto
""")

1

In [73]:
predict_post("""
Binance Suspends Futures in Brazil Citing Regulatory Requirements
""")

-1

In [74]:
predict_post("""
What’s Really Going on With OnlyFans and Payment Censorship
""")

0

In [75]:
predict_post("""
Sentinel Network Reports Theft of 40M DVPN Coins in HitBTC Breach
""")

-1

In [76]:
predict_post("""
Market Wrap: Bitcoin Rallies Ahead of $50K Resistance
""")

-1

In [77]:
predict_post("""
SEC Secures Judgments Against 3 in Bitconnect Scam
""")

-1

In [78]:
predict_post("""
Why Bitcoin, Ethereum, and Dogecoin Are All Soaring Today
""")

1

In [79]:
predict_post("""
Ethereum Co-Founder Not Sold on Bitcoin-Fueled DeFi
""")

-1

In [80]:
predict_post("""
AMP price prediction 2021: Can the cryptocurrency reach $0.1?
""")

1