# Libraries

In [2]:
from social_text_processing import *
from datetime import datetime
import pickle
import numpy as np
import emoji
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import re
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from nltk.stem.snowball import SnowballStemmer
import warnings
warnings.filterwarnings('ignore')
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import os
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

In [3]:
RUNTIME = str(datetime.today())
RUNTIME = RUNTIME.replace('-', '').replace(' ', '').replace(':', '')[0:-13]

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Read In Data

## Functions to Read Data

In [5]:
SCOPES = ['https://spreadsheets.google.com/feeds',
          'https://www.googleapis.com/auth/drive']
LABELED_POSTS_SHEET = '1Dd5Ug-PeAOSrPm4zUUhBTvZkjgm14HWZwY_h8oj79Xg'
CREDENTIALS = 'google-credentials.json'
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, SCOPES)
client = gspread.authorize(creds)

In [6]:
def read_sheets(sheet_id):
    df = pd.DataFrame()
    spreadsheet = client.open_by_key(sheet_id)
    
    for i in range(len(spreadsheet.worksheets())):
        this_df = pd.DataFrame(spreadsheet.get_worksheet(i).get_all_records())
        df = df.append(this_df)
    
    df = df[['text', 'label']]
    df.dropna(inplace=True)
    df = df[df['label'] != '']
    df = df[df['text'] != '']
    
    return df
        

In [7]:
df = read_sheets(LABELED_POSTS_SHEET)
# df_train.dropna(inplace=True)

In [8]:
df['label'].value_counts()

 0    3209
 1     823
-1     315
Name: label, dtype: int64

# Resample

In [9]:
positives = df[df['label'] == 1]
negatives = df[df['label'] == -1]
neutral = df[df['label'] == 0]
sampled_neutral = neutral.sample(frac=1)[0:1000]

In [10]:
combined = positives.append(negatives).append(sampled_neutral).sample(frac=1)

In [11]:
combined.shape[0]

2138

## Create Train / Test Sets

In [12]:
VOCAB_SIZE = 3000
TRAIN_PORTION = .5

In [13]:
X = df['text']
y = df['label'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, 
                                                    shuffle=True, train_size=TRAIN_PORTION)

In [14]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_fitted = label_encoder.transform(y_train)
y_test_fitted = label_encoder.transform(y_test)

# Emojis

In [15]:
nlp = spacy.load('en')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)

# Pipeline

## Preprocessing Pipeline for Tokenizer

In [16]:
preprocessing_pipeline = Pipeline(
    steps=[
        ('remove_punctuation', RemovePunctuation()),
        ('remove_urls', RemoveURLs()),
        ('remove_usernames', RemoveUsernames()),
        ('remove_stopwords', RemoveStopwords()),
        ('convert_emojis', ConvertEmojis()),
#         ('change_common_words', CommonWordAdjustments()),
        ('remove_numbers', RemoveNumbers()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('export', Export()),
        ('drop_unneeded_features', FeatureSelector('cleaned_title')),
    ]
)

## Full Pipeline

In [17]:
full_pipeline = Pipeline(
    steps=[
        ('remove_punctuation', RemovePunctuation()),
        ('remove_urls', RemoveURLs()),
        ('remove_usernames', RemoveUsernames()),
        ('remove_stopwords', RemoveStopwords()),
        ('convert_emojis', ConvertEmojis()),
#         ('change_common_words', CommonWordAdjustments()),
        ('remove_numbers', RemoveNumbers()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('tokenize', TokenizeText()),
        ('model', XGBClassifier(n_estimators=500))
    ]
)

## Preprocess Text

In [18]:
preprocessing_pipeline.fit(X_train, y_train)

[["rt user new crypto giveaway**we'r give away number $ eth ethereum bitsquad communiti break number million total"], ['user url'], ['chariti crypto . user nft sale use smart contract rout fund direct user address charit institut url'], ['rt user user user user user user user user yeah stake acti'], ['rt user plan offer signific nicer stake reward $ agix cardano token e']]


Pipeline(steps=[('remove_punctuation', RemovePunctuation()),
                ('remove_urls', RemoveURLs()),
                ('remove_usernames', RemoveUsernames()),
                ('remove_stopwords', RemoveStopwords()),
                ('convert_emojis', ConvertEmojis()),
                ('remove_numbers', RemoveNumbers()), ('stem', Stem()),
                ('lemmatize', Lemmatize()), ('export', Export()),
                ('drop_unneeded_features',
                 FeatureSelector(feature_names='cleaned_title'))])

## Tokenize Text

In [19]:
processed_text = pd.read_csv('processed_text.csv')
text = processed_text['text']
vectorizer = CountVectorizer(ngram_range=(1,1))
vectorizer.fit(text)

with open(f'tokenizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Full Pipeline

In [20]:
param_grid = {
    'model__n_estimators': [25, 100, 500, 1000],
    'model__learning_rate': [.1, .3, .7]
}

In [21]:
full_pipeline.fit(X_train, list(y_train))
# grid = GridSearchCV(full_pipeline, param_grid, n_jobs=-1, scoring='roc_auc')
# grid.fit(X_train, list(y_train))



Pipeline(steps=[('remove_punctuation', RemovePunctuation()),
                ('remove_urls', RemoveURLs()),
                ('remove_usernames', RemoveUsernames()),
                ('remove_stopwords', RemoveStopwords()),
                ('convert_emojis', ConvertEmojis()),
                ('remove_numbers', RemoveNumbers()), ('stem', Stem()),
                ('lemmatize', Lemmatize()), ('tokenize', TokenizeText()),
                ('model',
                 XGBClassifier(base_score=0.5, booste...
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=500,
                               n_jobs=16, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                

# Predict - Evaluation Set

In [22]:
label_maps = {0: -1, 1: 0, 2: 1}
eval_preds = label_encoder.transform(full_pipeline.predict(X_train))

In [23]:
print (classification_report(eval_preds, y_train_fitted))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       141
           1       1.00      0.99      0.99      1627
           2       0.97      0.99      0.98       405

    accuracy                           0.99      2173
   macro avg       0.98      0.99      0.99      2173
weighted avg       0.99      0.99      0.99      2173



# Export Model

In [24]:
joblib.dump(full_pipeline, f'social_bow_model_{RUNTIME}.joblib')

['social_bow_model_20210929.joblib']

# Test Set

In [25]:
loaded_model = joblib.load(f'social_bow_model_{RUNTIME}.joblib')

In [26]:
label_maps = {2: "Positive", 1: "Neutral", 0: "Negative"}
def predict_text(text):
    prediction = label_encoder.transform(loaded_model.predict([text]))
    
    return label_maps[prediction[0]]

In [27]:
test_preds = loaded_model.predict(X_test)

In [28]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

          -1       0.46      0.27      0.34       171
           0       0.83      0.91      0.87      1591
           1       0.60      0.49      0.54       412

    accuracy                           0.78      2174
   macro avg       0.63      0.55      0.58      2174
weighted avg       0.76      0.78      0.76      2174



In [29]:
df = pd.DataFrame(data = {
    'text': X_test,
    'actual': y_test,
    'predicted': test_preds
})

In [30]:
df[df['actual'] != df['predicted']]

Unnamed: 0,text,actual,predicted
6,*LONG POSITION SUGGESTED FOR #BITCOIN*\n CURRENT PRICE: $40610.51\n EXCHANGE: BITGET(LICENCED IN THE US)\n STOPLOSS: $40001\n ACCURACY: 80%\n DATE: 2021-09-21 23:59:57.331341\n DON'T HAVE A BITGET ACCOUNT CREATE HERE:https://t.co/XdjFe16qMe https://t.co/eBX7aU0pds,1,0
81,RT @bpascowitch: Bullish signs everywhere! \n\n#bitcoin\n#ethereum\n#cardano\n#solana,0,1
456,"RT @antont71: According to @Investopedia ""Too big to fail"" describes a business or business sector deemed to be so deeply ingrained in a fi…",0,1
65,RT @CardanoHypes: I will giveaway 500 $ADA if #Cardano reaches $3 in 24 hours! 😱🚀\n\nMUST:\n&gt; RT + LIKE \n&gt; Follow me,1,0
96,@BunkFreamon can you explain how ethereum has consistently outperformed bitcoin my friend?,1,0
964,@binance @Fetch_ai Hello. I lost $15000 because of the Binance futures malfunction. This is all I have including my daughter’s tuition fees and the living expenses of the elderly in my family. Binance has verified that my situation is true but the response to me is unacceptable @BinanceHelpDesk,-1,0
80,RT @CryptoWizardd: STILL AWAKE\n\n6th time last tweet\n\nDont miss $Fet\n\nZoom out. Relax and enjoy the ride\n\n@Fetch_ai https://t.co/UCJfxF8Bmf,1,0
691,RT @michael_saylor: #Bitcoin is freedom.,1,0
523,Cardano had it right by having no concurrency. You apes need faster chains with all your bad decisions and rugs?\n\nCardano is a chain made FOR the people. Its for your safety. Its the only chain the SEC will approve to protect you degenerate apes. \n\nCardano is THE next ethereum.,1,-1
80,"RT @Crypto_Stuey: Why not make it a hat-trick, with #XRP and #ETN too!!😝\n\nThey are #DigitalPound bros!🤜🏾🤛\n\n🚀⚡️📱🌍\n\n@electroneum @digitalpoun…",0,1


# New Examples

In [36]:
predict_text("$ETH $DOGE $SHIB To The Moon")

'Positive'

In [37]:
predict_text("Somewhere In Florida #Doge #Dogecoin #Crypto")

'Neutral'