# Libraries

In [1]:
from social_text_processing import *
from datetime import datetime
import pickle
import numpy as np
import emoji
from oauth2client.service_account import ServiceAccountCredentials
import gspread
import pandas as pd
import re
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from nltk.stem.snowball import SnowballStemmer
import warnings
warnings.filterwarnings('ignore')
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import os
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
RUNTIME = str(datetime.today())
RUNTIME = RUNTIME.replace('-', '').replace(' ', '').replace(':', '')[0:-13]

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Read In Data

## Functions to Read Data

In [4]:
SCOPES = ['https://spreadsheets.google.com/feeds',
          'https://www.googleapis.com/auth/drive']
LABELED_POSTS_SHEET = '1Dd5Ug-PeAOSrPm4zUUhBTvZkjgm14HWZwY_h8oj79Xg'
CREDENTIALS = 'google-credentials.json'
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, SCOPES)
client = gspread.authorize(creds)

In [5]:
def read_sheets(sheet_id):
    df = pd.DataFrame()
    spreadsheet = client.open_by_key(sheet_id)
    
    for i in range(len(spreadsheet.worksheets())):
        this_df = pd.DataFrame(spreadsheet.get_worksheet(i).get_all_records())
        df = df.append(this_df)
    
    df = df[['text', 'label']]
    df.dropna(inplace=True)
    df = df[df['label'] != '']
    df = df[df['text'] != '']
    
    return df
        

In [6]:
df = read_sheets(LABELED_POSTS_SHEET)
# df_train.dropna(inplace=True)

In [7]:
df['label'].value_counts()

 0    3209
 1     824
-1     315
Name: label, dtype: int64

# Resample

In [8]:
positives = df[df['label'] == 1]
negatives = df[df['label'] == -1]
neutral = df[df['label'] == 0]
sampled_neutral = neutral.sample(frac=1)[0:1000]

In [9]:
combined = positives.append(negatives).append(sampled_neutral).sample(frac=1)

In [10]:
combined.shape[0]

2139

## Create Train / Test Sets

In [11]:
VOCAB_SIZE = 3000
TRAIN_PORTION = .5

In [12]:
X = df['text']
y = df['label'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, 
                                                    shuffle=True, train_size=TRAIN_PORTION)

In [13]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_fitted = label_encoder.transform(y_train)
y_test_fitted = label_encoder.transform(y_test)

# Emojis

In [14]:
nlp = spacy.load('en')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)

# Pipeline

## Preprocessing Pipeline for Tokenizer

In [15]:
preprocessing_pipeline = Pipeline(
    steps=[
        ('remove_punctuation', RemovePunctuation()),
        ('remove_urls', RemoveURLs()),
        ('remove_usernames', RemoveUsernames()),
        ('remove_stopwords', RemoveStopwords()),
        ('convert_emojis', ConvertEmojis()),
#         ('change_common_words', CommonWordAdjustments()),
        ('remove_numbers', RemoveNumbers()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('export', Export()),
        ('drop_unneeded_features', FeatureSelector('cleaned_title')),
    ]
)

## Full Pipeline

In [16]:
full_pipeline = Pipeline(
    steps=[
        ('remove_punctuation', RemovePunctuation()),
        ('remove_urls', RemoveURLs()),
        ('remove_usernames', RemoveUsernames()),
        ('remove_stopwords', RemoveStopwords()),
        ('convert_emojis', ConvertEmojis()),
#         ('change_common_words', CommonWordAdjustments()),
        ('remove_numbers', RemoveNumbers()),
        ('stem', Stem()),
        ('lemmatize', Lemmatize()),
        ('tokenize', TokenizeText()),
        ('model', XGBClassifier(n_estimators=500))
    ]
)

## Preprocess Text

In [17]:
preprocessing_pipeline.fit(X_train, y_train)

[["rt user big mileston day solana devnet parti popper parti popper parti popper number peopl experienc soda lend ' ve receiv"], ['fetch team continu amaz user want cross chain swap good protocol actual secur user dust url'], ['polic car light polic car light polic car light polic car light polic car light polic car light polic car light polic car light polic car light polic car light number eth number , number usd transfer unknown wallet unknown wallet url'], ['solana url'], ['rt user look passiv incom end search user blue circl tg http']]


Pipeline(steps=[('remove_punctuation', RemovePunctuation()),
                ('remove_urls', RemoveURLs()),
                ('remove_usernames', RemoveUsernames()),
                ('remove_stopwords', RemoveStopwords()),
                ('convert_emojis', ConvertEmojis()),
                ('remove_numbers', RemoveNumbers()), ('stem', Stem()),
                ('lemmatize', Lemmatize()), ('export', Export()),
                ('drop_unneeded_features',
                 FeatureSelector(feature_names='cleaned_title'))])

## Tokenize Text

In [18]:
processed_text = pd.read_csv('processed_text.csv')
text = processed_text['text']
vectorizer = CountVectorizer(ngram_range=(1,1))
vectorizer.fit(text)

with open(f'tokenizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Full Pipeline

In [19]:
param_grid = {
    'model__n_estimators': [25, 100, 500, 1000],
    'model__learning_rate': [.1, .3, .7]
}

In [20]:
full_pipeline.fit(X_train, list(y_train))
# grid = GridSearchCV(full_pipeline, param_grid, n_jobs=-1, scoring='roc_auc')
# grid.fit(X_train, list(y_train))



Pipeline(steps=[('remove_punctuation', RemovePunctuation()),
                ('remove_urls', RemoveURLs()),
                ('remove_usernames', RemoveUsernames()),
                ('remove_stopwords', RemoveStopwords()),
                ('convert_emojis', ConvertEmojis()),
                ('remove_numbers', RemoveNumbers()), ('stem', Stem()),
                ('lemmatize', Lemmatize()), ('tokenize', TokenizeText()),
                ('model',
                 XGBClassifier(base_score=0.5, booste...
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=500,
                               n_jobs=16, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                

# Predict - Evaluation Set

In [21]:
label_maps = {0: -1, 1: 0, 2: 1}
eval_preds = label_encoder.transform(full_pipeline.predict(X_train))

In [22]:
print (classification_report(eval_preds, y_train_fitted))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       153
           1       1.00      0.99      0.99      1586
           2       0.97      0.98      0.98       435

    accuracy                           0.99      2174
   macro avg       0.98      0.99      0.98      2174
weighted avg       0.99      0.99      0.99      2174



# Export Model

In [23]:
joblib.dump(full_pipeline, f'social_bow_model_{RUNTIME}.joblib')

['social_bow_model_20210929.joblib']

# Test Set

In [24]:
loaded_model = joblib.load(f'social_bow_model_{RUNTIME}.joblib')

In [25]:
label_maps = {2: "Positive", 1: "Neutral", 0: "Negative"}
def predict_text(text):
    prediction = label_encoder.transform(loaded_model.predict([text]))
    
    return label_maps[prediction[0]]

In [26]:
test_preds = loaded_model.predict(X_test)

In [27]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

          -1       0.48      0.29      0.36       155
           0       0.85      0.89      0.87      1634
           1       0.56      0.52      0.54       385

    accuracy                           0.78      2174
   macro avg       0.63      0.57      0.59      2174
weighted avg       0.77      0.78      0.78      2174



In [28]:
df = pd.DataFrame(data = {
    'text': X_test,
    'actual': y_test,
    'predicted': test_preds
})

In [29]:
df[df['actual'] != df['predicted']]

Unnamed: 0,text,actual,predicted
722,RT @CryptoAlexand: Now you must wait 😅\n But if you staking your $FET you can get $MOBIX #stakedrop. Distribution in - October - December\n \n\n @F…,0,1
81,RT @bpascowitch: Bullish signs everywhere! \n\n#bitcoin\n#ethereum\n#cardano\n#solana,0,1
65,RT @CardanoHypes: I will giveaway 500 $ADA if #Cardano reaches $3 in 24 hours! 😱🚀\n\nMUST:\n&gt; RT + LIKE \n&gt; Follow me,1,0
355,RT @MOONSHOTJOSH: @papousse47 @Fetch_ai Mark your calendar for $FET deep parking demos. #Jaguar #Bmw #Tesla in early September. https://t.c…,1,0
261,RT @daimetti: @Fetch_ai #Ai #CollectiveLearning #nft Made my votes for the AI driven NFT collection. Some awesome stuff on here. @HMsheikh…,0,1
44,"“Without a strong and scalable social layer, crypto-networks/ecosystems tend to fall apart and/or get bogged down by constant infighting over trivial topics. Within Ethereum, there is currently a bit of elitism around layer 2’s (rollups) vs sidechains which has led to some drama”",-1,0
400,"@AkashTrade @Mettalex I have been averaging in on $MTLX for a few months now. Only token that has gotten My Fiat in over a year. Love the project, and @Fetch_ai team. Learned of them when I got about 20 free $MTLX through the $FET Phoenix staking program.",0,1
167,"@cardano_bull @garyvee You think he got handed that 100 mills to him haha, he worked for them and believed in himself, do the same ;)",0,1
80,RT @CryptoWizardd: STILL AWAKE\n\n6th time last tweet\n\nDont miss $Fet\n\nZoom out. Relax and enjoy the ride\n\n@Fetch_ai https://t.co/UCJfxF8Bmf,1,0
220,@Solana_Alpha If you managed to get even 1 you are quite special lol,0,-1


# New Examples

In [30]:
predict_text("$ETH $DOGE $SHIB To The Moon")

'Neutral'

In [31]:
predict_text("Somewhere In Florida #Doge #Dogecoin #Crypto")

'Neutral'