# Train a regressor on a set of embeddings of tweet texts

This notebook uses the **GetOldTweets3** library (available on PyPI) to retrieve tweets.

In [1]:
!pip install GetOldTweets3
import GetOldTweets3 as got



In [2]:
# !pip install basilica # might have to install, if not available in underlying environment

In [3]:
import json
import os

import basilica
import numpy as np
import pandas as pd
import sklearn.decomposition
import sklearn.linear_model
import sklearn.model_selection
import sklearn.preprocessing

In [4]:
# Account whose tweets are fetched, and the maximum number of tweets to request
# (both are passed to the TweetCriteria query built in a later cell).
twitter_user_name = 'LambdaSchool'
count = 1000 # during testing

In [5]:
# Build the query criteria: tweets from one account, capped at `count` tweets
querySpecs = (
    got.manager.TweetCriteria()
    .setUsername(twitter_user_name)
    .setMaxTweets(count)
)

In [6]:
# Fetch the tweets that match querySpecs, then display how many were returned
retrieved_tweets = got.manager.TweetManager.getTweets(querySpecs)
len(retrieved_tweets)

1000

In [7]:
def tweet_to_dict(twt):
    """Convert a tweet object into a plain dict keyed by attribute name.

    Every attribute listed below is read from ``twt`` and stored under the
    same name. Notes on selected fields:

    * 'favorites' -- a count of 'likes'
    * 'hashtags'  -- a string holding a space-separated series of hashtags
    * 'mentions'  -- a string holding a space-separated series of ats (@s)
    * 'urls'      -- a string holding a space-separated series of URLs
    """
    attribute_names = (
        'date',
        'favorites',
        'formatted_date',
        'geo',
        'hashtags',
        'id',
        'mentions',
        'permalink',
        'replies',
        'retweets',
        'text',
        'to',
        'urls',
        'username',
    )
    return {name: getattr(twt, name) for name in attribute_names}

In [8]:
# One dict per tweet becomes one DataFrame row per tweet
df = pd.DataFrame([tweet_to_dict(twt) for twt in retrieved_tweets])
df.head()

Unnamed: 0,date,favorites,formatted_date,geo,hashtags,id,mentions,permalink,replies,retweets,text,to,urls,username
0,2020-04-15 15:30:11+00:00,4,Wed Apr 15 15:30:11 +0000 2020,,,1250446327902863361,,https://twitter.com/LambdaSchool/status/125044...,0,2,Prepare yourself for success by attending Lamb...,,https://bit.ly/34nvh6K,LambdaSchool
1,2020-04-14 18:55:02+00:00,4,Tue Apr 14 18:55:02 +0000 2020,,#RemoteLearning #Remote,1250135493662171136,,https://twitter.com/LambdaSchool/status/125013...,0,2,Tune in tomorrow at 11 am PT to hear how Lambd...,,https://bit.ly/3a4BghW,LambdaSchool
2,2020-04-14 15:31:05+00:00,25,Tue Apr 14 15:31:05 +0000 2020,,#hired #SeeYourselfAtLambda,1250084165405597702,@Fidelity @ryanallred @rrherr @Jon_Cody_,https://twitter.com/LambdaSchool/status/125008...,0,4,Congrats to Lambda grad Oscar Calzada for gett...,,,LambdaSchool
3,2020-04-13 23:00:16+00:00,20,Mon Apr 13 23:00:16 +0000 2020,,#RemoteLearning #Remote,1249834817878986753,,https://twitter.com/LambdaSchool/status/124983...,0,3,"For large online lectures of 50+ students, how...",,https://bit.ly/3a4BghW,LambdaSchool
4,2020-04-13 20:30:25+00:00,19,Mon Apr 13 20:30:25 +0000 2020,,#LambdaSchool,1249797109382967296,@Divvy_HQ,https://twitter.com/LambdaSchool/status/124979...,2,2,"One of our 2019 grad Antonio Melendez, spent f...",,http://bit.ly/39htNvO,LambdaSchool


In [9]:
# df.head()

In [10]:
# Only the text of each tweet is needed to build the embeddings
test_texts = [twt.text for twt in retrieved_tweets]
len(test_texts)

1000

In [12]:
# Read the Basilica API key from the environment instead of hardcoding it in
# the notebook (export BASILICA_KEY before starting Jupyter). Get a real key;
# for a quick trial the demo key can be used instead:
# API_KEY = 'SLOW_DEMO_KEY'
API_KEY = os.environ['BASILICA_KEY']  # raises KeyError if the variable is not set

# Volume of use is monitored by basilica, especially when using 'SLOW_DEMO_KEY',
# so run this cell only when needed.
with basilica.Connection(API_KEY) as c:
    # One embedding per tweet text, in the same order as test_texts
    embeddings = list(c.embed_sentences(test_texts))

len(embeddings)

1000

In [13]:
# Normalize each embedding (sklearn's default is a row-wise L2 norm) to help
# the model work better.
normalized_embeddings = sklearn.preprocessing.normalize(embeddings)

# Put the normalized embeddings in a DataFrame: one row per tweet.
# NOTE: no PCA / dimensionality reduction is applied; the full-width
# embeddings are used as features.
normalized_embeddings_df = pd.DataFrame(normalized_embeddings)
normalized_embeddings_df.shape

(1000, 768)

## Split into X matrix (embeddings) and y vectors (retweet count and like count)

In [23]:
# Features: the normalized embeddings, one row per tweet
X = normalized_embeddings_df

# Targets: retweet counts and like ('favorites') counts from the tweet DataFrame
y_retweets = df['retweets']
y_likes = df['favorites']

# For each target, print its shape and how many values are missing vs. present
for target in (y_retweets, y_likes):
    print(target.shape)
    print(target.isna().value_counts())

(1000,)
False    1000
Name: retweets, dtype: int64
(1000,)
False    1000
Name: favorites, dtype: int64


In [15]:
# Confirm the feature matrix shape before training: (rows, embedding columns)
print(X.shape)

(1000, 768)


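## Training a model

Note: `sklearn.linear_model.LogisticRegression` is a classifier, not a regressor. Each distinct retweet/like count is treated as its own class, and `score` reports classification accuracy.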

In [None]:
# import numpy as np
# import sklearn.linear_model
# import sklearn.preprocessing
# import sklearn.model_selection

In [27]:
# Split the features and BOTH targets in a single call so that the train/test
# rows of X, y_retweets and y_likes are aligned by construction, instead of
# relying on three separate calls happening to share the same random_state.
(X_train, X_test,
 y_retweets_train, y_retweets_test,
 y_likes_train, y_likes_test) = sklearn.model_selection.train_test_split(
    X, y_retweets, y_likes, random_state=72)

# NOTE(review): LogisticRegression is a classifier, so every distinct retweet
# count is treated as a separate class. Confirm this is intended; a regressor
# may be a better fit for a count target.
retweets_model = sklearn.linear_model.LogisticRegression(max_iter=1000)
retweets_model.fit(X_train, y_retweets_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:

# Fit a LogisticRegression on the like counts. fit() returns the estimator
# itself, so construction and fitting can be chained; the bare name on the
# last line displays the fitted model.
likes_model = sklearn.linear_model.LogisticRegression(max_iter=1000).fit(X_train, y_likes_train)
likes_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Results

In [30]:
# Each entry: (report template, fitted model, features, targets).
# score() is the model's mean accuracy on the given features/targets.
score_reports = [
    ('Retweets Train accuracy: %.3f', retweets_model, X_train, y_retweets_train),
    ('Retweets Test accuracy: %.3f', retweets_model, X_test, y_retweets_test),
    ('Likes Train accuracy: %.3f', likes_model, X_train, y_likes_train),
    ('Likes Test accuracy: %.3f', likes_model, X_test, y_likes_test),
]

for report_template, fitted_model, features, targets in score_reports:
    print(report_template % fitted_model.score(features, targets))

Retweets Train accuracy: 0.336
Retweets Test accuracy: 0.272
Likes Train accuracy: 0.128
Likes Test accuracy: 0.088


In [None]:
# y_retweets[0:10]

In [54]:
def predict_retweets_one_embedding(embdng):
    """Predict the retweet count for a single embedding.

    Args:
        embdng: One embedding vector (a 1-D sequence of numbers). It may be raw
            or already normalized.

    Returns:
        The tuple ``('predicted retweets', prediction)``.
    """
    # retweets_model is fitted on normalized embeddings, so apply the same
    # normalization here. normalize() needs 2-D input, hence the one-row list.
    # Already-normalized vectors pass through essentially unchanged.
    features = sklearn.preprocessing.normalize([embdng])
    return ('predicted retweets', retweets_model.predict(features)[0])

def predict_retweets_one_by_index(embeddings, idx):
    """Predict retweets for one embedding chosen by position, alongside the actual count.

    Args:
        embeddings: Indexable sequence of embeddings; ``embeddings[idx]`` is used.
        idx: Position of the tweet, applied to both ``embeddings`` and ``y_retweets``.

    Returns:
        ``(prediction, 'actual', actual_count)``, where ``prediction`` is the
        tuple returned by ``predict_retweets_one_embedding``.
    """
    # .iloc makes the lookup positional, matching embeddings[idx]. Plain
    # y_retweets[idx] is label-based and only agrees with position while the
    # Series keeps its default 0..n-1 index.
    return (predict_retweets_one_embedding(embeddings[idx]), 'actual', y_retweets.iloc[idx])


def predict_likes_one_embedding(embdng):
    """Predict the like count for a single embedding.

    Args:
        embdng: One embedding vector (a 1-D sequence of numbers). It may be raw
            or already normalized.

    Returns:
        The tuple ``('predicted likes', prediction)``.
    """
    # likes_model is fitted on normalized embeddings, so apply the same
    # normalization here. normalize() needs 2-D input, hence the one-row list.
    # Already-normalized vectors pass through essentially unchanged.
    features = sklearn.preprocessing.normalize([embdng])
    return ('predicted likes', likes_model.predict(features)[0])

def predict_likes_one_by_index(embeddings, idx):
    """Predict likes for one embedding chosen by position, alongside the actual count.

    Args:
        embeddings: Indexable sequence of embeddings; ``embeddings[idx]`` is used.
        idx: Position of the tweet, applied to both ``embeddings`` and ``y_likes``.

    Returns:
        ``(prediction, 'actual', actual_count)``, where ``prediction`` is the
        tuple returned by ``predict_likes_one_embedding``.
    """
    # .iloc makes the lookup positional, matching embeddings[idx]. Plain
    # y_likes[idx] is label-based and only agrees with position while the
    # Series keeps its default 0..n-1 index.
    return (predict_likes_one_embedding(embeddings[idx]), 'actual', y_likes.iloc[idx])

# Spot-check both models on the tweet at position 2: predicted vs. actual
for predict_by_index in (predict_retweets_one_by_index, predict_likes_one_by_index):
    print(predict_by_index(embeddings, 2))

(('predicted retweets', 2), 'actual', 4)
(('predicted likes', 25), 'actual', 25)
