# Train a regressor on a set of embeddings of tweet texts

Uses the **GetOldTweets3** library (available via PyPI)

In [1]:
!pip install GetOldTweets3
import GetOldTweets3 as got



In [2]:
# !pip install basilica # might have to install, if not available in underlying environment

In [3]:
import json
import os

import basilica
import numpy as np
import pandas as pd
import sklearn.decomposition
import sklearn.linear_model
import sklearn.model_selection
import sklearn.preprocessing

In [4]:
# Twitter account whose tweets are queried below
twitter_user_name = 'LambdaSchool'
# Upper bound on the number of tweets requested (passed to setMaxTweets)
count = 100  # kept small during testing

In [5]:
# Build the query criteria: tweets from one user, capped at `count` tweets
querySpecs = (
    got.manager.TweetCriteria()
    .setUsername(twitter_user_name)
    .setMaxTweets(count)
)

In [6]:
# Run the query and keep the returned tweet objects
retrieved_tweets = got.manager.TweetManager.getTweets(querySpecs)
# Echoed as the cell output: how many tweets came back
len(retrieved_tweets)

100

In [7]:
def tweet_to_dict(twt):
    """Copy a fixed set of attributes of a tweet object into a plain dict.

    Each key is the attribute name and each value is whatever the tweet
    object holds under that name. Notes on some of the fields:

    'favorites' is a count of 'likes'
    'hashtags' is a string that is a space-separated series of hashtags
    'mentions' is a string that is a space-separated series of ats (@s)
    'urls' is a string that is a space-separated series of URLs
    """
    # Order matters: it becomes the column order of any DataFrame built
    # from these dicts.
    field_names = (
        'date',
        'favorites',
        'formatted_date',
        'geo',
        'hashtags',
        'id',
        'mentions',
        'permalink',
        'replies',
        'retweets',
        'text',
        'to',
        'urls',
        'username',
    )
    return {name: getattr(twt, name) for name in field_names}

In [8]:
def munge_date(dt):
    """Break a datetime.datetime object into a dict of its calendar/clock parts.

    Keys: 'year', 'month', 'day', 'day_of_week', 'hour', 'minute',
    'minute_of_day'.
    'day_of_week' is the value of dt.weekday(): 0 (Monday) through 6 (Sunday)
    'minute_of_day' is count of minutes from midnight
    """
    hour, minute = dt.hour, dt.minute
    return dict(
        year=dt.year,
        month=dt.month,
        day=dt.day,
        day_of_week=dt.weekday(),
        hour=hour,
        minute=minute,
        minute_of_day=hour * 60 + minute,
    )

In [9]:
# One row per retrieved tweet, one column per tweet attribute
df = pd.DataFrame([tweet_to_dict(tweet) for tweet in retrieved_tweets])
df.head()

Unnamed: 0,date,favorites,formatted_date,geo,hashtags,id,mentions,permalink,replies,retweets,text,to,urls,username
0,2020-04-16 17:31:03+00:00,7,Thu Apr 16 17:31:03 +0000 2020,,#WebDev #DS #iOS,1250839133305606144,,https://twitter.com/LambdaSchool/status/125083...,0,7,Stuck at home? Enroll in Lambda School for 50%...,,https://bit.ly/2x7hxRt,LambdaSchool
1,2020-04-15 23:03:01+00:00,9,Wed Apr 15 23:03:01 +0000 2020,,#engineer #WomenWhoCode,1250560286635016198,,https://twitter.com/LambdaSchool/status/125056...,0,5,"Lambda grad, Kaitlyn Flynn, talks about her ex...",,https://bit.ly/33LqU55,LambdaSchool
2,2020-04-15 15:30:11+00:00,6,Wed Apr 15 15:30:11 +0000 2020,,,1250446327902863361,,https://twitter.com/LambdaSchool/status/125044...,0,2,Prepare yourself for success by attending Lamb...,,https://bit.ly/34nvh6K,LambdaSchool
3,2020-04-14 18:55:02+00:00,4,Tue Apr 14 18:55:02 +0000 2020,,#RemoteLearning #Remote,1250135493662171136,,https://twitter.com/LambdaSchool/status/125013...,0,2,Tune in tomorrow at 11 am PT to hear how Lambd...,,https://bit.ly/3a4BghW,LambdaSchool
4,2020-04-14 15:31:05+00:00,25,Tue Apr 14 15:31:05 +0000 2020,,#hired #SeeYourselfAtLambda,1250084165405597702,@Fidelity @ryanallred @rrherr @Jon_Cody_,https://twitter.com/LambdaSchool/status/125008...,0,4,Congrats to Lambda grad Oscar Calzada for gett...,,,LambdaSchool


In [10]:
def join_dicts(twt):
    """Return one flat dict for a tweet: the fields from tweet_to_dict
    followed by the date parts that munge_date derives from twt.date."""
    combined = tweet_to_dict(twt)
    combined.update(munge_date(twt.date))
    return combined

# Flatten every tweet into a single record (tweet fields + date parts)
recs = [join_dicts(tweet) for tweet in retrieved_tweets]

# Every column except the two time-of-posting features kept in times_df
useless_columns = [
    'id', 'favorites', 'hashtags', 'mentions', 'replies', 'retweets', 'text',
    'to', 'urls', 'year', 'month', 'day', 'date', 'formatted_date',
    'permalink', 'username', 'hour', 'minute', 'geo',
]

times_df = pd.DataFrame.from_records(recs, exclude=useless_columns)

times_df.head()

Unnamed: 0,day_of_week,minute_of_day
0,3,1051
1,2,1383
2,2,930
3,1,1135
4,1,931


In [11]:
# Only the tweet texts are needed for making embeddings
test_texts = [tweet.text for tweet in retrieved_tweets]
# Echoed as the cell output: number of texts collected
len(test_texts)

100

In [13]:
# Read the Basilica API key from the environment so the secret never lives in
# the notebook. When BASILICA_API_KEY is not set, fall back to the demo key
# 'SLOW_DEMO_KEY' so the cell still runs on a fresh kernel.
API_KEY = os.environ.get('BASILICA_API_KEY', 'SLOW_DEMO_KEY')

# Volume of use is monitored by basilica, especially when using 'SLOW_DEMO_KEY',
# so run this cell only when fresh embeddings are needed.
with basilica.Connection(API_KEY) as c:
    embeddings = list(c.embed_sentences(test_texts))

len(embeddings)

100

In [14]:
# Normalize to help regressor work better.
# Called with the defaults of sklearn.preprocessing.normalize; per the
# scikit-learn docs that is L2 normalization of each sample (row) — confirm
# against the installed version.
normalized_embeddings = sklearn.preprocessing.normalize(embeddings)

# Echoed as the cell output: number of normalized embeddings
len(normalized_embeddings)
# normalized_embeddings[0]

100

In [15]:
# One column label per embedding dimension: embed_col0, embed_col1, ...
colnames = ['embed_col{}'.format(i) for i in range(len(embeddings[0]))]

# Put the normalized embeddings back in a dataframe, one row per tweet text
normalized_embeddings_df = pd.DataFrame(normalized_embeddings, columns=colnames)

normalized_embeddings_df.head()

Unnamed: 0,embed_col0,embed_col1,embed_col2,embed_col3,embed_col4,embed_col5,embed_col6,embed_col7,embed_col8,embed_col9,...,embed_col758,embed_col759,embed_col760,embed_col761,embed_col762,embed_col763,embed_col764,embed_col765,embed_col766,embed_col767
0,0.023138,0.002753,0.044362,0.025983,0.028493,-0.021028,0.019673,0.023146,-0.006646,-0.040005,...,0.009362,-0.018145,-0.010557,-0.005621,0.012717,0.005946,-0.030827,-0.005184,0.005877,0.022622
1,0.000753,-0.013604,0.015583,0.019762,-0.002638,-0.003017,0.025656,0.019403,0.005415,-0.039027,...,0.016641,0.000325,-0.012965,-0.008949,0.019904,0.023236,-0.041716,-0.005318,0.007101,0.011204
2,0.018856,-0.020029,-0.004694,0.02204,-0.007703,-0.051445,0.014428,0.04415,-0.004812,-0.03656,...,0.022872,-0.016875,-0.004963,-0.019274,-0.010078,0.005856,-0.023959,-0.014859,0.021229,0.024489
3,0.02915,-0.009495,0.009707,0.028981,0.023361,-0.047286,0.015894,0.036981,0.005223,-0.027763,...,0.024674,-0.020838,-0.014686,-0.013213,0.007014,0.01732,-0.028305,0.006779,-0.001959,0.033246
4,0.008605,0.015254,0.030703,0.027617,-0.024997,-0.023521,0.019563,0.017938,-0.00125,-0.018717,...,0.006569,-0.017719,-0.00162,-0.002671,0.012569,0.011477,-0.032538,-0.002108,0.023439,0.017342


In [16]:
# Combine the time features and the embedding columns row by row, matching
# rows on the index of both frames.
merged_df = pd.merge(times_df, normalized_embeddings_df, right_index=True, left_index=True)
merged_df

Unnamed: 0,day_of_week,minute_of_day,embed_col0,embed_col1,embed_col2,embed_col3,embed_col4,embed_col5,embed_col6,embed_col7,...,embed_col758,embed_col759,embed_col760,embed_col761,embed_col762,embed_col763,embed_col764,embed_col765,embed_col766,embed_col767
0,3,1051,0.023138,0.002753,0.044362,0.025983,0.028493,-0.021028,0.019673,0.023146,...,0.009362,-0.018145,-0.010557,-0.005621,0.012717,0.005946,-0.030827,-0.005184,0.005877,0.022622
1,2,1383,0.000753,-0.013604,0.015583,0.019762,-0.002638,-0.003017,0.025656,0.019403,...,0.016641,0.000325,-0.012965,-0.008949,0.019904,0.023236,-0.041716,-0.005318,0.007101,0.011204
2,2,930,0.018856,-0.020029,-0.004694,0.022040,-0.007703,-0.051445,0.014428,0.044150,...,0.022872,-0.016875,-0.004963,-0.019274,-0.010078,0.005856,-0.023959,-0.014859,0.021229,0.024489
3,1,1135,0.029150,-0.009495,0.009707,0.028981,0.023361,-0.047286,0.015894,0.036981,...,0.024674,-0.020838,-0.014686,-0.013213,0.007014,0.017320,-0.028305,0.006779,-0.001959,0.033246
4,1,931,0.008605,0.015254,0.030703,0.027617,-0.024997,-0.023521,0.019563,0.017938,...,0.006569,-0.017719,-0.001620,-0.002671,0.012569,0.011477,-0.032538,-0.002108,0.023439,0.017342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5,1019,0.003294,-0.006562,0.017269,-0.016713,0.012668,-0.029748,-0.007023,0.034168,...,0.016929,0.007263,0.000446,0.021965,0.006669,-0.027422,-0.015627,-0.017749,0.030071,0.036848
96,5,990,0.020828,0.012115,0.015596,0.009341,-0.033192,-0.040595,0.033833,0.059026,...,-0.019953,0.008822,-0.000378,-0.018992,0.022910,-0.001108,-0.038500,-0.002340,0.034554,0.039675
97,5,26,0.017811,0.017204,0.040197,0.005877,-0.019312,-0.021149,0.019543,0.025443,...,-0.001531,-0.012434,0.008800,0.006092,0.020851,-0.021068,-0.018303,0.003553,-0.003240,-0.005778
98,4,1099,-0.008194,0.008848,-0.002764,0.010198,0.010886,-0.017228,0.011725,0.008963,...,-0.001353,0.002677,-0.007044,-0.030552,0.008463,0.031810,-0.029013,-0.021065,0.009688,0.016168


## Split into X matrix (time features + embeddings) and y vectors (retweet and like counts)

In [17]:
# Feature matrix: time-of-posting features plus the embedding columns
X = merged_df

# Two targets taken from the raw tweet frame
y_retweets = df['retweets']
y_likes = df['favorites']

# For each target, report its length and whether it contains missing values
for target in (y_retweets, y_likes):
    print(target.shape)
    print(target.isna().value_counts())

(100,)
False    100
Name: retweets, dtype: int64
(100,)
False    100
Name: favorites, dtype: int64


In [18]:
# Sanity check: one row per tweet; columns are the time features plus the embedding columns
print(X.shape)

(100, 770)


## Training a regressor

In [None]:
# import numpy as np
# import sklearn.linear_model
# import sklearn.preprocessing
# import sklearn.model_selection

In [21]:
# Split the features and both targets in ONE call so the train/test rows are
# guaranteed to line up across X, y_retweets and y_likes (previously this relied
# on three separate calls sharing the same random_state).
(X_train, X_test,
 y_retweets_train, y_retweets_test,
 y_likes_train, y_likes_test) = sklearn.model_selection.train_test_split(
    X, y_retweets, y_likes, random_state=72)

# NOTE(review): LogisticRegression is a classifier, so every distinct retweet
# count is treated as its own class and .score() reports accuracy. The notebook
# describes this as a "regressor" — confirm whether a regression model was
# intended.
retweets_model = sklearn.linear_model.LogisticRegression(max_iter=10000)
retweets_model.fit(X_train, y_retweets_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:

# Same model type as for retweets, fitted on the like counts.
# NOTE(review): LogisticRegression is a classifier, so each distinct like count
# is treated as a separate class — confirm whether a regression model was
# intended.
likes_model = sklearn.linear_model.LogisticRegression(max_iter=10000)
likes_model.fit(X_train, y_likes_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Results

In [23]:
# Score each fitted model on its training rows and on the held-out rows
print('Retweets Train accuracy: {:.3f}'.format(retweets_model.score(X_train, y_retweets_train)))
print('Retweets Test accuracy: {:.3f}'.format(retweets_model.score(X_test, y_retweets_test)))

print('Likes Train accuracy: {:.3f}'.format(likes_model.score(X_train, y_likes_train)))
print('Likes Test accuracy: {:.3f}'.format(likes_model.score(X_test, y_likes_test)))

Retweets Train accuracy: 0.387
Retweets Test accuracy: 0.200
Likes Train accuracy: 0.227
Likes Test accuracy: 0.080


In [None]:
# y_retweets[0:10]

In [76]:
def predict_retweets_one_embedding(embdng):
    """Return the retweets model's prediction for a single feature row.

    `embdng` is passed straight to retweets_model.predict, so it must already
    be 2-D (the caller in this notebook reshapes a row with reshape(1, -1)).
    Only the first prediction is returned.
    """
    predictions = retweets_model.predict(embdng)
    return predictions[0]

def predict_retweets_one_by_index(embeddings_array, idx):
    """Predict retweets for one row of `embeddings_array` and pair it with the
    actual value.

    embeddings_array: array of feature rows; row `idx` is reshaped to (1, -1)
        before being handed to the model.
    idx: positional index of the row to predict.

    Returns the tuple ('predicted retweets', prediction, 'actual', actual).
    """
    feature_row = embeddings_array[idx].reshape(1, -1)
    # .iloc keeps the lookup positional, matching how the array row above is
    # selected; plain y_retweets[idx] is a label lookup on the Series and only
    # agrees with the position while the index is the default 0..n-1.
    actual = y_retweets.iloc[idx]
    return ('predicted retweets',
            predict_retweets_one_embedding(feature_row),
            'actual', actual)


def predict_likes_one_embedding(embdng):
    """Return the likes model's prediction for a single feature row.

    `embdng` is passed straight to likes_model.predict, so it must already be
    2-D (the caller in this notebook reshapes a row with reshape(1, -1)).
    Only the first prediction is returned.
    """
    predictions = likes_model.predict(embdng)
    return predictions[0]

def predict_likes_one_by_index(embeddings_array, idx):
    """Predict likes for one row of `embeddings_array` and pair it with the
    actual value.

    embeddings_array: array of feature rows; row `idx` is reshaped to (1, -1)
        before being handed to the model.
    idx: positional index of the row to predict.

    Returns the 3-tuple ('predicted likes', prediction, actual).
    """
    feature_row = embeddings_array[idx].reshape(1, -1)
    # .iloc keeps the lookup positional, matching how the array row above is
    # selected; plain y_likes[idx] is a label lookup on the Series and only
    # agrees with the position while the index is the default 0..n-1.
    actual = y_likes.iloc[idx]
    # NOTE(review): the tuple has no 'actual' label before the last element;
    # the shape is kept as-is so existing callers are not affected.
    return ('predicted likes',
            predict_likes_one_embedding(feature_row),
            actual)
             

# Feature matrix as a plain array so a row can be picked by position
foo = merged_df.values
# Only a cell's last expression is echoed, so the retweets result has to be
# displayed explicitly or it is computed and silently dropped.
display(predict_retweets_one_by_index(foo, 5))
predict_likes_one_by_index(foo, 5)

('predicted likes', 6, 20)