# Train a regressor on a set of embeddings of tweet texts

Use **GetOldTweets3** library (available via Pypi)

## Install required libraries

In [1]:
!pip install GetOldTweets3
import GetOldTweets3 as got



In [2]:
# !pip install basilica # might have to install, if not available in underlying environment

In [3]:
import pandas as pd
# import json
import numpy as np
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.decomposition
import basilica
import pickle
from pathlib import Path


## Configure

In [4]:
twitter_user_name = 'LambdaSchool'
count = 100 # during testing
API_KEY = '7f8e7b80-be40-8936-1a61-cef3d1926786'

## Data Engineering

In [5]:
def tweet_to_dict(twt):
    """Munges a twt object into a dict, using names of attributes of
    object as keys in dict.
    'favorites' is a count of 'likes'
    'hashtags' is a string that is a space-separated series of hashtags
    'mentions' is a string that is a space-separated series of ats (@s)
    'urls' is a string that is a space-separated series of URLs
    """
    return {'date' : twt.date
            , 'favorites' : twt.favorites
            , 'formatted_date' : twt.formatted_date
            , 'geo' : twt.geo
            , 'hashtags' : twt.hashtags
            , 'id' : twt.id
            , 'mentions' : twt.mentions
            , 'permalink' : twt.permalink
            , 'replies' : twt.replies
            , 'retweets' : twt.retweets
            , 'text' : twt.text
            , 'to' : twt.to
            , 'urls' : twt.urls
            , 'username' : twt.username}   

def munge_date(dt):
    """Munges a datetime.datetime object into a dict, using names of attributes of
    object as keys in dict.
    'day_of_week' is [0-7] with 0 being 'Monday'
    'minute_of_day' is count of minutes from midnight"""
    return {'year' : dt.year 
            , 'month' : dt.month
            , 'day' : dt.day
            , 'day_of_week' : dt.weekday()
            , 'hour' : dt.hour
            , 'minute' : dt.minute
            , 'minute_of_day' : (60 * dt.hour) + dt.minute}


def join_dicts(got_tweet_object):
    """Returns a dict that is the result of joining 
    - a dict that is the result of parsing a GOT object
      to  dict, and
    - a dict that is the result of munging the a datetime.date
      into a dict."""
    return {**tweet_to_dict(got_tweet_object), **munge_date(got_tweet_object.date)}

def munge_tweet_objects(tweet_objects):
    return list(map(join_dicts, tweet_objects))


In [8]:
# get a set of tweets
pickled_fn = './r_tweets.pickle'
pickled_path = Path(pickled_fn)

# if a pickled file already exists, unpickle it
if pickled_path.is_file():
    merged_df = pd.read_pickle(pickled_fn)

# if a pickled file does not exist yet, get data then pickle it
else:  
    #  Create object to execute queries
    querySpecs = got.manager.TweetCriteria().setUsername(twitter_user_name).setMaxTweets(count)
   
    print('Retrieving tweets via GOT3')
    # retrieve tweets
    retrieved_tweets = got.manager.TweetManager.getTweets(querySpecs)
    
    tweet_dicts = munge_tweet_objects(retrieved_tweets)
    
    y_retweets = pd.DataFrame.from_records(tweet_dicts,  columns=['retweets'])
    y_retweets = y_retweets.fillna(0)

    y_likes = pd.DataFrame.from_records(tweet_dicts,  columns=['likes'])
    y_likes = y_likes.fillna(0)
    
    columns_not_needed = ['id', 'hashtags', 'replies', 'retweets', 'text',
       'to', 'urls', 'year', 'month', 'day', 'date', 'formatted_date', 'permalink', 'username', 'hour', 'minute', 'geo']
    times_df = pd.DataFrame.from_records(tweet_dicts,  exclude=columns_not_needed)

    # create a df of embeddings of the texts
    tweet_texts = [tweet.text for tweet in retrieved_tweets]
    print('retrieving embeddings via basilica')
    with basilica.Connection(API_KEY) as c:
        embeddings = list(c.embed_sentences(tweet_texts))
    print("Retrieved " + str(len(embeddings)) + " embeddings.")
    normalized_embeddings = sklearn.preprocessing.normalize(embeddings)
    colnames = ['embed_col' + str(i) for i in range(len(embeddings[0]))]
    normalized_embeddings_df = pd.DataFrame(normalized_embeddings, columns=colnames)  
    
    merged_df     =     pd.merge(times_df, normalized_embeddings_df, right_index=True, left_index=True)

    # pickle the df
    merged_df.to_pickle(pickled_fn)

Retrieving tweets via GOT3
retrieving embeddings via basilica
Retrieved 100 embeddings.


In [41]:
type(y_retweets.values[0][0])


numpy.int64

In [40]:
type(y_likes.values[0])

numpy.int64

In [21]:
# colnames = ['embed_col' + str(i) for i in range(len(embeddings[0]))]

# # put the normalized embeddings back in a dataframe
# normalized_embeddings_df = pd.DataFrame(normalized_embeddings, columns=colnames)
# # PCA the embeddings
# # sklearn.get_config()
# normalized_embeddings_df.shape

# normalized_embeddings_df.head()

## Split into X matrix (embeddings) and y vector (retweet or like count)

In [22]:
y_likes = merged_df['favorites']
print(y_likes.shape)
print(y_likes.isna().value_counts())

(100,)
False    100
Name: favorites, dtype: int64


In [23]:
print(merged_df.shape)

(100, 772)


## Training Regressor

In [24]:
# import numpy as np
# import sklearn.linear_model
# import sklearn.preprocessing
# import sklearn.model_selection

In [28]:
X = times_df

X_train, X_test = sklearn.model_selection.train_test_split(X, random_state=72)
y_retweets_train, y_retweets_test = sklearn.model_selection.train_test_split(y_retweets, random_state=72)

# y_likes_train, y_likes_test = sklearn.model_selection.train_test_split(y_likes, random_state=72)

# retweets_model = sklearn.linear_model.LogisticRegression(max_iter=10000)
# retweets_model.fit(X_train, y_retweets_train)

In [31]:
likes_model = sklearn.linear_model.LogisticRegression(max_iter=100)
likes_model.fit(X_train, y_likes_train)

ValueError: could not convert string to float: 

## Results

In [None]:
print('Retweets Train accuracy: %.3f' % retweets_model.score(X_train, y_retweets_train))
print('Retweets Test accuracy: %.3f' % retweets_model.score(X_test, y_retweets_test))

print('Likes Train accuracy: %.3f' % likes_model.score(X_train, y_likes_train))
print('Likes Test accuracy: %.3f' % likes_model.score(X_test, y_likes_test))

In [None]:
# y_retweets[0:10]

In [None]:
def predict_retweets_one_embedding(embdng):
    """Use model to predict based on one embedding."""
    return retweets_model.predict(embdng)[0]

def predict_retweets_one_by_index(embeddings_array, idx):
    """Use model to predict based on one embedding,
    selected by index from a list of embeddings."""
    return ('predicted retweets', 
            predict_retweets_one_embedding(embeddings_array[idx].reshape(1,-1)), 
            'actual', y_retweets[idx])


def predict_likes_one_embedding(embdng):
    """Use model to predict based on one embedding."""
    return likes_model.predict(embdng)[0]

def predict_likes_one_by_index(embeddings_array, idx):
    """Use model to predict based on one embedding,
    selected by index from a list of embeddings."""
    return  ('predicted likes', 
             predict_likes_one_embedding(embeddings_array[idx].reshape(1,-1)),
             y_likes[idx])
             

foo = merged_df.values
predict_retweets_one_by_index(foo,5)
predict_likes_one_by_index(foo,5)