# Generic linear regression model

In [1]:
import numpy as np
import pandas as pd
import requests
import time
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf #for manipulating DataFrames with SQL style queries
import ast #for literally evaluating strings taht were jsons

In [21]:
#Load the raw export. Row 0 holds column descriptors rather than a post, so it is
#removed and the remaining rows are renumbered from 0.
df = pd.read_csv('media_objects.csv')
df = df.drop(index=0).reset_index(drop=True)

#Drop unnecessary columns
df = df.drop(columns=['media_object_id', 'timestamp', 'media_url'])

#Captions are the source of the text features engineered below, so force them to str
#(a missing caption becomes the literal string 'nan')
df['caption'] = df['caption'].astype(str)

#These columns are read in as 'object'; treat missing values as 0 and cast to int
to_int = ['engagement', 'impressions', 'reach', 'saved', 'video_views']
df[to_int] = df[to_int].fillna(0).astype(int)

In [24]:
#Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,caption,comments,comments_count,media_type,like_count,engagement,impressions,reach,saved,video_views
0,"@CheeseGrotto on a crisp fall eve, in a Manhat...",{'data': [{'timestamp': '2014-11-11T00:20:51+0...,1.0,IMAGE,8.0,0,0,0,0,0
1,"Our #cheesesafe is a beautiful, simple gift fo...",,0.0,IMAGE,6.0,0,0,0,0,0
2,This rich bloomy rind cheese was made in a @Ch...,{'data': [{'timestamp': '2016-09-09T03:22:34+0...,2.0,IMAGE,10.0,0,0,0,0,0
3,Our #cheesesafe with an awesome leather handle...,,0.0,IMAGE,8.0,0,0,0,0,0
4,The goats want to know.....where's the @Cheese...,,0.0,IMAGE,1.0,0,0,0,0,0


## Engineering features
### Hashtag  count:

In [25]:
#Count the hashtags in a piece of text
def get_ht(line):
    """Return how many whitespace-separated words in `line` start with '#'.

    Only the first character of each word is inspected, so 'a#b' counts as
    zero hashtags and a lone '#' counts as one.

    NOTE: a scratch cell near the end of this notebook defines another
    `get_ht` that returns the hashtags themselves; running that cell
    shadows this counting version.
    """
    hashtag_words = [word for word in line.split() if word.startswith('#')]
    return len(hashtag_words)

#Count the hashtags in each caption
fr_cap = [get_ht(line) for line in df['caption'].astype(str).values]

#Count the hashtags in the comments column.
#Each non-null cell is a stringified payload shaped like {'data': [{'text': ...}, ...]}
#with comments in reverse chronological order, so the first comment is data[-1].
#Reading only that one (get_ht(ast.literal_eval(line)['data'][-1]['text'])) would
#not include hashtags added by other commenters. Instead, take the largest hashtag
#count found in any single comment; this includes hashtags by followers, the
#presumption being that most are by the poster.
#default=0 keeps a payload whose 'data' list is empty from raising ValueError in max().
#NOTE(review): ast.literal_eval cannot parse JSON literals such as true/false/null --
#confirm none of the stored payloads contain them.
fr_com = [
    max((get_ht(comment['text']) for comment in ast.literal_eval(line)['data']), default=0)
    if isinstance(line, str) else 0  #rows without comments hold NaN, not a string
    for line in df['comments'].values
]

#Engineer new column for count of hashtags (caption count + comment count)
df['hashtag_count'] = np.array(fr_cap) + np.array(fr_com)

### Caption starts with emoji yes/no

In [26]:
def is_emoji(char):
    """Heuristically report whether a string starts with an emoji.

    Only the first byte of the UTF-8 encoding is examined. It is treated as
    an emoji lead byte when it is 0xF0 (start of a 4-byte sequence, where
    most emoji are encoded) or 0xE2 (start of the 3-byte sequences for
    U+2000-U+2FFF, which holds symbols such as U+2B50 "star").

    This is a heuristic: 0xE2 also starts non-emoji characters in that
    range (e.g. curly quotes and dashes), so those are reported as emoji.

    Parameters
    ----------
    char : str
        Text to test; normally the first character of a caption. Longer
        strings are accepted and only their first character matters.

    Returns
    -------
    bool
        True if the first byte matches one of the lead bytes above. False
        otherwise, including for an empty string or a non-string value
        such as NaN.
    """
    emoji_lead_bytes = (b'\xf0', b'\xe2')
    #A missing value (e.g. float NaN) has no text, so it cannot begin with an emoji
    if not isinstance(char, str):
        return False
    #Compare the raw first byte. Comparing a prefix of str(bytes) is fragile because
    #the repr switches to double quotes when the text contains an apostrophe.
    return char.encode('utf-8')[:1] in emoji_lead_bytes
    
#Flag captions whose first character looks like an emoji
first_chars = df['caption'].str[0]
df['begins_with_emoji'] = [is_emoji(first_char) for first_char in first_chars]

### Caption contains question yes/no

In [27]:
#True where the caption contains a literal '?' anywhere (regex = False, so '?' is not a pattern)
df['has_q'] = df['caption'].str.contains('?', regex = False)

### Caption character count

In [28]:
#Strip trailing hashtags/usernames from a caption before its characters are counted
def cut_tags(line):
    """Return `line` without its trailing run of '#'/'@' words.

    The caption is split on whitespace and words are removed from the end
    for as long as the last one starts with '#' or '@'. Whatever is left is
    re-joined with single spaces, so runs of whitespace and newlines in the
    kept part are collapsed. If nothing is left (the caption was only
    hashtags/usernames, or was empty), the caption is returned unchanged.
    """
    tokens = line.split()
    #Peel tag-like words off the end until a regular word is reached
    while tokens and tokens[-1].startswith(('#', '@')):
        tokens.pop()
    if not tokens:
        return line
    return ' '.join(tokens)

#Character count of each caption once its trailing hashtags/usernames are removed
trimmed_lengths = [len(cut_tags(text)) for text in df['caption']]
df['char_count'] = trimmed_lengths

In [191]:
#Show only the rows whose media_type is 'VIDEO'
df[df['media_type'] == 'VIDEO']

Unnamed: 0,caption,comments,comments_count,media_type,like_count,engagement,impressions,reach,saved,video_views,hashtag_count,begins_with_emoji,has_q,char_count
53,Making that mozz that will be transformed into...,,0.0,VIDEO,6.0,,,,,,2,False,False,144
95,Live jazz and house made cheeses in Brooklyn M...,{'data': [{'timestamp': '2015-04-28T18:48:03+0...,2.0,VIDEO,23.0,,,,,,12,False,False,197
272,Now #ontheblog and #onthemind. Fresh goat disc...,{'data': [{'timestamp': '2015-09-24T04:10:30+0...,6.0,VIDEO,34.0,,,,,,12,False,False,260
325,The people went crazy for the #Cheesecaramels ...,{'data': [{'timestamp': '2015-11-06T06:13:19+0...,2.0,VIDEO,27.0,,,,,,10,False,False,234
344,Engineering the cooling unit for the temperatu...,{'data': [{'timestamp': '2015-11-23T00:38:24+0...,1.0,VIDEO,19.0,,,,,,7,False,False,123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1322,"#Repost @food52 ---\r\n""Welcome to cheese heav...","{""data"":[{""timestamp"":""2019-10-23T00:07:19+000...",4.0,VIDEO,48.0,62,1728,1434,7,434.0,21,False,False,490
1329,"👌 ""This smaller version of your Cheese Grotto ...","{""data"":[{""timestamp"":""2019-11-01T21:01:09+000...",2.0,VIDEO,30.0,34,1084,890,2,219.0,20,True,False,444
1341,"🤩""Your cheese gets its own special house."" We ...","{""data"":[{""timestamp"":""2019-11-24T18:14:48+000...",1.0,VIDEO,37.0,42,1197,1048,4,406.0,30,True,False,113
1346,"🙏For #givingtuesday, we're focusing on all the...","{""data"":[{""timestamp"":""2019-12-05T13:03:09+000...",2.0,VIDEO,75.0,77,1190,1036,0,605.0,2,True,False,363


In [30]:
#Keep just the numeric columns for the correlation check
numeric_cols = ['comments_count', 'like_count', 'engagement',
                'impressions', 'hashtag_count', 'char_count']
only_num = df[numeric_cols]

In [41]:
#Rows with a non-zero engagement value
has_engagement = only_num['engagement'] != 0
num_engagement = only_num[has_engagement]

In [37]:
#Rows with a non-zero impressions value
has_impressions = only_num['impressions'] != 0
num_impressions = only_num[has_impressions]

In [39]:
#(rows, columns) of the non-zero-impressions subset
num_impressions.shape

(450, 6)

In [43]:
#Pairwise correlation between the numeric columns, restricted to rows with non-zero impressions
num_impressions.corr()

Unnamed: 0,comments_count,like_count,engagement,impressions,hashtag_count,char_count
comments_count,1.0,0.095125,0.132555,0.392574,-0.002786,0.156402
like_count,0.095125,1.0,0.997911,0.233482,-0.401967,0.052722
engagement,0.132555,0.997911,1.0,0.259419,-0.402942,0.05667
impressions,0.392574,0.233482,0.259419,1.0,0.063324,0.004422
hashtag_count,-0.002786,-0.401967,-0.402942,0.063324,1.0,-0.038203
char_count,0.156402,0.052722,0.05667,0.004422,-0.038203,1.0


Only a couple of the numerical features aren't themselves measures of engagement/impressions. Can I see a correlation with the tuple features?

In [44]:
#Log-transform to reduce skew and set a similar scale for each feature
def log_tr(sr):
    """Return the natural log of each value in `sr`, leaving zeros as 0.

    Zeros are passed through unchanged so the result never contains '-inf'.
    The values are collected into a new pd.Series, so the result carries a
    fresh 0..n-1 index rather than the index of `sr`.
    """
    transformed = []
    for value in sr:
        #log(0) is -inf, so keep 0 as it is
        transformed.append(np.log(value) if value != 0 else 0)
    return pd.Series(transformed)

In [None]:
#NOTE(review): `features` and `target` are not defined in any cell visible in this
#notebook, and 'has_logged_in' is not one of the columns built above -- this cell
#looks carried over from another analysis; confirm before running.
#Log-transform every column of `features` except the excluded one
cols_to_tr = list(features.columns)
cols_to_tr.remove('has_logged_in') #This is a categorical yes/no feature

#One log-transformed Series per remaining column, plus the log-transformed target
features_tr = pd.DataFrame({column: log_tr(features[column]) for column in cols_to_tr})
target_tr = log_tr(target)

In [193]:
#USE THIS when ignoring rows that don't have engagement or impressions
#NOTE(review): the result is only displayed, not assigned, so df itself is unchanged.
#The load cell now fills missing engagement/impressions/reach/saved/video_views with 0,
#so after that cell these columns no longer contain NaN and this call drops rows based
#on NaN in the remaining columns instead -- confirm this is still the intended filter.
df.dropna(axis = 0)

Unnamed: 0,caption,comments,comments_count,media_type,like_count,engagement,impressions,reach,saved,video_views,hashtag_count,begins_with_emoji,has_q,char_count
670,The majestic Stilton in all its glory. #cheese...,{'data': [{'timestamp': '2016-10-18T18:14:03+0...,5.0,IMAGE,99.0,104,0,0,0,0.0,15,False,False,38
671,Cool Grotto shoot today in this beautiful kitc...,{'data': [{'timestamp': '2016-11-04T21:58:07+0...,9.0,IMAGE,58.0,67,0,0,0,0.0,1,False,False,74
672,Make it fall official with L' Amuse Gouda. #ch...,{'data': [{'timestamp': '2016-10-22T00:02:29+0...,7.0,IMAGE,94.0,101,0,0,0,0.0,15,False,False,42
673,"Sometimes, we forget there are people behind t...",{'data': [{'timestamp': '2016-10-21T14:18:23+0...,1.0,IMAGE,37.0,38,0,0,0,0.0,9,False,False,325
674,"Pumpkins are here, the Grotto is coming; oh, w...",{'data': [{'timestamp': '2016-10-26T03:46:08+0...,4.0,IMAGE,81.0,85,0,0,0,0.0,15,False,False,132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1347,👩‍🏫 You ready to take #officeculture up a notc...,"{""data"":[{""timestamp"":""2019-12-04T18:19:01+000...",2.0,CAROUSEL_ALBUM,65.0,71,1244,940,4,0.0,29,True,True,363
1351,💫 Absolutely gorgeous video of our Cheese Grot...,"{""data"":[{""timestamp"":""2019-12-14T04:06:14+000...",4.0,VIDEO,40.0,47,851,789,3,215.0,30,True,False,200
1353,🎄 This weekend is the last chance for two thin...,"{""data"":[{""timestamp"":""2019-12-13T17:06:07+000...",2.0,CAROUSEL_ALBUM,52.0,55,904,700,1,0.0,30,True,False,293
1354,⭐ Looking for that classic #cheesegift for yea...,"{""data"":[{""timestamp"":""2019-12-15T17:27:24+000...",1.0,CAROUSEL_ALBUM,52.0,53,1052,823,0,0.0,31,True,True,266


In [27]:
#FOR TROUBLESHOOTING EMOJI FEATURE

#THIS IS HOW YOU ENCODE AN EMOJI INTO UTF-8 BYTES
#test_encode = df['caption'].str[0].astype(str)[1362].encode(encoding = 'UTF-8')


#begins_with_emoji = [is_emoji(first_char) for first_char in df['caption'].str[0].astype(str)]
#comparisons = list(zip(df['caption'].str[0].astype(str), begins_with_emoji))
#Here are the ones that aren't being read as emoji.
#not_emoji = [tup for tup in comparisons if tup[1] == False]

In [None]:
## FROM STORY STORY
#NOTE(review): scratch cell that has not been executed here. The load cell at the top
#of this notebook drops 'timestamp' from df, so df['timestamp'] below would raise
#KeyError if this runs after it. df.astype(int) is applied to every column, which
#would fail on text columns such as 'caption'. Confirm whether this cell belongs here.


#Index the df with datetimes
df.set_index(pd.to_datetime(df['timestamp']), inplace = True)
#Reset time zone to US Eastern time
#(presumably the parsed timestamps are timezone-aware -- tz_convert needs that; verify)
df.index = df.index.tz_convert('US/Eastern')

#Change remaining columns to int type
df = df.astype(int)

In [None]:
## FROM HASHTAGS HASHTAGS

def get_ht(line):
    """Return the hashtags in `line` as a list of strings without the leading '#'.

    A word counts as a hashtag when its first character is '#'; a lone '#'
    therefore yields an empty string.

    NOTE: this has the same name as the hashtag-counting helper defined
    earlier in the notebook and replaces it once this cell is run.
    """
    tags = []
    for token in line.split():
        if token.startswith('#'):
            tags.append(token[1:])
    return tags

#Hashtags found in each caption
fr_cap = [get_ht(text) for text in df['caption'].astype(str).values]
#Hashtags found in the comment stored at data[0] of each comments payload
#(rows without comments hold a non-string value and contribute no hashtags)
#NOTE(review): the hashtag-count cell earlier in this notebook says comments are in
#reverse chronological order, which would make data[0] the newest comment rather
#than the first one -- confirm which comment is intended.
fr_com = [get_ht(ast.literal_eval(payload)['data'][0]['text']) if type(payload) == str else []
          for payload in df['comments'].values]
#Append each row's comment hashtags to that row's caption hashtags, in place
for caption_tags, comment_tags in zip(fr_cap, fr_com):
    caption_tags.extend(comment_tags)

#Add this as a column to the df
df['hashtags'] = fr_cap