# Generic linear regression model

In [1]:
import numpy as np
import pandas as pd
import requests
import time
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf #for manipulating DataFrames with SQL style queries
import ast #for literally evaluating strings taht were jsons

In [2]:
df = pd.read_csv('media_objects.csv')
df.drop([0], axis = 0, inplace = True) #Drop column descriptors
df.reset_index(inplace = True)
#Drop unnecessary columns
df.drop(['media_object_id','timestamp', 'media_url'], axis = 1, inplace = True)

In [3]:
df.head()

Unnamed: 0,index,caption,comments,comments_count,media_type,like_count,engagement,impressions,reach,saved,video_views
0,1,"@CheeseGrotto on a crisp fall eve, in a Manhat...",{'data': [{'timestamp': '2014-11-11T00:20:51+0...,1.0,IMAGE,8.0,,,,,
1,2,"Our #cheesesafe is a beautiful, simple gift fo...",,0.0,IMAGE,6.0,,,,,
2,3,This rich bloomy rind cheese was made in a @Ch...,{'data': [{'timestamp': '2016-09-09T03:22:34+0...,2.0,IMAGE,10.0,,,,,
3,4,Our #cheesesafe with an awesome leather handle...,,0.0,IMAGE,8.0,,,,,
4,5,The goats want to know.....where's the @Cheese...,,0.0,IMAGE,1.0,,,,,


### Engineering features

In [23]:
#Get count of hashtags
def get_ht(line):
    return sum([1 for word in line.split() if word[0] == '#'])

#Get hashtags from the caption column
fr_cap = [get_ht(line) for line in df['caption'].astype(str).values]

#Get hashtags from the first comment in the comments column
#Comments are in reverse chronological order

#Does not include hashtags by commenter:
#fr_com = [get_ht(ast.literal_eval(line)['data'][-1]['text'])\
#                              if type(line) == str else 0 for line in df['comments'].values ]

#This includes hashtags by followers. Presumption is that most are by poster
fr_com = [max([get_ht(comment['text']) for comment in ast.literal_eval(line)['data']])\
                              if type(line) == str else 0 for line in df['comments'].values ]

#Engineer new column for count of hashtags
df['hashtag_count'] = np.array(fr_cap) + np.array(fr_com)

In [None]:
#Get yes/no for starting emoji

In [28]:
test_text = df['caption'].values[-3]

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1365 entries, 0 to 1364
Data columns (total 12 columns):
index             1365 non-null int64
caption           1364 non-null object
comments          1181 non-null object
comments_count    1365 non-null float64
media_type        1365 non-null object
like_count        1365 non-null float64
engagement        695 non-null object
impressions       695 non-null object
reach             695 non-null object
saved             695 non-null object
video_views       670 non-null float64
hashtag_count     1365 non-null int32
dtypes: float64(3), int32(1), int64(1), object(7)
memory usage: 122.8+ KB


In [68]:
#THIS IS HOW YOU ENCODE AN EMOJI INTO UNICODEb
test_encode = df['caption'].str[0].astype(str)[1362].encode(encoding = 'UTF-8')

In [76]:
test_encode

b'\xf0\x9f\x8d\x93'

In [69]:
type(test_encode)

bytes

In [79]:
str(test_encode)[:6]

"b'\\xf0"

In [82]:
str(test_encode)[:6] == 'b\'\\xf0'

True

In [91]:
df['caption'].str[0].astype(str)[-10:]

1355    🔎
1356    🎁
1357    🎁
1358    💃
1359    🧀
1360    😎
1361    🤔
1362    🍓
1363    F
1364    🌶
Name: caption, dtype: object

In [87]:
def is_emoji(char):
    if str(char.encode(encoding = 'UTF-8'))[:6] == 'b\'\\xf0':
        return True
    else:
        return False
    
begins_with_emoji = [is_emoji(first_char) for first_char in df['caption'].str[0].astype(str)]

In [88]:
sum(begins_with_emoji)

147

In [96]:
comparisons = list(zip(df['caption'].str[0].astype(str), begins_with_emoji))

In [100]:
#Here are the ones that arne't being read as emoji.
[tup for tup in comparisons if tup[1] == False][-100:]

[('S', False),
 ('T', False),
 ('I', False),
 ('N', False),
 ('D', False),
 ('G', False),
 ('H', False),
 ('I', False),
 ('I', False),
 ('H', False),
 ('W', False),
 ('C', False),
 ('F', False),
 ('H', False),
 ('F', False),
 ('T', False),
 ('N', False),
 ('S', False),
 ('H', False),
 ('I', False),
 ('"', False),
 ('@', False),
 ('G', False),
 ('B', False),
 ('S', False),
 ('T', False),
 ('T', False),
 ('T', False),
 ('H', False),
 ('O', False),
 ('T', False),
 ('O', False),
 ('N', False),
 ('T', False),
 ('S', False),
 ('A', False),
 ('S', False),
 ('A', False),
 ('T', False),
 ('H', False),
 ('T', False),
 ('R', False),
 ('T', False),
 ('K', False),
 ('C', False),
 ('D', False),
 ('O', False),
 ('F', False),
 ('C', False),
 ('H', False),
 ('W', False),
 ('C', False),
 ('T', False),
 ('H', False),
 ('I', False),
 ('R', False),
 ('T', False),
 ('C', False),
 ('T', False),
 ('B', False),
 ('F', False),
 ('W', False),
 ('@', False),
 ('I', False),
 ('O', False),
 ('W', False),
 ('V', Fal

# RESUME HERE FOR ENCODING EMOJI. Some are not being read as emoji yet with the string slice I'm using.

In [None]:
## FROM STORY STORY


#Index the df with datetimes
df.set_index(pd.to_datetime(df['timestamp']), inplace = True)
#Reset time zone to US Eastern time
df.index = df.index.tz_convert('US/Eastern')

#Change remaining columns to int type
df = df.astype(int)

In [None]:
## FROM HASHTAGS HASHTAGS

def get_ht(line):
    return [word[1:] for word in line.split() if word[0] == '#']

#Get hashtags from the caption column
fr_cap = [get_ht(line) for line in df['caption'].astype(str).values]
#Get hashtags from the first comment in the comments column
fr_com = [get_ht(ast.literal_eval(line)['data'][0]['text'])\
                              if type(line) == str else [] for line in df['comments'].values ]
#Extend each list in the 'from captions' list by the values in the 'from comments' list
[fr_cap[i].extend(fr_com_i) for i, fr_com_i in enumerate(fr_com)]

#Add this as a column to the df
df['hashtags'] = fr_cap