# Generic linear regression model

In [3]:
import numpy as np
import pandas as pd
import requests
import time
import matplotlib.pyplot as plt
import seaborn as sns
from pandasql import sqldf #for manipulating DataFrames with SQL style queries
import ast #for literally evaluating strings that were JSONs

In [10]:
# Load the media export, then discard the row labelled 0 (it holds column
# descriptors, not data) and the id / timestamp / URL columns, which are
# treated as unnecessary here.
df = (
    pd.read_csv('media_objects.csv')
    .drop(index=[0])
    .drop(columns=['media_object_id', 'timestamp', 'media_url'])
)

In [8]:
#RESET INDEX LATER

In [11]:
# Preview the first rows of the frame after the descriptor row and unused columns were dropped
df.head()

Unnamed: 0,caption,comments,comments_count,media_type,like_count,engagement,impressions,reach,saved,video_views
1,"@CheeseGrotto on a crisp fall eve, in a Manhat...",{'data': [{'timestamp': '2014-11-11T00:20:51+0...,1.0,IMAGE,8.0,,,,,
2,"Our #cheesesafe is a beautiful, simple gift fo...",,0.0,IMAGE,6.0,,,,,
3,This rich bloomy rind cheese was made in a @Ch...,{'data': [{'timestamp': '2016-09-09T03:22:34+0...,2.0,IMAGE,10.0,,,,,
4,Our #cheesesafe with an awesome leather handle...,,0.0,IMAGE,8.0,,,,,
5,The goats want to know.....where's the @Cheese...,,0.0,IMAGE,1.0,,,,,


### Engineering features

In [6]:
def get_ht(line):
    """Count the whitespace-separated tokens of `line` that start with '#'.

    Parameters
    ----------
    line : str
        Text to scan (a caption or a comment body).

    Returns
    -------
    int
        Number of tokens whose first character is '#'.
    """
    hashtag_total = 0
    for token in line.split():
        # split() with no argument never yields empty tokens, so token[0] is safe
        if token[0] == '#':
            hashtag_total += 1
    return hashtag_total

# Hashtag count for every caption. Values are stringified first, so a missing
# caption is scanned as ordinary text instead of raising.
fr_cap = list(map(get_ht, df['caption'].astype(str).values))


def _comment_ht_total(raw):
    """Sum the hashtag counts over all comments in one 'comments' cell.

    `raw` is expected to be the string form of a dict whose 'data' entry is a
    list of comments, each with a 'text' field. Any non-string cell counts as 0.
    """
    if type(raw) != str:
        return 0
    return sum(get_ht(entry['text']) for entry in ast.literal_eval(raw)['data'])


# Hashtag count across ALL comments of each post. Comments are in reverse
# chronological order. Counting every comment also picks up hashtags written
# by followers; the presumption is that most come from the poster. To leave out
# hashtags by other commenters, count only the last-listed comment instead:
# get_ht(ast.literal_eval(raw)['data'][-1]['text']).
fr_com = [_comment_ht_total(raw) for raw in df['comments'].values]

# Total hashtags per post = caption hashtags + comment hashtags
df['hashtag_count'] = np.add(fr_cap, fr_com)

In [None]:


#Combine each row's 'from captions' hashtags with its 'from comments' hashtags
#and add the result as a column to the df.
#A new list is built per row instead of extending fr_cap in place, so re-running
#this cell does not append the comment hashtags a second time.
#NOTE(review): this needs fr_cap / fr_com to hold one LIST of hashtags per row.
#The count-returning get_ht in the feature-engineering cell above produces ints,
#for which this cell raises TypeError -- confirm which cell is meant to feed it.
df['hashtags'] = [[*cap_tags, *com_tags] for cap_tags, com_tags in zip(fr_cap, fr_com)]

In [None]:
## FROM STORY STORY


#NOTE(review): this cell reads df['timestamp'], but the loading cell at the top of
#this notebook drops that column -- confirm which frame this cell is meant to run on.

#Index the df with datetimes parsed from the 'timestamp' column
df.set_index(pd.to_datetime(df['timestamp']), inplace = True)
#Reset time zone to US Eastern time
df.index = df.index.tz_convert('US/Eastern')

#Change remaining columns to int type
#NOTE(review): astype(int) fails on text or missing values; the frame previewed
#earlier shows a text caption column and empty metric cells -- verify before running.
df = df.astype(int)

In [None]:
## FROM HASHTAGS HASHTAGS

def get_ht(line):
    """Return the hashtags found in `line`, each with its leading '#' removed.

    Note: this rebinds `get_ht`, replacing the count-returning version defined
    in the feature-engineering cell earlier in the notebook.

    Parameters
    ----------
    line : str
        Text to scan (a caption or a comment body).

    Returns
    -------
    list of str
        One entry per whitespace-separated token starting with '#', in order of
        appearance. A bare '#' yields an empty string.
    """
    tags = []
    for token in line.split():
        if token[0] == '#':
            tags.append(token[1:])
    return tags

#Get hashtags from the caption column
fr_cap = [get_ht(line) for line in df['caption'].astype(str).values]

#Get hashtags from the first comment in the comments column.
#Comments are listed in reverse chronological order, so the first comment that
#was posted is the LAST entry of 'data' (index -1), not index 0.
fr_com = []
for line in df['comments'].values:
    #Non-string cells (no comments) and empty 'data' lists contribute no hashtags
    comments = ast.literal_eval(line)['data'] if type(line) == str else []
    fr_com.append(get_ht(comments[-1]['text']) if comments else [])

#Extend each list in the 'from captions' list by the values in the 'from comments' list
for cap_tags, com_tags in zip(fr_cap, fr_com):
    cap_tags.extend(com_tags)

#Add this as a column to the df
df['hashtags'] = fr_cap