# Change Working Directory

In [102]:
import pandas as pd
import os
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
abspath = os.path.abspath('OO_Importing_Data.py') # Get filepath
dname = os.path.dirname(abspath) # Get directory
os.chdir(dname) # Make directory working directory

# EDA

## Reading in the Data

In [3]:
data = pd.read_json('assets/newdump.json')

## Splitting up the 'Channel Info' dictionaries into seperate columns

In [4]:
data['channel_type'] = [x['type'] for x in data['channel_info']]
data['channel'] = [x['channel'] for x in data['channel_info']]

In [5]:
data.drop('channel_info', axis = 1, inplace=True)

## Showing only Facebook and Instagram Data

In [6]:
FB_and_IG_data = data.loc[(data['channel'] == 'facebook') | (data['channel'] == 'instagram')]

## Removing '' and [] from 'type' column for queries

In [7]:
FB_and_IG_data['channel_type'] = FB_and_IG_data['channel_type'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Breaking down number of entries for each type of post. Looks like Facebook is a clear winner

In [8]:
FB_and_IG_data['type'].value_counts()

facebook post    249485
instagram pic     17526
instagram vid      2664
Name: type, dtype: int64

## Replacing Values in 'brand' with the actual publication

In [9]:
FB_and_IG_data['brand'].value_counts()

137322    46545
137326    37444
137329    32608
137299    31680
137316    28308
137325    25983
137321    24165
137314    22100
137300    20842
Name: brand, dtype: int64

Found these by plugging urls into google and seeing what showed up
* Brand 137314 = Conde Naste Traveler
* Brand 137329 = W Magazine
* Brand 137321 = OnSelf Magazine
* Brand 137325 = Vanity Fair
* Brand 137300 = Clever
* Brand 137322 = Teen Vogue
* Brand 137299 = Allure
* Brand 137326 = Vogue
* Brand 137316 = Glamor

In [10]:
brands = {137314 : 'Conde_Naste_Traveler', 
          137329 : 'W_Magazine',
          137321 : 'Onself',
          137325 : 'Vanity_Fair', 
          137300 : 'Clever', 
          137322 : 'Teen_Vogue', 
          137299 : 'Allure', 
          137326 : 'Vogue',137316 : 'Glamor'
         }
FB_and_IG_data['brand'] = FB_and_IG_data['brand'].map(brands)                                                     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


## Taking a subset of just the Instagram data

In [11]:
instagram = FB_and_IG_data.loc[FB_and_IG_data['type'].isin(['instagram pic', 'instagram vid'])]

In [12]:
instagram = instagram.reset_index(drop=True)

## Getting data from the content column

In [13]:
instacontent = instagram['content']

In [14]:
sum([instacontent[i].keys() == instacontent[0].keys() for i in range(len(instacontent))])
#Seeing how many image entries we have.

17526

In [15]:
sum([instacontent[i].keys() != instacontent[0].keys() for i in range(len(instacontent))])
#Seeing how many video entries we have.

2664

## Splitting Instagram data into seperate image and video frames

In [16]:
instapics = instagram.loc[(instagram['type'] == 'instagram pic')]
instapics = instapics.reset_index(drop=True)

In [17]:
instavids = instagram.loc[(instagram['type'] == 'instagram vid')]
instavids = instavids.reset_index(drop=True)

In [18]:
instapicscontent = instapics['content']

In [19]:
picframe = pd.DataFrame.from_dict(instapicscontent)

In [20]:
instavidscontent = instavids['content']

In [21]:
vidframe = pd.DataFrame.from_dict(instavidscontent)

In [22]:
picframe['caption']  = [x['caption'] for x in instapics['content']]
picframe['comment_count']  = [x['comment_count'] for x in instapics['content']]
picframe['filter_name'] = [x['filter_name'] for x in instapics['content']]
picframe['hashtags'] = [x['hashtags'] for x in instapics['content']]
picframe['image_url'] = [x['image_url'] for x in instapics['content']]
picframe['like_count'] = [x['like_count'] for x in instapics['content']]
picframe['link'] = [x['link'] for x in instapics['content']]
picframe['links'] = [x['links'] for x in instapics['content']]
picframe['post_id'] = [x['post_id'] for x in instapics['content']]

In [23]:
picframe.drop(['content'], axis = 1, inplace = True)
picframe = picframe.reset_index(drop=True)
picframe.head()

Unnamed: 0,caption,comment_count,filter_name,hashtags,image_url,like_count,link,links,post_id
0,@JLo & @ARod isn't just another love story; it...,164,Normal,,https://scontent.cdninstagram.com/t51.2885-15/...,22678,https://www.instagram.com/p/BbikmregLBQ/,[],1649041401268187216_11206038
1,We're still not over all the cute moments 😍 a...,3,Normal,GlamourWOTY,https://scontent.cdninstagram.com/t51.2885-15/...,512,https://www.instagram.com/p/BbibiLwDKPX/,[],1649001509906326487_10070230
2,"It's no secret how much we love makeup, and if...",11,Normal,EditorsFaves,https://scontent.cdninstagram.com/t51.2885-15/...,1117,https://www.instagram.com/p/BbiXLd9BWmc/,[],1648982356717300124_24852591
3,Amber Heard has one simple trick for narrowing...,101,Normal,,https://scontent.cdninstagram.com/t51.2885-15/...,16608,https://www.instagram.com/p/BbiQFGHgTMh/,[],1648951132480942881_11206038
4,By painting the home office of a Nolita loft b...,37,Normal,linkinbio,https://scontent.cdninstagram.com/t51.2885-15/...,6508,https://www.instagram.com/p/BbiLW7MHu6Y/,[],1648930367465254552_239180867


In [125]:
picframe.links.value_counts

<bound method IndexOpsMixin.value_counts of 0                 []
1                 []
2                 []
3                 []
4                 []
5                 []
6                 []
7         [wmag.com]
8                 []
9                 []
10                []
11                []
12                []
13                []
14                []
15                []
16                []
17                []
18                []
19                []
20                []
21                []
22                []
23                []
24                []
25                []
26                []
27                []
28                []
29                []
            ...     
17496             []
17497             []
17498             []
17499    [wmag.com.]
17500             []
17501             []
17502    [a:Beacon,]
17503             []
17504             []
17505             []
17506             []
17507             []
17508             []
17509             []
17510      

In [127]:
re.sub(r'[^\x00-\x7F]', '', picframe.links)

TypeError: expected string or bytes-like object

In [24]:
vidframe['caption'] = [x['caption'] for x in instavids['content']]
vidframe['comment_count'] = [x['comment_count'] for x in instavids['content']]
vidframe['filter_name'] = [x['filter_name'] for x in instavids['content']]
vidframe['hashtags'] = [x['hashtags'] for x in instavids['content']]
vidframe['image_url'] = [x['image_url'] for x in instavids['content']]
vidframe['like_count'] = [x['like_count'] for x in instavids['content']]
vidframe['link'] = [x['link'] for x in instavids['content']]
vidframe['links'] = [x['links'] for x in instavids['content']]
vidframe['post_id'] = [x['post_id'] for x in instavids['content']]
vidframe['video_url'] = [x['video_url'] for x in instavids['content']]

In [25]:
vidframe.drop(['content'], axis = 1, inplace = True)
vidframe = vidframe.reset_index(drop=True)
vidframe.head()

Unnamed: 0,caption,comment_count,filter_name,hashtags,image_url,like_count,link,links,post_id,video_url
0,We asked people to explain in their own words ...,1,Normal,linkinbio health wellness birthcontrol,https://scontent.cdninstagram.com/t51.2885-15/...,103,https://www.instagram.com/p/BbiFr_fHgFs/,[],1648905426908873068_13622784,https://scontent.cdninstagram.com/t50.2886-16/...
1,@traceeellisross breaks down in a powerful tal...,110,Normal,linkinbio glamourwoty,https://scontent.cdninstagram.com/t51.2885-15/...,3750,https://www.instagram.com/p/Bbh6nihDSkb/,[],1648856742413281563_10070230,https://scontent.cdninstagram.com/t50.2886-16/...
2,@carmenelectra has a closet full of memories. ...,78,Normal,,https://scontent.cdninstagram.com/t51.2885-15/...,13677,https://www.instagram.com/p/BbhQME6ADHH/,[],1648670137240269255_198154074,https://scontent.cdninstagram.com/t50.2886-16/...
3,Just take all of my money... 💸💄,540,Normal,,https://scontent.cdninstagram.com/t51.2885-15/...,28393,https://www.instagram.com/p/BbhFhVan5qm/,[],1648623219564845734_4073479,https://scontent.cdninstagram.com/t50.2886-16/...
4,"What's @fullfrontalsamb's superpower? ""Super t...",23,Normal,glamourwoty,https://scontent.cdninstagram.com/t51.2885-15/...,1133,https://www.instagram.com/p/Bbf_hXfDXnh/,[],1648315358530861537_10070230,https://scontent.cdninstagram.com/t50.2886-16/...


In [90]:
pichash = picframe['hashtags'].value_counts()
pichash = pichash.iloc[1:]
pichash = pichash.astype(str)

In [91]:
vidhash = vidframe['hashtags'].value_counts()
vidhash = vidhash.iloc[1:]
vidhash = vidhash.astype(str)

In [94]:
# from sklearn.feature_extraction.text import CountVectorizer

# picvect = CountVectorizer(analyzer = "word",   
#                              tokenizer = None,    
#                              preprocessor = None, 
#                              stop_words = 'english',   
#                              max_features = 10000,
#                              min_df = 1)

# vidvect = CountVectorizer(analyzer = "word",   
#                              tokenizer = None,    
#                              preprocessor = None, 
#                              stop_words = 'english',   
#                              max_features = 10000,
#                              min_df = 1)

In [95]:
# picthashcorp = picvect.fit_transform(pichash)
# pichashcorparray = picthashcorp.toarray()

# vidhashcorp = picvect.fit_transform(vidhash)
# vidhashcorparray = vidhashcorp.toarray()

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# picclf = KNeighborsClassifier(n_neighbors=5)
# picclf.fit(pichashcorparray, ) #Needs a y value

# Modeling Functions (WORK IN PROGRESS!)

In [99]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

def InstaVect(wordseries, Y):
    try:
        wordseries = wordseries.astype(str)
    except ValueError:
        print("ERROR: Entries must be string or list of strings.")
    
    vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = 'english',   
                             max_features = 10000,
                             min_df = 1)
    corpus = vectorizer.fit_transform(wordseries)
    corpus = corpus.to_array
    
    classifier = KNeighborsClassifier(n_neighbors=5)
    
    return classifier.fit(corpus, y=Y)

#Don't know if this works

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn import datasets, linear_model

def SLR(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    lm = linear_model.LinearRegression()
    model = lm.fit(X_train, y_train)
    predictions = lm.predict(X_test)
        
    scores = cross_val_score(model, X_train, y_train, cv=6)
    predictions = cross_val_predict(model, df, y, cv=6)
    accuracy = metrics.r2_score(y, predictions)
    
    plt.scatter(y, predictions)
    plt.show()    
    
    print("Cross-Predicted Accuracy:", accuracy)
    
#Ditto. Getting an array error. Hate that one.