# Instagram Post insights analysis

## This is an insight generator tool for Instagram posts

1. Download your Instagram analytic data by going to your Instagram account setting then go to > privacy and security option > Data Download > Request Download.
2. Create the folder "assets" in the root directory of the project.
3. Extract the "past_instagram_insights" folder from this downloaded dataset in step 1 to this "assets" folder from step 2
4. Copy and paste another folder "media" (which should be found in the downloaded dataset from step 2) to the path "assets/past_instagram_insights/"


TODO:

1. Analyze the comments
2. Analyze Videos
   1. Analyze audio from videos and Reels


## Load the json data and convert it to a CSV


In [196]:
import json
import csv
from datetime import datetime

# [convert_json_to_csv] converts the given json file to csv


def convert_json_to_csv(parent_folder_path, file_name, top_json_key):

    # Opening JSON file and loading the data
    # into the variable data
    with open(parent_folder_path+file_name+'.json') as json_file:
        data = json.load(json_file)

    json_data = data[top_json_key]

    # now we will open a file for writing
    data_file = open(parent_folder_path+file_name+'.csv', 'w')

    # create the csv writer object
    csv_writer = csv.writer(data_file)

    # Counter variable used for writing
    # headers to the CSV file
    count = 0

    for data in json_data:
        if count == 0:
            # Writing headers of CSV file
            #header = list(data.keys())
            csv_writer.writerow(
                ['title', 'post_media', 'creation_timestamp', 'post_likes'])
            count += 1

        # Writing data of CSV fileu
        # cell = [data['media_map_data']['Media Thumbnail']['title'], path_to_image_html(parent_folder_path, data['media_map_data']['Media Thumbnail']['uri']),
        #         data['media_map_data']['Media Thumbnail']['creation_timestamp'], data['string_map_data']['Likes']['value']]
        cell = [data['media_map_data']['Media Thumbnail']['title'], data['media_map_data']['Media Thumbnail']['uri'],
                datetime.fromtimestamp(data['media_map_data']['Media Thumbnail']['creation_timestamp']), data['string_map_data']['Likes']['value']]
        csv_writer.writerow(cell)

    data_file.close()


## Call the convert_json_to_csv function to convert json to csv


In [197]:
import pandas as pd

parent_folder_path = 'assets/past_instagram_insights/'
file_name = 'posts'

# Convert the json into CSV
convert_json_to_csv(parent_folder_path, file_name, 'organic_insights_posts')

# Load this saved csv
csv = pd.read_csv(parent_folder_path + file_name+'.csv')


## Load this saved csv


In [198]:
import pandas as pd

# Load the saved csv
df = pd.read_csv(parent_folder_path + file_name+'.csv', index_col=False)

print('Shape: ', df.shape)

# Limit the rows for testing
# df = df.head(15)
df


Shape:  (540, 4)


Unnamed: 0,title,post_media,creation_timestamp,post_likes
0,Python â¢\nTag Your Programmer groups â¢\nâ...,media/posts/201912/69503764_1316105261901993_3...,2019-12-03 21:15:34,19
1,@multiverseapp Follow us @multiverseapp for mo...,media/posts/201912/79984281_434751277472386_56...,2019-12-02 22:33:00,19
2,Beautiful Onboarding animation concept of a 'I...,media/posts/201912/FQIoAkMzLBdAG3bItDlYEBgSZGF...,2019-12-02 22:30:26,30
3,@multiverseapp Follow @multiverseapp for daily...,media/posts/201912/79862581_805719879875319_32...,2019-12-02 22:28:03,19
4,@welovewebdesign by Miroslav SkoÄdopoly\nFoll...,media/posts/201912/78766756_198610067846038_40...,2019-12-02 22:04:20,18
...,...,...,...,...
535,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/147398703_249918756774677_6...,2021-02-06 21:00:30,24
536,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145893246_179494400637566_8...,2021-02-05 21:01:06,18
537,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145403028_1579481252439746_...,2021-02-04 21:00:37,13
538,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145495364_846748032724421_7...,2021-02-03 21:00:35,10


## Fetch hashtags from the title


In [199]:
titles = df.title

all_hashtags = []

# Go theough the title, and saperate out all the hashtags
for i in range(len(titles)):
    # Define an empty hashtag list
    # If no hashtag is found, this empty list is returned
    hashtags_column = []
    title = titles[i]

    if (isinstance(title, str) and '#' in title):
        first_hashtag = title.find('#')
        title = title[first_hashtag:]
        hashtags_column = title.split('#')
        hashtags_column.remove('')

        # remove blank spaces
        for j in range(len(hashtags_column)):
            hashtags_column[j] = hashtags_column[j].replace(' ', '')

    all_hashtags.append(hashtags_column)

# Go through this column to get the most occuring hashtags
hashtags_dict = {}
for hashtag_cell in all_hashtags:
    for single_hashtag in hashtag_cell:
        if single_hashtag not in hashtags_dict:
            hashtags_dict[single_hashtag] = 1
        else:
            hashtags_dict[single_hashtag] += 1

# Rearrange the dict to have the highest occurance first
hashtags_dict = {k: v for k, v in sorted(
    hashtags_dict.items(), key=lambda item: item[1], reverse=True)}

# Get the first n hashtags from the most used hasttags
max_hashtag_count = 5
most_occuring_hashtags = list(hashtags_dict.keys())[:max_hashtag_count]
print('Most occuring hashtags', most_occuring_hashtags)

# Go through hashtags from the dataframe and set the
hashtag_distribution = []
hashtag_distribution

# Add these hashtags as new columns
# add appropriate values for the posts in which these hashtags occur
for row_hashtags in all_hashtags:
    hashtag_distribution_row = []
    for most_occuring_hashtag in most_occuring_hashtags:
        if most_occuring_hashtag in row_hashtags:
            hashtag_distribution_row.append(1)
        else:
            hashtag_distribution_row.append(0)
    hashtag_distribution.append(hashtag_distribution_row)

# Add this occurance frequency to the dataframe
for most_occuring_hashtag in most_occuring_hashtags:
    df[most_occuring_hashtags] = hashtag_distribution

df


Most occuring hashtags ['uitrends', 'appdesign', 'mobiledesign', 'dailyui', 'userinterfacedesign']


Unnamed: 0,title,post_media,creation_timestamp,post_likes,uitrends,appdesign,mobiledesign,dailyui,userinterfacedesign
0,Python â¢\nTag Your Programmer groups â¢\nâ...,media/posts/201912/69503764_1316105261901993_3...,2019-12-03 21:15:34,19,0,0,0,0,0
1,@multiverseapp Follow us @multiverseapp for mo...,media/posts/201912/79984281_434751277472386_56...,2019-12-02 22:33:00,19,1,1,1,0,1
2,Beautiful Onboarding animation concept of a 'I...,media/posts/201912/FQIoAkMzLBdAG3bItDlYEBgSZGF...,2019-12-02 22:30:26,30,0,0,0,0,0
3,@multiverseapp Follow @multiverseapp for daily...,media/posts/201912/79862581_805719879875319_32...,2019-12-02 22:28:03,19,0,0,0,0,0
4,@welovewebdesign by Miroslav SkoÄdopoly\nFoll...,media/posts/201912/78766756_198610067846038_40...,2019-12-02 22:04:20,18,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
535,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/147398703_249918756774677_6...,2021-02-06 21:00:30,24,1,1,1,1,1
536,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145893246_179494400637566_8...,2021-02-05 21:01:06,18,1,1,1,1,1
537,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145403028_1579481252439746_...,2021-02-04 21:00:37,13,1,1,1,1,1
538,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145495364_846748032724421_7...,2021-02-03 21:00:35,10,1,1,1,1,1


## Saperate all the date feilds


In [200]:
dates = df.creation_timestamp

weekdays = []
times = []

for i in range(len(dates)):
    date = datetime.fromisoformat(dates[i])
    # Add day of the week as an integer, where Monday is 0 and Sunday is 6.
    weekdays.append(date.weekday())

    # Add time as a float
    times.append(date.time().hour + (date.time().minute / 60))

df['post_weekday'] = weekdays
df['post_time'] = times


## Get Dominant color from an image


In [201]:
from colorthief import ColorThief
import webcolors

# To find the closest colour name https://stackoverflow.com/a/9694246/6559381


def closest_colour(requested_colour):
    min_colours = {}
    for key, name in webcolors.css3_hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - requested_colour[0]) ** 2
        gd = (g_c - requested_colour[1]) ** 2
        bd = (b_c - requested_colour[2]) ** 2
        min_colours[(rd + gd + bd)] = name
    return min_colours[min(min_colours.keys())]


post_media = df.post_media

dominant_colors = []

for i in range(len(post_media)):
    # Get the image from the list of images
    image = post_media[i]

    # Get the dominant color
    print('Finding the dominant color in the image: ', i)
    if '.jpg' in image:
        color_thief = ColorThief(parent_folder_path+image)
        dominant_color = color_thief.get_color(quality=1)
        dominant_colors.append(closest_colour((dominant_color)))
    else:
        dominant_colors.append('video')

df['post_dominant_colors'] = dominant_colors
df


Finding the dominant color in the image:  0
Finding the dominant color in the image:  1
Finding the dominant color in the image:  2
Finding the dominant color in the image:  3
Finding the dominant color in the image:  4
Finding the dominant color in the image:  5
Finding the dominant color in the image:  6
Finding the dominant color in the image:  7
Finding the dominant color in the image:  8
Finding the dominant color in the image:  9
Finding the dominant color in the image:  10
Finding the dominant color in the image:  11
Finding the dominant color in the image:  12
Finding the dominant color in the image:  13
Finding the dominant color in the image:  14
Finding the dominant color in the image:  15
Finding the dominant color in the image:  16
Finding the dominant color in the image:  17
Finding the dominant color in the image:  18
Finding the dominant color in the image:  19
Finding the dominant color in the image:  20
Finding the dominant color in the image:  21
Finding the dominant

Unnamed: 0,title,post_media,creation_timestamp,post_likes,uitrends,appdesign,mobiledesign,dailyui,userinterfacedesign,post_weekday,post_time,post_dominant_colors
0,Python â¢\nTag Your Programmer groups â¢\nâ...,media/posts/201912/69503764_1316105261901993_3...,2019-12-03 21:15:34,19,0,0,0,0,0,1,21.250000,black
1,@multiverseapp Follow us @multiverseapp for mo...,media/posts/201912/79984281_434751277472386_56...,2019-12-02 22:33:00,19,1,1,1,0,1,0,22.550000,video
2,Beautiful Onboarding animation concept of a 'I...,media/posts/201912/FQIoAkMzLBdAG3bItDlYEBgSZGF...,2019-12-02 22:30:26,30,0,0,0,0,0,0,22.500000,video
3,@multiverseapp Follow @multiverseapp for daily...,media/posts/201912/79862581_805719879875319_32...,2019-12-02 22:28:03,19,0,0,0,0,0,0,22.466667,video
4,@welovewebdesign by Miroslav SkoÄdopoly\nFoll...,media/posts/201912/78766756_198610067846038_40...,2019-12-02 22:04:20,18,0,0,0,0,0,0,22.066667,darkslategrey
...,...,...,...,...,...,...,...,...,...,...,...,...
535,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/147398703_249918756774677_6...,2021-02-06 21:00:30,24,1,1,1,1,1,5,21.000000,mistyrose
536,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145893246_179494400637566_8...,2021-02-05 21:01:06,18,1,1,1,1,1,4,21.016667,bisque
537,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145403028_1579481252439746_...,2021-02-04 21:00:37,13,1,1,1,1,1,3,21.000000,bisque
538,"ð¥ We are open for Design work!\nð© DM us,...",media/posts/202102/145495364_846748032724421_7...,2021-02-03 21:00:35,10,1,1,1,1,1,2,21.000000,linen


## Drop unwanted columns


In [202]:
if df.__contains__('index'):
    df.reset_index(drop=True, inplace=True)
if df.__contains__('title'):
    df.drop(columns=['title'], inplace=True)
if df.__contains__('creation_timestamp'):
    df.drop(columns=['creation_timestamp'], inplace=True)
if df.__contains__('post_media'):
    df.drop(columns=['post_media'], inplace=True)
if df.__contains__('hashtags'):
    df.drop(columns=['hashtags'], inplace=True)
df


Unnamed: 0,post_likes,uitrends,appdesign,mobiledesign,dailyui,userinterfacedesign,post_weekday,post_time,post_dominant_colors
0,19,0,0,0,0,0,1,21.250000,black
1,19,1,1,1,0,1,0,22.550000,video
2,30,0,0,0,0,0,0,22.500000,video
3,19,0,0,0,0,0,0,22.466667,video
4,18,0,0,0,0,0,0,22.066667,darkslategrey
...,...,...,...,...,...,...,...,...,...
535,24,1,1,1,1,1,5,21.000000,mistyrose
536,18,1,1,1,1,1,4,21.016667,bisque
537,13,1,1,1,1,1,3,21.000000,bisque
538,10,1,1,1,1,1,2,21.000000,linen


## Save this clean Dataset

In [203]:
df.to_csv(parent_folder_path+file_name+'_clean.csv')


## Visualise the most liked Data

From the entire dataset, show the charecteristics of the top n most liked posts

we will use _sweetviz_ to visualize the data


In [204]:
import sweetviz as sv

# Creates the graph of the top n (index_to_truncate_from) most liked posts


def visualize_top_n_most(index_to_truncate_from):
    df_report = df.sort_values(
        by='post_likes', ascending=False).head(index_to_truncate_from)
    sv.analyze(df_report).show_html()


# Creates the report graph of the top 10 most liked posts
visualize_top_n_most(10)


Done! Use 'show' commands to display/save.   |██████████| [100%]   00:01 -> (00:00 left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.





## Encode categorical data


In [205]:
from sklearn.preprocessing import LabelEncoder

le_post_dominant_colors = LabelEncoder()
df['post_dominant_colors'] = le_post_dominant_colors.fit_transform(
    df['post_dominant_colors'])

df

Unnamed: 0,post_likes,uitrends,appdesign,mobiledesign,dailyui,userinterfacedesign,post_weekday,post_time,post_dominant_colors
0,19,0,0,0,0,0,1,21.250000,4
1,19,1,1,1,0,1,0,22.550000,48
2,30,0,0,0,0,0,0,22.500000,48
3,19,0,0,0,0,0,0,22.466667,48
4,18,0,0,0,0,0,0,22.066667,10
...,...,...,...,...,...,...,...,...,...
535,24,1,1,1,1,1,5,21.000000,30
536,18,1,1,1,1,1,4,21.016667,3
537,13,1,1,1,1,1,3,21.000000,3
538,10,1,1,1,1,1,2,21.000000,24


## Create Training data sets


In [206]:
from sklearn.model_selection import train_test_split

# Let's extract features data assigning to X and labels data assigning to:
X = pd.DataFrame(df.values, columns=df.columns)
y = pd.DataFrame(df['post_likes'], columns=["post_likes"])

(X_train, X_tmp, y_train, y_tmp) = train_test_split(
    X, y, train_size=0.7, random_state=1)
(X_test, X_val, y_test, y_val) = train_test_split(
    X_tmp, y_tmp, train_size=0.6, random_state=1)


print('Training data shape: ', X_train.shape)
print('Validation data shape: ', X_val.shape)
print('Testing data shape: ', X_test.shape)


Training data shape:  (378, 9)
Validation data shape:  (65, 9)
Testing data shape:  (97, 9)


## Train Decision Tree


In [207]:
from IPython.display import Image
from subprocess import call
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier

# Defining and fitting a DecisionTreeClassifier instance
tree1 = DecisionTreeClassifier()
tree1.fit(X_train, y_train)


DecisionTreeClassifier()

## Validate the Decision tee


In [208]:
tree1.score(X_val, y_val)


0.8615384615384616

## Test the Decision tee


In [211]:
tree1.score(X_test, y_test)


0.8865979381443299

## Make a Prediction

Now that we know how our decision tree works, let us make predictions.


In [210]:
#sample_one_pred_tree1 = int(tree1.predict([[5, 5, 1, 3]]))