In [1]:
# Mount Google Drive inside the Colab runtime so the post files can be read.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
# Root folder on the mounted drive; the extraction cell lists its sub-folders
# and reads every file inside them.
FOLDER_NAME = '/content/drive/My Drive/IndianData/Posts/'

Mounted at /content/drive


In [None]:
# Extracting files from JSON

import os
import json

# NOTE(review): these two counters are never updated or read in the visible
# cells; kept in case another cell relies on them.
countF = 0
countI = 0

# Despite the name, this list collects the posts whose platform is "Facebook".
InstaPostList = []

for folder in os.listdir(FOLDER_NAME):
    if folder == ".ipynb_checkpoints":
        continue
    # os.path.join works whether or not FOLDER_NAME ends with a slash.
    foldername = os.path.join(FOLDER_NAME, folder)
    print(foldername)
    for file in os.listdir(foldername):
        if file == ".ipynb_checkpoints":
            continue
        # Each file holds one JSON document per line.
        with open(os.path.join(foldername, file), "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing in json.loads
                record = json.loads(line)
                try:
                    batch = record['result']['posts']
                except (KeyError, TypeError):
                    # Line without a result/posts payload: nothing to collect.
                    continue
                if not isinstance(batch, list):
                    continue
                for postF in batch:
                    # Check each post on its own so that one entry without a
                    # 'platform' key no longer discards the rest of the batch.
                    if isinstance(postF, dict) and postF.get('platform') == "Facebook":
                        InstaPostList.append(postF)

In [None]:
# Removing duplicate posts

# Keep only the first post seen for every platformId.
IdDict = set()
newList = []

for post in InstaPostList:
    postId = post['platformId']
    if postId not in IdDict:
        newList.append(post)
        IdDict.add(postId)

# The de-duplicated list replaces the original one for all later cells.
InstaPostList = newList

In [None]:
# Monthly Post analysis
from datetime import datetime

# Maps "<year>-<month>-01" (month not zero-padded) to the number of posts
# dated in that month.
MonthlyPost = dict()

for post in InstaPostList:
    posted = datetime.strptime(post['date'], "%Y-%m-%d %H:%M:%S")
    month = "{}-{}-01".format(posted.year, posted.month)
    # Start from 0 and always add one, so the first post of a month is counted
    # too (previously every month was under-counted by one).
    MonthlyPost[month] = MonthlyPost.get(month, 0) + 1

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")

In [None]:
# One row per month bucket: the bucket key parsed back into a datetime, plus
# the number of posts counted for it.
monthStarts = [datetime.strptime(bucket, "%Y-%m-%d") for bucket in MonthlyPost]
postTotals = list(MonthlyPost.values())
MonthlyPostDF = pd.DataFrame({"Month": monthStarts, "Number of Posts": postTotals})

plt.figure(figsize=(15, 5))
sns.lineplot(data=MonthlyPostDF, x="Month", y="Number of Posts")

In [None]:
# Popular accounts/pages

# Per account name: the largest subscriber count seen on any of its posts and
# the number of posts it contributed.
Accounts = dict()

for post in InstaPostList:
    accountName = post['account']['name']
    subscribers = post['account']['subscriberCount']
    entry = Accounts.setdefault(accountName, {
        "subscriberCount" : subscribers,
        "postCount" : 0
    })
    entry['subscriberCount'] = max(entry['subscriberCount'], subscribers)
    entry['postCount'] += 1

In [None]:
# The 15 accounts with the highest subscriber count, largest first.
rankedBySubscribers = sorted(Accounts.items(), key=lambda item: item[1]['subscriberCount'], reverse=True)
AccountsSubs = dict(rankedBySubscribers[:15])

for accn, info in AccountsSubs.items():
    print("Account - {}, Subscriber Count - {}, Post Count - {}".format(accn, info['subscriberCount'], info['postCount']))

In [None]:
# The 15 accounts with the most posts, most active first.
rankedByPosts = sorted(Accounts.items(), key=lambda item: item[1]['postCount'], reverse=True)
AccountsPosts = dict(rankedByPosts[:15])

for accn, info in AccountsPosts.items():
    print("Account - {}, Subscriber Count - {}, Post Count - {}".format(accn, info['subscriberCount'], info['postCount']))

In [None]:
# Overall word analysis

from gensim.parsing.preprocessing import remove_stopwords
import re

# Word -> number of occurrences across all post texts.
wordCount = dict()

for post in InstaPostList:
    # Use the message when the post has one, otherwise the description;
    # posts with neither are skipped.
    if 'message' in post:
        text = post['message']
    elif 'description' in post:
        text = post['description']
    else:
        continue
    # Drop URLs, flatten newlines/tabs, lower-case, then turn every character
    # that is neither a word character nor whitespace into a space.
    text = re.sub(r'http\S+', '', text)
    text = text.replace('\n', " ").replace('\t', " ").lower()
    cleaned = remove_stopwords(re.sub(r'[^\w\s]', ' ', text))
    for token in cleaned.split():
        # Single-character tokens are ignored.
        if len(token) > 1:
            wordCount[token] = wordCount.get(token, 0) + 1

In [None]:
# Words ordered from most to least frequent.
wordCountSorted = sorted(wordCount.keys(), key=lambda token: wordCount[token], reverse=True)

In [None]:
# Show the 15 most frequent words.
wordCountSorted[:15]

In [None]:
# Word cloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Rebuild a text in which every word appears as many times as it was counted
# (each occurrence followed by a space) and let WordCloud lay it out.
txt = "".join((word + " ") * occurrences for word, occurrences in wordCount.items())

plt.figure(figsize = (5,5))
cloudMaker = WordCloud(collocations = False, background_color = 'white', max_words=100)
word_cloud = cloudMaker.generate(txt)
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Hashtag frequency
import re

# Hashtag (lower-cased, including the leading '#') -> number of occurrences.
all_hashtags = dict()
# Raw string, and no trailing space required: the text is stripped before
# matching, so the old pattern "#\w+ " could never match the last hashtag of a
# post, nor one followed by punctuation or a newline.
hashtag_re = re.compile(r"#\w+")

for post in InstaPostList:
    # Use the message when the post has one, otherwise the description.
    if 'message' in post:
        line = post['message']
    elif 'description' in post:
        line = post['description']
    else:
        continue
    if not isinstance(line, str):
        continue  # e.g. a null text field: nothing to scan
    for hashtag in hashtag_re.findall(line.strip().lower()):
        # Ignore '#' followed by a single character.
        if len(hashtag) < 3:
            continue
        all_hashtags[hashtag] = all_hashtags.get(hashtag, 0) + 1

In [None]:
# Re-create the mapping ordered from the most to the least used hashtag.
all_hashtags = dict(sorted(all_hashtags.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Display the hashtag counts.
# NOTE(review): this shows the whole mapping, which may be very long.
all_hashtags

In [None]:
# URL frequency: original URL -> number of times it appears in the
# 'expandedLinks' entries of the posts.
URLs = dict()

for post in InstaPostList:
    try:
        # Iterate the links directly instead of rebinding the loop variable `post`.
        for link in post['expandedLinks']:
            url = link['original']
            URLs[url] = URLs.get(url, 0) + 1
    except (KeyError, TypeError):
        # Post without links, or a malformed link entry: skip what is left of
        # this post. Other exceptions (e.g. KeyboardInterrupt) now propagate.
        continue

In [None]:
# Re-create the mapping ordered from the most to the least shared URL.
URLs = dict(sorted(URLs.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Display the URL counts.
# NOTE(review): this shows the whole mapping, which may be very long.
URLs

In [None]:
# Reach analysis: summary statistics of the interaction counters stored under
# post['statistics']['actual'].
import statistics

shareCount = []
CommentCount = []
loveCount = []
wowCount = []
hahaCount = []
sadCount = []
angryCount = []
thankfulCount = []
careCount = []

# (label used in the printout, key under post['statistics']['actual'], target list).
# Keeping label, key and list together prevents the label/list mix-ups the
# hand-written print lines had (e.g. "Sad count" printing the haha values).
reachMetrics = [
    ("Share", 'shareCount', shareCount),
    ("Comment", 'commentCount', CommentCount),
    ("Love", 'loveCount', loveCount),
    ("Wow", 'wowCount', wowCount),
    ("Haha", 'hahaCount', hahaCount),
    ("Sad", 'sadCount', sadCount),
    ("Angry", 'angryCount', angryCount),
    ("Thankful", 'thankfulCount', thankfulCount),
    ("Care", 'careCount', careCount),
]

for post in InstaPostList:
    try:
        actual = post['statistics']['actual']
    except (KeyError, TypeError):
        continue
    if not isinstance(actual, dict):
        continue
    # Each metric is collected independently, so a post missing one counter
    # still contributes its other counters (previously the first missing key
    # silently dropped every counter listed after it).
    for metricLabel, metricKey, metricValues in reachMetrics:
        if metricKey in actual:
            metricValues.append(actual[metricKey])

for metricLabel, metricKey, metricValues in reachMetrics:
    if not metricValues:
        print("{} count - no data".format(metricLabel))
        continue
    # statistics.stdev needs at least two data points.
    spread = statistics.stdev(metricValues) if len(metricValues) > 1 else 0.0
    print("{} count - Mean = {}, Median = {}, Standard Deviation = {}, Max = {}".format(
        metricLabel, statistics.mean(metricValues), statistics.median(metricValues), spread, max(metricValues)))

In [None]:
# Word vector analysis

from gensim.models import Word2Vec
import os
import json
import re
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import remove_stopwords
import nltk

nltk.download('punkt')

In [None]:
# One token list per post; this is the training corpus for Word2Vec.
sentences = []

for post in InstaPostList:
    # Use the message when the post has one, otherwise the description;
    # posts with neither are skipped.
    if 'message' in post:
        text = post['message']
    elif 'description' in post:
        text = post['description']
    else:
        continue
    # Same cleaning as the word-count cell: drop URLs, flatten newlines/tabs,
    # lower-case, replace punctuation with spaces, remove stop words.
    text = re.sub(r'http\S+', '', text)
    text = text.replace('\n', " ").replace('\t', " ").lower()
    cleaned = remove_stopwords(re.sub(r'[^\w\s]', ' ', text))
    # Single-character tokens are dropped; a post may yield an empty list.
    sentences.append([token for token in cleaned.split() if len(token) > 1])

In [None]:
# Train word vectors on the per-post token lists built in the previous cell.
# NOTE(review): training with workers=4 may not be reproducible from run to
# run — confirm whether that matters for the similarity results below.
model = Word2Vec(sentences=sentences, window=5, min_count=1, workers=4)

In [None]:
# Seed words: used for the nearest-neighbour printout and as the starting
# nodes of the similarity graph in the cells below.
IntialWordList = ["asian", "virus", "coronavirus", "chinese"]

In [None]:
# For every seed word, print its ten nearest neighbours in the embedding.
separator = "----------------------------------------------------"
for seedWord in IntialWordList:
    sims = model.wv.most_similar(seedWord, topn=10)
    print(separator)
    print("Word similar to {}: ".format(seedWord))
    for neighbour, score in sims:
        print("Word - {}, Similarity score - {}".format(neighbour, score))
    print(separator)

In [None]:
import networkx as nx
from numpy import dot
from numpy.linalg import norm
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain

In [None]:
# NOTE(review): the cell before this one already imports
# `community.community_louvain`; on a fresh runtime that import runs before
# this install — confirm the package is preinstalled or move this cell up.
!pip3 install python-louvain

In [None]:
# NOTE(review): numLevel is not read by any visible cell; makeGraph below has
# its own default of lev=2.
numLevel = 2
# Minimum similarity score for a neighbour to be kept by makeGraph.
thres = 0.5

# Graph that makeGraph fills in place.
G = nx.Graph()

def makeGraph(words, lev=2, graph=None, threshold=None, similar=None):
    """Grow a word-similarity graph outwards from ``words``.

    Every word not yet in the graph is added as a node. While ``lev`` is
    positive, the word's neighbours scoring above ``threshold`` are added
    recursively with ``lev - 1`` and then linked to the word by an edge
    weighted with the similarity score. Words reached with ``lev == 0``
    become leaf nodes; no neighbour query is issued for them, because its
    result was never used.

    Args:
        words: iterable of words to add.
        lev: number of neighbour levels still to expand.
        graph: graph to fill; defaults to the notebook-level ``G``.
        threshold: minimum similarity (exclusive); defaults to the
            notebook-level ``thres``.
        similar: callable mapping a word to ``(neighbour, score)`` pairs;
            defaults to ``model.wv.most_similar(word, topn=10)``.

    Returns:
        None. The graph is modified in place.
    """
    if graph is None:
        graph = G
    if threshold is None:
        threshold = thres
    if similar is None:
        similar = lambda query: model.wv.most_similar(query, topn=10)

    for word in words:
        # A word already in the graph is not expanded again, even if it was
        # first reached as a leaf.
        if word in graph.nodes():
            continue
        graph.add_node(word)
        if lev <= 0:
            continue
        strong = [(other, score) for other, score in similar(word) if score > threshold]
        # Add the neighbours first so the edges below join existing nodes.
        makeGraph([other for other, _score in strong], lev - 1, graph, threshold, similar)
        for other, score in strong:
            graph.add_edge(word, other, weight=score)
    return

In [None]:
# Build the similarity graph from the seed words (default depth lev=2).
makeGraph(IntialWordList)
# G was created with nx.Graph(), so this rebinds G to the result of
# to_undirected() on an already undirected graph.
G = G.to_undirected()
# Community assignment for the nodes; read below through .keys()/.values().
# NOTE(review): no seed is passed, so the partition may differ between runs.
partitionLev2 = community_louvain.best_partition(G)

In [None]:
# Relative node sizes: each node's word count divided by the mean word count
# of all nodes, in G.nodes() order.
nodeWordCounts = [wordCount[node] for node in G.nodes()]
mns = statistics.mean(nodeWordCounts)
ns = [count / mns for count in nodeWordCounts]

In [None]:
# Draw the word-similarity graph: node size follows relative word frequency,
# node colour follows the community found by the Louvain partition.
pos = nx.spring_layout(G)
plt.figure(figsize = (25,25))
# `ns` was computed in G.nodes() order, so the node list and the colours are
# built in that same order instead of relying on the partition dict's order.
nodeOrder = list(G.nodes())
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib
# releases deprecated and then removed.
cmap = plt.get_cmap('viridis', max(partitionLev2.values()) + 1)
nx.draw_networkx_nodes(G, pos, nodelist=nodeOrder, node_size=[100*x for x in ns], cmap=cmap,
                       node_color=[partitionLev2[node] for node in nodeOrder])
nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10, horizontalalignment="right")
plt.show()

In [None]:
# Topic modelling

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Set copy for the per-token membership test below; scanning the list for
# every token is needlessly slow.
stop_word_set = set(stop_words)


# One token list per post; input for the dictionary/corpus built next.
data_words = []
for post in InstaPostList:
    # Use the message when the post has one, otherwise the description;
    # posts with neither are skipped.
    if 'message' in post:
        text = post['message']
    elif 'description' in post:
        text = post['description']
    else:
        continue
    # Drop URLs, replace punctuation with spaces, remove gensim's stop words,
    # tokenize, then remove the NLTK/custom stop words as well.
    text = re.sub(r'http\S+', '', text)
    res = re.sub(r'[^\w\s]', ' ', text)
    res = remove_stopwords(res)
    data_word = [word for word in simple_preprocess(res) if word not in stop_word_set]
    data_words.append(data_word)

In [None]:
import gensim.corpora as corpora

# Dictionary built from the per-post token lists.
id2word = corpora.Dictionary(data_words)
texts = data_words
# Bag-of-words representation of every post, produced by id2word.doc2bow.
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
from pprint import pprint
import warnings
# number of topics
num_topics = 10
# Seed for the model's random initialisation, so a re-run starts from the
# same state instead of producing an unrelated set of topics.
# NOTE(review): multi-process training may still introduce small differences.
LDA_RANDOM_STATE = 42
# Build LDA model
# NOTE(review): this silences every warning for the rest of the session.
warnings.simplefilter("ignore")
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                    id2word=id2word,
                                    num_topics=num_topics,
                                    random_state=LDA_RANDOM_STATE)

In [None]:
# Print the keywords of the topics found by the LDA model.
pprint(lda_model.print_topics())

In [None]:
# Hashtag co-occurrence analysis

# Hashtags used more than 200 times, in the order of all_hashtags.
currhashs = [tag for tag, total in all_hashtags.items() if total > 200]

# CorrCount[a][b] starts at 0 for every ordered pair of distinct frequent hashtags.
CorrCount = {
    tag: {other: 0 for other in currhashs if other != tag}
    for tag in currhashs
}

In [None]:
# Raw string, and no trailing space required: the text is stripped before
# matching, so the old pattern "#\w+ " could never match the last hashtag of a
# post, nor one followed by punctuation or a newline.
hashtag_re = re.compile(r"#\w+")

# NOTE: the counts are added to the existing CorrCount, so re-running this
# cell without re-running the initialisation cell doubles them.
for post in InstaPostList:
    # Use the message when the post has one, otherwise the description.
    if 'message' in post:
        line = post['message']
    elif 'description' in post:
        line = post['description']
    else:
        continue
    if not isinstance(line, str):
        continue  # e.g. a null text field: nothing to scan
    # Build a filtered list instead of removing items from the list being
    # iterated, which skipped elements. Only hashtags tracked in CorrCount
    # (its keys are the frequent hashtags) can form a counted pair.
    hashtags = [tag for tag in hashtag_re.findall(line.strip().lower())
                if len(tag) >= 3 and tag in CorrCount]
    # Every ordered pair of distinct hashtags in the post is counted once per
    # occurrence, as before.
    for hashtag1 in hashtags:
        for hashtag2 in hashtags:
            if hashtag1 != hashtag2:
                CorrCount[hashtag1][hashtag2] += 1

In [None]:
import networkx as nx
from numpy import dot
from numpy.linalg import norm
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain

In [None]:
# Fresh graph for the hashtag analysis (replaces the word graph held in G).
G = nx.Graph()

def makeGraph():
    """Fill ``G`` with the frequent hashtags and their co-occurrence edges.

    Every hashtag in ``currhashs`` becomes a node. Two different hashtags are
    joined when their co-occurrence count is non-zero; the edge weight is that
    count divided by the product of the two hashtags' total counts.

    NOTE: this definition replaces the earlier ``makeGraph(words, lev)``.
    """
    G.add_nodes_from(currhashs)
    for first in currhashs:
        for second in currhashs:
            if first == second:
                continue
            together = CorrCount[first][second]
            if together == 0:
                continue
            G.add_edge(first, second, weight=together / (all_hashtags[first] * all_hashtags[second]))
    return

In [None]:
# Build the hashtag co-occurrence graph in G.
makeGraph()
# G was created with nx.Graph(), so this rebinds G to the result of
# to_undirected() on an already undirected graph.
G = G.to_undirected()
# Community assignment for the hashtags; read below through .keys()/.values().
# NOTE(review): no seed is passed, so the partition may differ between runs.
partitionLev2 = community_louvain.best_partition(G)

In [None]:
import statistics
import warnings

warnings.simplefilter("ignore")

# Relative node sizes: each hashtag's total count divided by the mean count of
# all hashtags in the graph, in G.nodes() order.
hashtagTotals = [all_hashtags[node] for node in G.nodes()]
mns = statistics.mean(hashtagTotals)
ns = [total / mns for total in hashtagTotals]

In [None]:
# Draw the hashtag graph: node size follows relative hashtag frequency, node
# colour follows the community found by the Louvain partition.
pos = nx.spring_layout(G)
plt.figure(figsize = (30,30))
# `ns` was computed in G.nodes() order, so the node list and the colours are
# built in that same order instead of relying on the partition dict's order.
nodeOrder = list(G.nodes())
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib
# releases deprecated and then removed.
cmap = plt.get_cmap('viridis', max(partitionLev2.values()) + 1)
nx.draw_networkx_nodes(G, pos, nodelist=nodeOrder, node_size=[100*x for x in ns], cmap=cmap,
                       node_color=[partitionLev2[node] for node in nodeOrder])
# Edges are intentionally not drawn in this figure:
# nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10, horizontalalignment="right")
plt.show()

In [None]:
# Post language analysis

# Language code -> number of posts carrying that code.
languageCount = dict()

for post in InstaPostList:
    langCode = post["languageCode"]
    languageCount[langCode] = languageCount.get(langCode, 0) + 1

In [None]:
# Re-create the mapping ordered from the most to the least common language.
languageCount = dict(sorted(languageCount.items(), key=lambda pair: pair[1], reverse=True))

languageCount