# DMG Mori Sentiment Analysis

## Imports

In [53]:
import json
import boto3 
import numpy as np
import pandas as pd
import math
import torch
import tweetnlp
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import plotly.express as px

## util functions

In [152]:
def connect_to_s3(access_key: str, secret_key: str, region: str):
    print("connecting to S3")
    return boto3.resource(
            service_name='s3',
            region_name=region,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key
    )

    
def read_json(path_to_file: str):
    with open(path_to_file) as f:
        data = json.load(f)
    return data

def read_dataframe_from_s3(bucket_name: str, file_name: str):
    s3 = connect_to_s3(access_key=access_key, 
                       secret_key=secret_key,
                       region=region)
    try:
        s3_object_response = s3.Bucket(bucket_name).Object(file_name).get()
        print(f"Succesfully read {filename} from S3")
        return pd.read_csv(s3_object_response["Body"], index_col=0)
    except:
        print("Error: check connection or file name")

def upload_dataframe_to_s3(df, bucket_name, access_key, secret_key, folder_name, file_name):
    df.to_csv(file_name)
    s3 = connect_to_s3(access_key=access_key, 
                           secret_key=secret_key,
                           region=region) 
    try:
        s3.Bucket(bucket_name).upload_file(Filename=file_name, Key=(folder_name+file_name))
        print("Successfully uploaded!!")
    except:
        print("Failed to upload file!!")

## Secret keys

In [3]:
secret = read_json(path_to_file="../secret.json")
access_key = secret["access_key"]
secret_key = secret["secret_key"]
bucket_name = secret["bucket_name"]
region = secret["region"]

## Datasets

In [4]:
linkedin_df_url = "https://phantombuster.s3.amazonaws.com/UhrenaxfEnY/ufbCwx76csm5U7tZEWDzTg/dmg_mori_linkedin_comments.csv"
facebook_df_url = "dmg_mori_facebook_comments.csv"
insta_df_url = "https://phantombuster.s3.amazonaws.com/UhrenaxfEnY/mVSiBkjqgmmHPyNuoDJMzQ/mori_instagram_comments.csv"

In [9]:
s3 = connect_to_s3(access_key=access_key, 
                   secret_key=secret_key,
                   region=region)

facebook_df = read_dataframe_from_s3(bucket_name=bucket_name, file_name=facebook_df_url)
linkedin_df = pd.read_csv(linkedin_df_url)
insta_df = pd.read_csv(insta_df_url)
facebook_df = facebook_df.rename(columns={"text": "comment"})

### Handling missing values

In [19]:
dataset_list = [facebook_df, linkedin_df, insta_df]
for dataset in dataset_list:
    print(f"dataset size:{len(dataset)}, missing comments:{dataset['comment'].isnull().sum()}")

dataset size:600, missing comments:55
dataset size:1249, missing comments:17
dataset size:1083, missing comments:6


In [28]:
facebook_df = facebook_df.dropna(subset=["comment"])
linkedin_df = linkedin_df.dropna(subset=["comment"])
insta_df = insta_df.dropna(subset=["comment"])

### Filtering data for the year 2023

In [35]:
facebook_df.columns

Index(['postDescription', 'comment', 'likesCount', 'facebookUrl', 'sentiment'], dtype='object')

In [36]:
linkedin_df.columns

Index(['profileLink', 'firstName', 'lastName', 'fullName', 'occupation',
       'degree', 'comment', 'commentUrl', 'isFromPostAuthor', 'commentDate',
       'likesCount', 'postUrl', 'timestamp', 'companyUrl', 'companyName',
       'followersCount', 'error', 'sentiment'],
      dtype='object')

In [37]:
linkedin_df["commentDate"] = pd.to_datetime(linkedin_df["commentDate"])
linkedin_df = linkedin_df[linkedin_df["commentDate"] >= "2023-01-01"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linkedin_df["commentDate"] = pd.to_datetime(linkedin_df["commentDate"])


In [39]:
insta_df["commentDate"] = pd.to_datetime(insta_df["commentDate"])
insta_df = insta_df[insta_df["commentDate"] >= "2023-01-01"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  insta_df["commentDate"] = pd.to_datetime(insta_df["commentDate"])


## Initialize Model

In [40]:
model_name = f"cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Sentiment Analysis

In [47]:
def get_sentiments(text):
    try: 
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        sentiment = config.id2label[ranking[0]]
        return sentiment
    except:
        return "neutral"

In [48]:
facebook_df["sentiment"] = facebook_df["comment"].apply(get_sentiments)
linkedin_df["sentiment"] = linkedin_df["comment"].apply(get_sentiments)
insta_df["sentiment"] = insta_df["comment"].apply(get_sentiments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linkedin_df["sentiment"] = linkedin_df["comment"].apply(get_sentiments)


### Topic analysis

In [66]:
def sigmoid(x):
  return 1 / (1 + math.exp(-x))

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all", problem_type="multi_label_classification")
model.eval()
class_mapping = model.config.id2label

Downloading tokenizer_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 354/354 [00:00<00:00, 291kB/s]
Downloading vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 780k/780k [00:00<00:00, 2.83MB/s]
Downloading merges.txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 446k/446k [00:00<00:00, 1.99MB/s]
Downloading tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.29M/1.29M [00:00<00:00, 3.19MB/s]
Downloading special_tokens_map.json: 100%|██████████████████████████████████████████████████

['film_tv_&_video', 'music']


In [102]:
def get_topic(text: str):
    try:
        with torch.no_grad():
            tokens = tokenizer(text, return_tensors='pt')
            output = model(**tokens)
            topic_probs = [sigmoid(x) for x in output[0][0].detach().tolist()]
        return class_mapping[np.argmax(topic_probs)]
    except:
        print(text)
        return "others"

In [103]:
facebook_df["topic"] = facebook_df["comment"].apply(get_topic)
linkedin_df["topic"] = linkedin_df["comment"].apply(get_topic)
insta_df["topic"] = insta_df["comment"].apply(get_topic)

FB-Elaine Hsieh 是私人帳號, 如果您想要在我的帳號貼文, 請您依照正常管道進行貼文(1). 請您進入 www.elainehsieh.tw “會員系統”填寫您的會員資料. (2). 請您進入 “訂單填寫” (3). 請您使用信用卡-聯銀卡繳款或是 ATM/銀行 轉帳 (4).當我們收到您的訊息之後, 我們會立刻將您的貼文轉貼至您指定的粉絲群-限時動態/Reels (5). 一旦我們作業完成, 我們會使用 Messenger/ 留言/Email 通知您.這樣就算是完成作業!

FB-Elaine Hsieh is a private account, if you want to post on my account, please follow the normal channels to post (1). Please visit ww.elainehsieh.tw "Member System" to fill in your membership information. (2). Please go to "Order Filling" (3). Please use credit card - Union Bank card payment or ATM/bank transfer (4).When we receive your message, we will immediately post your post into your designated base - Limited Time News/Reels (5). Once our assignment is complete, we will notify you using Messenger/Message/Email. This is complete work.


## sentiment distribution

In [56]:
facebook_df.groupby('sentiment').size()

sentiment
negative     11
neutral     296
positive    238
dtype: int64

In [57]:
linkedin_df.groupby('sentiment').size()

sentiment
negative     30
neutral     422
positive    600
dtype: int64

In [58]:
insta_df.groupby('sentiment').size()

sentiment
negative     62
neutral     272
positive    743
dtype: int64

## Topic Distribution

In [147]:
facebook_df.groupby(['sentiment', 'topic']).size()

sentiment  topic                   
negative   business_&_entrepreneurs      3
           diaries_&_daily_life          1
           news_&_social_concern         1
           science_&_technology          5
           sports                        1
neutral    business_&_entrepreneurs     22
           celebrity_&_pop_culture      25
           diaries_&_daily_life        113
           fashion_&_style               4
           film_tv_&_video              16
           fitness_&_health              1
           food_&_dining                 2
           gaming                        2
           learning_&_educational        2
           music                        17
           news_&_social_concern        23
           other_hobbies                 9
           others                        1
           science_&_technology         17
           sports                       42
positive   arts_&_culture                1
           business_&_entrepreneurs     10
           celebri

In [148]:
linkedin_df.groupby(['sentiment', 'topic']).size()

sentiment  topic                   
negative   business_&_entrepreneurs      2
           diaries_&_daily_life          9
           film_tv_&_video               2
           fitness_&_health              4
           news_&_social_concern         2
           science_&_technology          7
           sports                        4
neutral    arts_&_culture                1
           business_&_entrepreneurs     32
           celebrity_&_pop_culture      19
           diaries_&_daily_life        124
           fashion_&_style               3
           film_tv_&_video               9
           fitness_&_health              6
           food_&_dining                 5
           gaming                       13
           music                        10
           news_&_social_concern        21
           other_hobbies                 8
           science_&_technology        116
           sports                       54
           travel_&_adventure            1
positive   arts_&_

In [149]:
insta_df.groupby(['sentiment', 'topic']).size()

sentiment  topic                   
negative   business_&_entrepreneurs      4
           celebrity_&_pop_culture       2
           diaries_&_daily_life         10
           gaming                        1
           news_&_social_concern        34
           science_&_technology          9
           sports                        2
neutral    arts_&_culture                1
           business_&_entrepreneurs     21
           celebrity_&_pop_culture      12
           diaries_&_daily_life         90
           fashion_&_style               1
           film_tv_&_video               9
           fitness_&_health              2
           food_&_dining                 3
           gaming                       12
           music                         8
           news_&_social_concern        22
           other_hobbies                 5
           science_&_technology         49
           sports                       37
positive   arts_&_culture                1
           busines

## Save the dataframes

In [151]:
help(upload_dataframe_to_s3)

Help on function upload_dataframe_to_s3 in module __main__:

upload_dataframe_to_s3(df, bucket_name, folder_name, file_name)



In [153]:
folder_name = "output/"
facebook_file_name = "dmg_mori_facbook_2023_sentiments.csv"
linkedin_file_name = "dmg_mori_linkedin_2023_sentiments.csv"
insta_file_name = "dmg_mori_insta_2023_sentiments.csv"

upload_dataframe_to_s3(df=facebook_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=facebook_file_name, folder_name=folder_name)
upload_dataframe_to_s3(df=linkedin_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=linkedin_file_name, folder_name=folder_name)
upload_dataframe_to_s3(df=insta_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=insta_file_name, folder_name=folder_name)


invalid value encountered in cast



connecting to S3
Successfully uploaded!!
connecting to S3
Successfully uploaded!!
connecting to S3
Successfully uploaded!!
