In [110]:
import sys

sys.path.append("../")

# DMG Mori Sentiment Analysis

## Imports

In [111]:
from src.data_management import DataManagment
from src.model.sentiment_model import SentimentModel

In [112]:
data_manager = DataManagment()

Created a connection to S3


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Datasets

In [113]:
folder_name = "raw_data"
file_facebook_comments = "dmg_mori_facebook_comments.csv"
file_linkedin_df_comments = "dmg_mori_linkedin_comments.csv"
file_insta_comments = "mori_instagram_comments.csv"
file_facebook_mentions = "DMG_mori_facebook_mentions.csv"
file_instagram_mentions = "DMG_mori_insta_mentions.csv"

In [114]:
facebook_comments_df = data_manager.read_dataframe_from_s3(file_name=file_facebook_comments, folder_name=folder_name)
insta_comments_df = data_manager.read_dataframe_from_s3(file_name=file_insta_comments, folder_name=folder_name)
linkedin_comments_df = data_manager.read_dataframe_from_s3(file_name=file_linkedin_df_comments, folder_name=folder_name)
facebook_mentions_df = data_manager.read_dataframe_from_s3(file_name=file_facebook_mentions, folder_name=folder_name)
instagram_mentions_df = data_manager.read_dataframe_from_s3(file_name=file_instagram_mentions, folder_name=folder_name)

Succesfully read dmg_mori_facebook_comments.csv from S3
Succesfully read mori_instagram_comments.csv from S3
Succesfully read dmg_mori_linkedin_comments.csv from S3
Succesfully read DMG_mori_facebook_mentions.csv from S3
Succesfully read DMG_mori_insta_mentions.csv from S3


## Preprocessing the Dataset

### Renaming the columns

In [118]:
print("Facebook comments: ", facebook_comments_df.columns)
print("Instagram comments: ", insta_comments_df.columns)
print("LinkedIn comments: ", linkedin_comments_df.columns)
print("Facebook mentions: ", facebook_mentions_df.columns)
print("Instagram mentions: ", instagram_mentions_df.columns)

Facebook comments:  Index(['postDescription', 'text', 'likesCount', 'facebookUrl'], dtype='object')
Instagram comments:  Index(['error', 'timestamp', 'profilePictureUrl', 'username', 'profileUrl',
       'comment', 'likeCount', 'replyCount', 'commentDate', 'commentId',
       'ownerId'],
      dtype='object')
LinkedIn comments:  Index(['firstName', 'lastName', 'fullName', 'occupation', 'degree', 'comment',
       'commentUrl', 'isFromPostAuthor', 'commentDate', 'likesCount',
       'postUrl', 'timestamp', 'companyUrl', 'companyName', 'followersCount',
       'error'],
      dtype='object')
Facebook mentions:  Index(['Page Admin Top Country', 'Post Created', 'Type', 'Total Interactions',
       'Post_content'],
      dtype='object')
Instagram mentions:  Index(['Followers at Posting', 'Post Created', 'Type', 'Total Interactions',
       'URL', 'Post_content'],
      dtype='object')


In [119]:
facebook_comments_df = facebook_comments_df.rename(columns={"text": "comment"})
facebook_mentions_df = facebook_mentions_df.rename(columns={"Post_content": "post"})
instagram_mentions_df = instagram_mentions_df.rename(columns={"Post_content": "post"})

### Handling missing values

In [123]:
print("Facebook comments(null): ", facebook_comments_df.isnull().sum()["comment"])
print("Instagram comments(null): ", insta_comments_df.isnull().sum()["comment"])
print("LinkedIn comments(null): ", linkedin_comments_df.isnull().sum()["comment"])
print("Facebook mentions(null): ", facebook_mentions_df.isnull().sum()["post"])
print("Instagram mentions(null): ", instagram_mentions_df.isnull().sum()["post"])

Facebook comments(null):  55
Instagram comments(null):  6
LinkedIn comments(null):  17
Facebook mentions(null):  23
Instagram mentions(null):  0


In [124]:
facebook_comments_df = facebook_comments_df.dropna(subset=["comment"])
insta_comments_df = insta_comments_df.dropna(subset=["comment"])
linkedin_comments_df = linkedin_comments_df.dropna(subset=["comment"])
facebook_mentions_df = facebook_mentions_df.dropna(subset=["post"])
instagram_mentions_df = instagram_mentions_df.dropna(subset=["post"])

In [125]:
print("Facebook comments(null): ", facebook_comments_df.isnull().sum()["comment"])
print("Instagram comments(null): ", insta_comments_df.isnull().sum()["comment"])
print("LinkedIn comments(null): ", linkedin_comments_df.isnull().sum()["comment"])
print("Facebook mentions(null): ", facebook_mentions_df.isnull().sum()["post"])
print("Instagram mentions(null): ", instagram_mentions_df.isnull().sum()["post"])

Facebook comments(null):  0
Instagram comments(null):  0
LinkedIn comments(null):  0
Facebook mentions(null):  0
Instagram mentions(null):  0


### Filtering data for the year 2023

In [128]:
linkedin_comments_df["commentDate"] = pd.to_datetime(linkedin_comments_df["commentDate"])
linkedin_comments_df = linkedin_comments_df[linkedin_comments_df["commentDate"] >= "2023-01-01"]

In [130]:
insta_comments_df["commentDate"] = pd.to_datetime(insta_comments_df["commentDate"])
insta_comments_df = insta_comments_df[insta_comments_df["commentDate"] >= "2023-01-01"]

## Initialize Model

In [131]:
model = SentimentModel()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Sentiment Analysis

In [132]:
facebook_comments_df["sentiment"] = facebook_comments_df["comment"].apply(lambda x: model.get_sentiment_prediction(text=x, best_score=True)["sentiment"])
insta_comments_df["sentiment"] = insta_comments_df["comment"].apply(lambda x: model.get_sentiment_prediction(text=x, best_score=True)["sentiment"])
linkedin_comments_df["sentiment"] = linkedin_comments_df["comment"].apply(lambda x: model.get_sentiment_prediction(text=x, best_score=True)["sentiment"])
facebook_mentions_df["sentiment"] = facebook_mentions_df["post"].apply(lambda x: model.get_sentiment_prediction(text=x, best_score=True)["sentiment"])
instagram_mentions_df["sentiment"] = instagram_mentions_df["post"].apply(lambda x: model.get_sentiment_prediction(text=x, best_score=True)["sentiment"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_comments_df["sentiment"] = facebook_comments_df["comment"].apply(lambda x: model.get_sentiment_prediction(text=x, best_score=True)["sentiment"])


### Topic analysis

In [66]:
def sigmoid(x):
  return 1 / (1 + math.exp(-x))

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-dec2021-tweet-topic-multi-all", problem_type="multi_label_classification")
model.eval()
class_mapping = model.config.id2label

Downloading tokenizer_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 354/354 [00:00<00:00, 291kB/s]
Downloading vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 780k/780k [00:00<00:00, 2.83MB/s]
Downloading merges.txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 446k/446k [00:00<00:00, 1.99MB/s]
Downloading tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.29M/1.29M [00:00<00:00, 3.19MB/s]
Downloading special_tokens_map.json: 100%|██████████████████████████████████████████████████

['film_tv_&_video', 'music']


In [102]:
def get_topic(text: str):
    try:
        with torch.no_grad():
            tokens = tokenizer(text, return_tensors='pt')
            output = model(**tokens)
            topic_probs = [sigmoid(x) for x in output[0][0].detach().tolist()]
        return class_mapping[np.argmax(topic_probs)]
    except:
        return "others"

In [None]:
facebook_df["topic"] = facebook_df["comment"].apply(get_topic)
linkedin_df["topic"] = linkedin_df["comment"].apply(get_topic)
insta_df["topic"] = insta_df["comment"].apply(get_topic)

## sentiment distribution

In [None]:
facebook_df.groupby('sentiment').size()

In [57]:
linkedin_df.groupby('sentiment').size()

sentiment
negative     30
neutral     422
positive    600
dtype: int64

In [58]:
insta_df.groupby('sentiment').size()

sentiment
negative     62
neutral     272
positive    743
dtype: int64

In [78]:
facebook_mentions_df.groupby('sentiment').size()

sentiment
negative     16
neutral     508
positive    261
dtype: int64

In [92]:
instagram_mentions_df.groupby('sentiment').size()

sentiment
neutral     43
positive    32
dtype: int64

In [95]:
pd.DataFrame({"facebook": {"negative": 42, 
              "neutral": 482, 
              "positive": 134},
 "instagram": {"negative": 9, 
              "neutral": 43, 
              "positive": 23}})

Unnamed: 0,facebook,instagram
negative,42,9
neutral,482,43
positive,134,23


## Topic Distribution

In [147]:
facebook_df.groupby(['sentiment', 'topic']).size()

sentiment  topic                   
negative   business_&_entrepreneurs      3
           diaries_&_daily_life          1
           news_&_social_concern         1
           science_&_technology          5
           sports                        1
neutral    business_&_entrepreneurs     22
           celebrity_&_pop_culture      25
           diaries_&_daily_life        113
           fashion_&_style               4
           film_tv_&_video              16
           fitness_&_health              1
           food_&_dining                 2
           gaming                        2
           learning_&_educational        2
           music                        17
           news_&_social_concern        23
           other_hobbies                 9
           others                        1
           science_&_technology         17
           sports                       42
positive   arts_&_culture                1
           business_&_entrepreneurs     10
           celebri

In [148]:
linkedin_df.groupby(['sentiment', 'topic']).size()

sentiment  topic                   
negative   business_&_entrepreneurs      2
           diaries_&_daily_life          9
           film_tv_&_video               2
           fitness_&_health              4
           news_&_social_concern         2
           science_&_technology          7
           sports                        4
neutral    arts_&_culture                1
           business_&_entrepreneurs     32
           celebrity_&_pop_culture      19
           diaries_&_daily_life        124
           fashion_&_style               3
           film_tv_&_video               9
           fitness_&_health              6
           food_&_dining                 5
           gaming                       13
           music                        10
           news_&_social_concern        21
           other_hobbies                 8
           science_&_technology        116
           sports                       54
           travel_&_adventure            1
positive   arts_&_

In [149]:
insta_df.groupby(['sentiment', 'topic']).size()

sentiment  topic                   
negative   business_&_entrepreneurs      4
           celebrity_&_pop_culture       2
           diaries_&_daily_life         10
           gaming                        1
           news_&_social_concern        34
           science_&_technology          9
           sports                        2
neutral    arts_&_culture                1
           business_&_entrepreneurs     21
           celebrity_&_pop_culture      12
           diaries_&_daily_life         90
           fashion_&_style               1
           film_tv_&_video               9
           fitness_&_health              2
           food_&_dining                 3
           gaming                       12
           music                         8
           news_&_social_concern        22
           other_hobbies                 5
           science_&_technology         49
           sports                       37
positive   arts_&_culture                1
           busines

## Save the dataframes

In [153]:
folder_name = "output/"
facebook_comments_file = "dmg_mori_facbook_comments_2023_sentiments.csv"
linkedin_comments_file = "dmg_mori_linkedin_comments_2023_sentiments.csv"
insta_comments_file = "dmg_mori_insta_comments_2023_sentiments.csv"
facebook_mentions_file = "dmg_mori_facbook_mentions_2023_sentiments.csv"
insta_mentions_file = "dmg_mori_instagram_mentions_2023_sentiments.csv"



upload_dataframe_to_s3(df=facebook_comments_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=facebook_comments_file, folder_name=folder_name)
upload_dataframe_to_s3(df=linkedin_comments_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=linkedin_comments_file, folder_name=folder_name)
upload_dataframe_to_s3(df=insta_comments_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=insta_comments_file, folder_name=folder_name)
upload_dataframe_to_s3(df=facebook_mentions_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=facebook_mentions_file, folder_name=folder_name)
upload_dataframe_to_s3(df=instagram_mentions_df, access_key=access_key, secret_key=secret_key, bucket_name=bucket_name, file_name=insta_mentions_file, folder_name=folder_name)


invalid value encountered in cast



connecting to S3
Successfully uploaded!!
connecting to S3
Successfully uploaded!!
connecting to S3
Successfully uploaded!!


In [None]:
insta: https://dmbmoridata.s3.eu-central-1.amazonaws.com/output/dmg_mori_facbook_mentions_2023_sentiments.csv?response-content-disposition=inline&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEPn%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaDGV1LWNlbnRyYWwtMSJHMEUCIQCWkvOG7rPMSHd66T0sA3pCIC4r%2FyI1vjbIOHuXk0rcVQIgSClBZtv%2FgSZ608Qwv8F6AGOD0woVc0pYyO%2BHe3MrLd8q5AIIExADGgw4ODUxMjYzMTM5OTMiDK7tOXT4Og%2FeXbzwtirBAiM7inpShZm9piHtnve8TpWb9kM80Co7OtdaWugjkSOQDE8BZQL50IIg4Rsp1H7%2BD1bNGrh0ZigQlSCT8TkjRpsUT8K5FHNRY%2FBCRE7zlaR31y1movpH73GxACot3AFcHy4IErMPzavWAy64q32sl3pc0KaqzYPjevPJflT%2FVZ3jmR7zaAsTS6y6K696Dn6uZdvAXmAv5qcRrrnLwPObvUblN%2BrSAX08my7P%2FhgfIgUzafxSYI0ORSLkLCAiEgl6K5bgumNAKA5BZqpXTl5SlxWRecY0lbhhA8tonUmx3P1qoiWgJut7UE6Suf0GFBXnXwWrZYLBg6LtW91dv9s0KcDWAbGT6Xearjhg6uh3wgPECcquqt55jCcXekfsnmUPY31gEZtjeX4IhAjKVktedkOQ8pYSzBa%2F3wbAz70INnLwfjDX9Z6pBjqzAjGIo0oGZead6Kh6wdZn52km0k69cRZTmK3DdL3jO1sRAo4gfJTmLkNW2H7gomlRZVtUAIOeOUXL7HVNtfOs%2BEH0HXdvDfCO8yhULaMc%2BMTiOfdIqPMbkZSwCTVusWrZWf%2BX3CKyDvrW2QjKoRjILXlDbcFp%2BvCBCch5VRWVvpWK%2Fd3rI9QryHrJJADbCnTtfwz3NPUxBHJ4EAAcQ4fvWZpjTAMI9gyDNOgqWWA%2BzfUF2rqdgrE8VdE1E8GFukypPFNcU%2BvAu8ZoDIbJ%2F9wKsuYQqjqXN7rdgIvTUoYJ%2F9HltpYnoFHte3LUEBxWIDKYFPWsZ3xrEKnAzWvFQ%2BVh5Mo90y2bFs%2FN353ANk8HteYFLNPpwDgMSkoN3936AuEXdL8DlMP5XcORSbpp9VUYnm6fp%2FA%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20231012T134841Z&X-Amz-SignedHeaders=host&X-Amz-Expires=43199&X-Amz-Credential=ASIA44FNDXAEXL2I6AIX%2F20231012%2Feu-central-1%2Fs3%2Faws4_request&X-Amz-Signature=2aa61867318aafc9a960e91d8e78f7588f16d46f1c53c17975e2f3e4faf47f07

In [18]:
facebook: https://dmbmoridata.s3.eu-central-1.amazonaws.com/output/dmg_mori_instagram_mentions_2023_sentiments.csv?response-content-disposition=inline&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEPn%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaDGV1LWNlbnRyYWwtMSJHMEUCIQCWkvOG7rPMSHd66T0sA3pCIC4r%2FyI1vjbIOHuXk0rcVQIgSClBZtv%2FgSZ608Qwv8F6AGOD0woVc0pYyO%2BHe3MrLd8q5AIIExADGgw4ODUxMjYzMTM5OTMiDK7tOXT4Og%2FeXbzwtirBAiM7inpShZm9piHtnve8TpWb9kM80Co7OtdaWugjkSOQDE8BZQL50IIg4Rsp1H7%2BD1bNGrh0ZigQlSCT8TkjRpsUT8K5FHNRY%2FBCRE7zlaR31y1movpH73GxACot3AFcHy4IErMPzavWAy64q32sl3pc0KaqzYPjevPJflT%2FVZ3jmR7zaAsTS6y6K696Dn6uZdvAXmAv5qcRrrnLwPObvUblN%2BrSAX08my7P%2FhgfIgUzafxSYI0ORSLkLCAiEgl6K5bgumNAKA5BZqpXTl5SlxWRecY0lbhhA8tonUmx3P1qoiWgJut7UE6Suf0GFBXnXwWrZYLBg6LtW91dv9s0KcDWAbGT6Xearjhg6uh3wgPECcquqt55jCcXekfsnmUPY31gEZtjeX4IhAjKVktedkOQ8pYSzBa%2F3wbAz70INnLwfjDX9Z6pBjqzAjGIo0oGZead6Kh6wdZn52km0k69cRZTmK3DdL3jO1sRAo4gfJTmLkNW2H7gomlRZVtUAIOeOUXL7HVNtfOs%2BEH0HXdvDfCO8yhULaMc%2BMTiOfdIqPMbkZSwCTVusWrZWf%2BX3CKyDvrW2QjKoRjILXlDbcFp%2BvCBCch5VRWVvpWK%2Fd3rI9QryHrJJADbCnTtfwz3NPUxBHJ4EAAcQ4fvWZpjTAMI9gyDNOgqWWA%2BzfUF2rqdgrE8VdE1E8GFukypPFNcU%2BvAu8ZoDIbJ%2F9wKsuYQqjqXN7rdgIvTUoYJ%2F9HltpYnoFHte3LUEBxWIDKYFPWsZ3xrEKnAzWvFQ%2BVh5Mo90y2bFs%2FN353ANk8HteYFLNPpwDgMSkoN3936AuEXdL8DlMP5XcORSbpp9VUYnm6fp%2FA%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20231012T134935Z&X-Amz-SignedHeaders=host&X-Amz-Expires=43200&X-Amz-Credential=ASIA44FNDXAEXL2I6AIX%2F20231012%2Feu-central-1%2Fs3%2Faws4_request&X-Amz-Signature=443b83cc8708bdbb520e5928b59a14bc7182283d6db290f8b6e350031e302b48

torch.Size([1, 4802])