In [39]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd
import warnings
import ast
import os


warnings.filterwarnings('ignore') # silence all Python warnings so cell output stays readable (this also hides warnings raised by pandas and other libraries)

# Reading dataset

In [40]:
# Path of the Instagram posts CSV, relative to this notebook.
path = "../instagram_data.csv"

# Read the file into a DataFrame and preview its first row.
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following
0,36063641,christendominique,C3_GS1ASeWI,False,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0


In [41]:
# Dimensions of the freshly loaded frame as (rows, columns).
data.shape

(11692, 14)

In [42]:
# Number of missing values in each column of the raw data.
data.isnull().sum()

owner_id              0
owner_username        2
shortcode             2
is_video              2
caption             161
comments              2
likes                 2
created_at            3
location           7169
imageUrl              3
multiple_images       3
username           2115
followers          2115
following          2115
dtype: int64

Drop rows with NaN values in the `caption` or `username` columns.

In [43]:
# Remove rows that lack a caption or a username. Rebinding the name (instead of
# inplace=True) keeps the step explicit and avoids mutating the loaded frame in place.
data = data.dropna(subset=["caption", "username"])

Check whether the two username columns (`owner_username` and `username`) differ in any row.
There are some mismatches, but only where `username` is NaN.


In [44]:
# Rows where the two username columns disagree, ignoring rows whose username is missing.
names_differ = data.owner_username != data.username
has_username = data.username.notna()
data.loc[names_differ & has_username]

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following


In [45]:
# Re-count missing values per column after the rows above were dropped.
data.isnull().sum()

owner_id              0
owner_username        0
shortcode             0
is_video              0
caption               0
comments              0
likes                 0
created_at            0
location           5697
imageUrl              0
multiple_images       0
username              0
followers             0
following             0
dtype: int64

-------

# Sentiment analysis with Roberta


I want to perform sentiment analysis on the captions in this dataset.
For this analysis I will use the `transformers` library and a `RoBERTa` model.

In [46]:
# Checkpoint identifier passed to both from_pretrained calls below.
# A plain string literal is enough here: there is nothing to interpolate.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# Tokenizer and sequence-classification model loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [52]:
def emotional_text(row):
    """Score the sentiment of one row's caption with the loaded model.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the caption text under the ``"caption"`` key.

    Returns
    -------
    tuple
        ``(scores_dict, max_key, max_value)``: ``scores_dict`` maps ``"neg"``,
        ``"neu"`` and ``"pos"`` to the softmax probabilities as plain Python
        floats, ``max_key`` is the label with the highest probability and
        ``max_value`` is that probability.

    Notes
    -----
    Relies on the notebook-level ``tokenizer`` and ``model`` objects.
    Assumes logits 0/1/2 correspond to negative/neutral/positive --
    TODO confirm against ``model.config.id2label``.
    """
    import torch  # local import: the notebook's import cell does not bring torch into scope

    text = row["caption"]
    # Inputs longer than 512 tokens are truncated by the tokenizer.
    encoded_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: disable autograd so no gradient graph is built per caption.
    with torch.no_grad():
        output = model(**encoded_text)

    scores = softmax(output.logits[0].detach().numpy())

    # Store plain floats rather than numpy scalars so that the dictionary's text
    # form (as written to CSV) contains bare numbers regardless of numpy version.
    scores_dict = {"neg": float(scores[0]), "neu": float(scores[1]), "pos": float(scores[2])}

    # Pick the (label, probability) pair with the largest probability.
    max_key, max_value = max(scores_dict.items(), key=lambda item: item[1])

    return scores_dict, max_key, max_value

In [55]:
# Run the sentiment function on every row; each result is a 3-item tuple.
emotion_results = data.apply(emotional_text, axis=1)

# Regroup the per-row tuples into three column-wise sequences and attach them.
all_scores, top_labels, top_scores = zip(*emotion_results)
data["caption_emotions"] = all_scores
data["most_powerful_emotion"] = top_labels
data["emotion_score"] = top_scores
data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score
0,36063641,christendominique,C3_GS1ASeWI,False,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1.709327e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0,"{'neg': 0.034616627, 'neu': 0.6108251, 'pos': ...",neu,0.610825
1,36063641,christendominique,C38ivgNS3IX,True,😮‍💨Brow tips I really wish I would have know w...,138,9267,1.709241e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.17225474, 'neu': 0.5420607, 'pos': 0...",neu,0.542061
2,36063641,christendominique,C35-Dd9SO1b,True,OMG I can’t believe it’s already been 1 yr sin...,1089,10100,1.709155e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.09999306, 'neu': 0.23255272, 'pos': ...",pos,0.667454
3,36063641,christendominique,C33TadDMisq,True,90’s Glam was Pam! \n\nMakeup \n@smashboxcosme...,271,6943,1.709065e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.14957997, 'neu': 0.6310919, 'pos': 0...",neu,0.631092
4,36063641,christendominique,C3s-Cm1yCba,True,Chiseled & Sculptured 🏛️\n\n Contour @westmana...,145,17158,1.708718e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.06114171, 'neu': 0.7887955, 'pos': 0...",neu,0.788795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11686,877755538,m_eye_nd,C35xLDsPPcj,True,which part is it? 🥺💛🫂,0,123,1.709148e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,m_eye_nd,331373.0,260.0,"{'neg': 0.07224331, 'neu': 0.74660265, 'pos': ...",neu,0.746603
11687,877755538,m_eye_nd,C35pT7pPs8e,False,which one is your fav 🥺💛🌌🌊🥀,4,1241,1.709144e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,False,m_eye_nd,331373.0,260.0,"{'neg': 0.06301919, 'neu': 0.59165555, 'pos': ...",neu,0.591656
11688,877755538,m_eye_nd,C34fjtCvi45,False,do you miss that? 🥺💛😭,11,806,1.709105e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,False,m_eye_nd,331373.0,260.0,"{'neg': 0.59099007, 'neu': 0.35064402, 'pos': ...",neg,0.590990
11689,877755538,m_eye_nd,C33nKF4vCQR,False,"but it is wonderful while it lasts, right? 🥺💛",15,4828,1.709076e+09,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,False,m_eye_nd,331373.0,260.0,"{'neg': 0.02124086, 'neu': 0.0943668, 'pos': 0...",pos,0.884392


In [57]:
folder_path = '../2.Data_with_detected_caption_emotions'
# exist_ok=True creates the folder only when it is missing, with no separate existence check.
os.makedirs(folder_path, exist_ok=True)

# Save the frame (now including the sentiment columns) to a CSV file in that folder, without the row index.
data.to_csv(os.path.join(folder_path, 'Data_with_detected_caption_emotions.csv'), index=False)

---

# Locational Data
Rows of the data that have a non-null `location` value.

In [58]:
# Keep only the rows that have a location value. An explicit copy is taken so the
# column assignments made to this frame afterwards act on an independent object
# instead of a slice of `data` (which triggers pandas' SettingWithCopyWarning).
locational_data = data.loc[data.location.notna()].copy()
locational_data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score
28,40506412,christinehmcconnell,CzJtDjegmdI,False,Had an amazing time in #salem for Halloween th...,466,79000,1698945000.0,"{'id': '34537076', 'has_public_page': True, 'n...",https://instagram.ffor43-1.fna.fbcdn.net/v/t39...,True,christinehmcconnell,597189.0,1576.0,"{'neg': 0.020397255, 'neu': 0.08311723, 'pos':...",pos,0.896486


*`converting all location values to dictionaries`*

In [59]:
def dict_maker(row):
    """Return the row's ``"location"`` entry as a Python object.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the location under the ``"location"`` key, either as the
        text of a Python literal or as an already-parsed dictionary.

    Returns
    -------
    The value produced by ``ast.literal_eval`` for text input, or the
    dictionary itself when it has already been parsed.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid literal.
    """
    value = row["location"]
    # Re-running the cell would hand us dictionaries, which literal_eval rejects;
    # pass them through unchanged so the conversion is idempotent.
    if isinstance(value, dict):
        return value
    return ast.literal_eval(value)

# Convert every row's location entry with dict_maker, then store the result back in the column.
parsed_locations = locational_data.apply(dict_maker, axis=1)
locational_data["location"] = parsed_locations

*`extracting the location slug (e.g. salem-massachusetts) from each dictionary, then replacing the location column with the resulting list of slugs`*

In [60]:
# Collect the "slug" entry of every location dictionary, in row order.
location = [place["slug"] for place in locational_data.location]

# Overwrite the location column with the collected slugs.
locational_data.location = location

In [61]:
# Retain only the rows whose username is present, then preview the first one.
has_username = locational_data['username'].notna()
locational_data = locational_data[has_username]
locational_data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score
28,40506412,christinehmcconnell,CzJtDjegmdI,False,Had an amazing time in #salem for Halloween th...,466,79000,1698945000.0,salem-massachusetts,https://instagram.ffor43-1.fna.fbcdn.net/v/t39...,True,christinehmcconnell,597189.0,1576.0,"{'neg': 0.020397255, 'neu': 0.08311723, 'pos':...",pos,0.896486


I will export this frame as the "Locational_dataset".

In [62]:
# Missing values per column in the locational frame.
locational_data.isnull().sum()

owner_id                 0
owner_username           0
shortcode                0
is_video                 0
caption                  0
comments                 0
likes                    0
created_at               0
location                 0
imageUrl                 0
multiple_images          0
username                 0
followers                0
following                0
caption_emotions         0
most_powerful_emotion    0
emotion_score            0
dtype: int64

In [63]:
# Dimensions of the locational frame as (rows, columns).
locational_data.shape

(3744, 17)

----

# Non-Locational Data

In [64]:
# All rows with a known username, minus the location column. Note the filter is on
# username only, so rows that do have a location are included here as well.
# The column is dropped in a chained, non-inplace call so that no slice of `data`
# is mutated (the inplace form triggers pandas' SettingWithCopyWarning).
nonlocational_data = data.loc[data.username.notna()].drop(columns="location")

In [65]:
# Preview the first row of the frame without the location column.
nonlocational_data.head(n=1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score
0,36063641,christendominique,C3_GS1ASeWI,False,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0,"{'neg': 0.034616627, 'neu': 0.6108251, 'pos': ...",neu,0.610825


In [66]:
# Missing values per column in the non-locational frame.
nonlocational_data.isnull().sum()

owner_id                 0
owner_username           0
shortcode                0
is_video                 0
caption                  0
comments                 0
likes                    0
created_at               0
imageUrl                 0
multiple_images          0
username                 0
followers                0
following                0
caption_emotions         0
most_powerful_emotion    0
emotion_score            0
dtype: int64

---

# Exporting Data 
Export the data to CSV files in the `2.Prepared Data` folder.

In [67]:
folder_path = '../2.Prepared Data'
# exist_ok=True creates the folder only when it is missing, with no separate existence check.
os.makedirs(folder_path, exist_ok=True)

# Save each DataFrame to its own CSV file in that folder, without the row index.
nonlocational_data.to_csv(os.path.join(folder_path, 'non_locational_data.csv'), index=False)
locational_data.to_csv(os.path.join(folder_path, 'locational_data.csv'), index=False)