In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd
import warnings
import ast
import os


warnings.filterwarnings('ignore') # for more clear output

# Reading dataset

In [3]:
path = "../instagram_data.csv"
data =pd.read_csv(path)
data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following
0,36063641,christendominique,C3_GS1ASeWI,False,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0


In [4]:
data.shape

(11692, 14)

In [5]:
data.isna().sum()

owner_id              0
owner_username        2
shortcode             2
is_video              2
caption             161
comments              2
likes                 2
created_at            3
location           7169
imageUrl              3
multiple_images       3
username           2115
followers          2115
following          2115
dtype: int64

drop nan values in caption column

In [6]:
data.dropna(subset=["caption", "username"], inplace=True)

see if there is any diffrance in user names on two column for each row 
there are some cases but just when username is NAN


In [7]:
data.loc[(data.owner_username != data.username) & (data.username.isna() != True)]

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following


In [8]:
data.isna().sum()

owner_id              0
owner_username        0
shortcode             0
is_video              0
caption               0
comments              0
likes                 0
created_at            0
location           5697
imageUrl              0
multiple_images       0
username              0
followers             0
following             0
dtype: int64

---
# Handling duplicates

In [9]:
data.shape

(9441, 14)

In [10]:
data.drop_duplicates(inplace=True)


In [11]:
data.shape

(6844, 14)

-------

# Check DataType 

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6844 entries, 0 to 11690
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   owner_id               6844 non-null   object 
 1   owner_username         6844 non-null   object 
 2   shortcode              6844 non-null   object 
 3   is_video               6844 non-null   object 
 4   caption                6844 non-null   object 
 5   comments               6844 non-null   object 
 6   likes                  6844 non-null   object 
 7   created_at             6844 non-null   float64
 8   location               2784 non-null   object 
 9   imageUrl               6844 non-null   object 
 10  multiple_images        6844 non-null   object 
 11  username               6844 non-null   object 
 12  followers              6844 non-null   float64
 13  following              6844 non-null   float64
 14  caption_emotions       6844 non-null   object 
 15  most_pow

In [20]:
data["is_video"] = data["is_video"].astype(bool)
data["comments"] = data["comments"].astype(int)
data["likes"] = data["likes"].astype(int)
data["multiple_images"] = data["multiple_images"].astype(bool)

-----

# Sentiment analysis with Roberta


I want to perform sentiment analysis on captions in this dataset.
for this analysis i will use `transform` library and `Roberta` model.

In [21]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [22]:
def emotional_text(row):

    text = row["caption"]
    encoded_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    output = model(**encoded_text)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {"neg": scores[0], "neu": scores[1], "pos": scores[2]}

    max_key, max_value = max(scores_dict.items(), key=lambda item: item[1])

    return scores_dict, max_key, max_value

In [23]:
data["caption_emotions"], data["most_powerful_emotion"], data["emotion_score"] = zip(*data.apply(emotional_text, axis=1))
data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score
0,36063641,christendominique,C3_GS1ASeWI,True,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0,"{'neg': 0.034616627, 'neu': 0.6108251, 'pos': ...",neu,0.610825


In [24]:
folder_path = '../2.Data_with_detected_caption_emotions'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Save each DataFrame to a CSV file in the new folder
data.to_csv(os.path.join(folder_path, 'Data_with_detected_caption_emotions.csv'), index=False)

----

# Engagement Rate
#### `Instagram Post Engagement Rate = (Post’s Likes + Post’s Comments) / Followers * 100%` 
Average Engagement Rates:
As an industry standard, here are some benchmarks for engagement rates on Instagram:

Between 1% and 3.5%: This is considered an average/good engagement rate.

Between 3.5% and 6%: This indicates a high engagement rate.

Above 6%: A rate above 6% is considered very high engagement2.

Keep in mind that these ranges can vary based on factors like niche, content type, and audience demographics.

In [27]:
def IPER(row): # Instagram Post Engagement Rate

    iper = (row["likes"] + row["comments"]) / row["followers"]
    return iper * 100

data["engagement_rate(%)"] = data.apply(IPER, axis=1)
data.head(2)


Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score,engagement_rate,engagement_rate(%)
0,36063641,christendominique,C3_GS1ASeWI,True,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0,"{'neg': 0.034616627, 'neu': 0.6108251, 'pos': ...",neu,0.610825,0.007764,0.776359
1,36063641,christendominique,C38ivgNS3IX,True,😮‍💨Brow tips I really wish I would have know w...,138,9267,1709241000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.17225474, 'neu': 0.5420607, 'pos': 0...",neu,0.542061,0.004385,0.438538


# separating Hashtags

In [28]:
def extract_hashtags(row):
    text = row["caption"]
    hashtag_list = []
    for word in text.split():
 
        if word[0] == '#':
            hashtag_list.append(word[1:])
    return hashtag_list

data["hashtag"] = data.apply(extract_hashtags, axis=1)
 

In [30]:
data.head()

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,multiple_images,username,followers,following,caption_emotions,most_powerful_emotion,emotion_score,engagement_rate,engagement_rate(%),hashtag
0,36063641,christendominique,C3_GS1ASeWI,True,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,christendominique,2144626.0,1021.0,"{'neg': 0.034616627, 'neu': 0.6108251, 'pos': ...",neu,0.610825,0.007764,0.776359,[]
1,36063641,christendominique,C38ivgNS3IX,True,😮‍💨Brow tips I really wish I would have know w...,138,9267,1709241000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.17225474, 'neu': 0.5420607, 'pos': 0...",neu,0.542061,0.004385,0.438538,"[browtips, eyebrowtutorial, browmakeup, eyebro..."
2,36063641,christendominique,C35-Dd9SO1b,True,OMG I can’t believe it’s already been 1 yr sin...,1089,10100,1709155000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.09999306, 'neu': 0.23255272, 'pos': ...",pos,0.667454,0.005217,0.521723,"[sigmaxchristendominique, makeupforbeginners, ..."
3,36063641,christendominique,C33TadDMisq,True,90’s Glam was Pam! \n\nMakeup \n@smashboxcosme...,271,6943,1709065000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.14957997, 'neu': 0.6310919, 'pos': 0...",neu,0.631092,0.003364,0.336376,"[90smakeup, sbxambassador, 90smakeup, glammake..."
4,36063641,christendominique,C3s-Cm1yCba,True,Chiseled & Sculptured 🏛️\n\n Contour @westmana...,145,17158,1708718000.0,,https://instagram.flba2-1.fna.fbcdn.net/v/t51....,False,christendominique,2144626.0,1021.0,"{'neg': 0.06114171, 'neu': 0.7887955, 'pos': 0...",neu,0.788795,0.008068,0.806807,"[contour, makeup, makeuptutorial, contourtutor..."


---

# Adding data and time

In [31]:
data["date"] = pd.to_datetime(data['created_at'], unit='s').dt.date
data["time"] = pd.to_datetime(data['created_at'], unit='s').dt.time

----

# Locational Data
data rows with filled location values

In [32]:
locational_data = data.loc[data.location.isna() != True]
locational_data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,...,followers,following,caption_emotions,most_powerful_emotion,emotion_score,engagement_rate,engagement_rate(%),hashtag,date,time
28,40506412,christinehmcconnell,CzJtDjegmdI,True,Had an amazing time in #salem for Halloween th...,466,79000,1698945000.0,"{'id': '34537076', 'has_public_page': True, 'n...",https://instagram.ffor43-1.fna.fbcdn.net/v/t39...,...,597189.0,1576.0,"{'neg': 0.020397255, 'neu': 0.08311723, 'pos':...",pos,0.896486,0.133067,13.306675,"[salem, halloween, salemjack]",2023-11-02,17:13:26


*`coverting all values to dictionary`*

In [33]:
def dict_maker(row):
    return ast.literal_eval(row["location"])

locational_data["location"] = locational_data.apply(dict_maker, axis=1)

*`getting all city names in dictionary, and then replace the final list of city names with current column of location`*

In [34]:
location = []
for key in locational_data.location:
    location.append(key["slug"])

# replace old colum with new list
locational_data.location = location

In [35]:
locational_data = locational_data[locational_data['username'].notna()]
locational_data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,location,imageUrl,...,followers,following,caption_emotions,most_powerful_emotion,emotion_score,engagement_rate,engagement_rate(%),hashtag,date,time
28,40506412,christinehmcconnell,CzJtDjegmdI,True,Had an amazing time in #salem for Halloween th...,466,79000,1698945000.0,salem-massachusetts,https://instagram.ffor43-1.fna.fbcdn.net/v/t39...,...,597189.0,1576.0,"{'neg': 0.020397255, 'neu': 0.08311723, 'pos':...",pos,0.896486,0.133067,13.306675,"[salem, halloween, salemjack]",2023-11-02,17:13:26


i will convert this to dataset "Locational_dataset" 

In [36]:
locational_data.isna().sum()

owner_id                 0
owner_username           0
shortcode                0
is_video                 0
caption                  0
comments                 0
likes                    0
created_at               0
location                 0
imageUrl                 0
multiple_images          0
username                 0
followers                0
following                0
caption_emotions         0
most_powerful_emotion    0
emotion_score            0
engagement_rate          0
engagement_rate(%)       0
hashtag                  0
date                     0
time                     0
dtype: int64

In [37]:
locational_data.shape

(2784, 22)

----

# Non-Locational Data

In [38]:
nonlocational_data = data.loc[data.username.isna() == False]
nonlocational_data.drop(columns="location", inplace=True)

In [39]:
nonlocational_data.head(1)

Unnamed: 0,owner_id,owner_username,shortcode,is_video,caption,comments,likes,created_at,imageUrl,multiple_images,...,followers,following,caption_emotions,most_powerful_emotion,emotion_score,engagement_rate,engagement_rate(%),hashtag,date,time
0,36063641,christendominique,C3_GS1ASeWI,True,I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @...,268,16382,1709327000.0,https://instagram.flba2-1.fna.fbcdn.net/v/t39....,True,...,2144626.0,1021.0,"{'neg': 0.034616627, 'neu': 0.6108251, 'pos': ...",neu,0.610825,0.007764,0.776359,[],2024-03-01,20:59:18


In [40]:
nonlocational_data.isna().sum()

owner_id                 0
owner_username           0
shortcode                0
is_video                 0
caption                  0
comments                 0
likes                    0
created_at               0
imageUrl                 0
multiple_images          0
username                 0
followers                0
following                0
caption_emotions         0
most_powerful_emotion    0
emotion_score            0
engagement_rate          0
engagement_rate(%)       0
hashtag                  0
date                     0
time                     0
dtype: int64

---

# Exporting Data 
Exporting Data on csv file in `prepare data` folder

In [41]:
folder_path = '../3.Prepared Data'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Save each DataFrame to a CSV file in the new folder
nonlocational_data.to_csv(os.path.join(folder_path, 'non_locational_data.csv'), index=False)
locational_data.to_csv(os.path.join(folder_path, 'locational_data.csv'), index=False)