In [9]:
# Import the pandas library
import pandas as pd

# Open the three data sets
content_df = pd.read_csv('Content.csv')
reaction_types_df = pd.read_csv('ReactionTypes.csv')
reactions_df = pd.read_csv('Reactions.csv')

# Remove rows that have values which are missing
content_df.dropna(inplace=True)
reaction_types_df.dropna(inplace=True)
reactions_df.dropna(inplace=True)

# Change the data type of some values within a column
content_df['Category'] = content_df['Category'].astype(str)
reaction_types_df['Sentiment'] = reaction_types_df['Sentiment'].astype(str)

# Remove columns which are not relevant to this task
content_df = content_df.drop(['Type', 'URL'], axis=1)
reaction_types_df = reaction_types_df.drop(['Score'], axis=1)
reactions_df = reactions_df.drop(['Datetime'], axis=1)

# Print the cleaned data sets
print(content_df)
print(reaction_types_df)
print(reactions_df)

     Unnamed: 0                            Content ID   
0             0  97522e57-d9ab-4bd6-97bf-c24d952602d2  \
1             1  9f737e0a-3cdd-4d29-9d24-753f4e3be810   
2             2  230c4e4d-70c3-461d-b42c-ec09396efb3f   
3             3  356fff80-da4d-4785-9f43-bc1261031dc6   
4             4  01ab84dd-6364-4236-abbb-3f237db77180   
..          ...                                   ...   
994         994  c54b49c4-b1f2-4641-8595-1cbd19937845   
996         996  7a79f4e4-3b7d-44dc-bdef-bc990740252c   
997         997  435007a5-6261-4d8b-b0a4-55fdc189754b   
998         998  4e4c9690-c013-4ee7-9e66-943d8cbd27b7   
999         999  75d6b589-7fae-4a6d-b0d0-752845150e56   

                                  User ID        Category  
0    8d3cd87d-8a31-4935-9a4f-b319bfe05f31        Studying  
1    beb1f34e-7870-46d6-9fc7-2e12eb83ce43  healthy eating  
2    a5c65404-5894-4b87-82f2-d787cbee86b4  healthy eating  
3    9fb4ce88-fac1-406c-8544-1a899cee7aaf      technology  
4    e206e31b-5

In [10]:
# Merge the data sets on the Content ID column
final_df = reactions_df.merge(content_df, on='Content ID')
final_df = final_df.merge(reaction_types_df, on='Type')

In [11]:
final_df.columns

Index(['Unnamed: 0_x', 'Content ID', 'User ID_x', 'Type', 'Unnamed: 0_y',
       'User ID_y', 'Category', 'Unnamed: 0', 'Sentiment'],
      dtype='object')

In [12]:
final_df.head()

Unnamed: 0.1,Unnamed: 0_x,Content ID,User ID_x,Type,Unnamed: 0_y,User ID_y,Category,Unnamed: 0,Sentiment
0,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,0,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,Studying,2,negative
1,4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,0,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,Studying,2,negative
2,35,97522e57-d9ab-4bd6-97bf-c24d952602d2,13c06e7e-833d-47eb-a790-5e09ccfd8d2c,disgust,0,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,Studying,2,negative
3,52,9f737e0a-3cdd-4d29-9d24-753f4e3be810,8b49caad-bcc5-43de-bf40-34a66ff8805c,disgust,1,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,healthy eating,2,negative
4,88,230c4e4d-70c3-461d-b42c-ec09396efb3f,ef147ea5-9696-44d5-b6c2-a43f62fd8ce2,disgust,2,a5c65404-5894-4b87-82f2-d787cbee86b4,healthy eating,2,negative


In [13]:
final_df['Sentiment'].unique()

array(['negative', 'positive', 'neutral'], dtype=object)

In [14]:
# Create a dictionary to map sentiment values to numeric values
sentiment_mapping = {
    'negative': -1,
    'positive': 1,
    'neutral': 0,
}

# Apply the sentiment mapping to the 'Sentiment' column
final_df['Sentiment'] = final_df['Sentiment'].apply(lambda x: sentiment_mapping[x])

In [15]:
# Calculate the total score for each category
final_df['Total Score'] = final_df['Unnamed: 0'] * final_df['Sentiment']

# Sort the data by total score and get the top 5 performing categories
top_5_categories = final_df.groupby('Category')['Total Score'].sum().sort_values(ascending=False).head(5)

# Print the top 5 categories
print(top_5_categories)

Category
healthy eating     1359
science            1244
education          1040
public speaking     969
food                961
Name: Total Score, dtype: int64
