# Data Cleaning

## Import packages

In [16]:
# Data manipulation
import numpy as np
import pandas as pd
import datetime as dt

# Data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Shows plots in jupyter notebook
%matplotlib inline

# Set plot style
sns.set(color_codes=True)

# Ignore warning
import warnings
warnings.filterwarnings('ignore')

In [17]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [18]:
print('NumPy', np.__version__)
print('Pandas', pd.__version__)
print('Matplotlib', mpl.__version__)
print('Seaborn', sns.__version__)

NumPy 1.23.5
Pandas 2.0.1
Matplotlib 3.7.1
Seaborn 0.12.2


---
## Loading data with Pandas

In [44]:
# Open the three data sets
content_df = pd.read_csv('Content.csv', index_col=0)
reaction_types_df = pd.read_csv('ReactionTypes.csv', index_col=0)
reactions_df = pd.read_csv('Reactions.csv', index_col=0)

In [45]:
content_df.head()

Unnamed: 0,Content ID,User ID,Type,Category,URL
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [46]:
reaction_types_df.head()

Unnamed: 0,Type,Sentiment,Score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30


In [47]:
reactions_df.head()

Unnamed: 0,Content ID,User ID,Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


---
## Dimensions of data

In [48]:
content_df.shape

(1000, 5)

In [49]:
print('Client data dimension')
print(f'- Number of rows: {content_df.shape[0]}')
print(f'- Number of cols: {content_df.shape[1]}')

Client data dimension
- Number of rows: 1000
- Number of cols: 5


In [50]:
reaction_types_df.shape

(16, 3)

In [51]:
print('Client data dimension')
print(f'- Number of rows: {reaction_types_df.shape[0]}')
print(f'- Number of cols: {reaction_types_df.shape[1]}')

Client data dimension
- Number of rows: 16
- Number of cols: 3


In [52]:
reactions_df.shape

(25553, 4)

In [53]:
print('Client data dimension')
print(f'- Number of rows: {reactions_df.shape[0]}')
print(f'- Number of cols: {reactions_df.shape[1]}')

Client data dimension
- Number of rows: 25553
- Number of cols: 4


---
## Descriptive statistics of data

### Data types of columns

In [54]:
content_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Content ID  1000 non-null   object
 1   User ID     1000 non-null   object
 2   Type        1000 non-null   object
 3   Category    1000 non-null   object
 4   URL         801 non-null    object
dtypes: object(5)
memory usage: 46.9+ KB


In [55]:
reaction_types_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Type       16 non-null     object
 1   Sentiment  16 non-null     object
 2   Score      16 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 512.0+ bytes


In [56]:
reactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25553 entries, 0 to 25552
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Content ID  25553 non-null  object
 1   User ID     22534 non-null  object
 2   Type        24573 non-null  object
 3   Datetime    25553 non-null  object
dtypes: object(4)
memory usage: 998.2+ KB


### Statistics

In [23]:
# Remove rows that have values which are missing
content_df.dropna(inplace=True)
reaction_types_df.dropna(inplace=True)
reactions_df.dropna(inplace=True)

# Change the data type of some values within a column
content_df['Category'] = content_df['Category'].astype(str)
reaction_types_df['Sentiment'] = reaction_types_df['Sentiment'].astype(str)

# Remove columns which are not relevant to this task
content_df = content_df.drop(['Type', 'URL'], axis=1)
reaction_types_df = reaction_types_df.drop(['Score'], axis=1)
reactions_df = reactions_df.drop(['Datetime'], axis=1)

# Print the cleaned data sets
print(content_df.head())
print(reaction_types_df.head())
print(reactions_df.head())

     Unnamed: 0                            Content ID   
0             0  97522e57-d9ab-4bd6-97bf-c24d952602d2  \
1             1  9f737e0a-3cdd-4d29-9d24-753f4e3be810   
2             2  230c4e4d-70c3-461d-b42c-ec09396efb3f   
3             3  356fff80-da4d-4785-9f43-bc1261031dc6   
4             4  01ab84dd-6364-4236-abbb-3f237db77180   
6             6  3f8590c7-6ab2-4973-805a-90cdec355f05   
7             7  e5490118-90d5-4572-ab1c-1fbc87b8d9ca   
8             8  0bedca96-fb76-4287-a83c-17330ed39cce   
9             9  b18cb63f-4c8e-44ee-a47f-541e95191d11   
11           11  46fb701d-6c26-458e-ada3-2ebe5dbba01f   
12           12  0be59876-d70c-486c-8e0b-a06bef7a2cd6   
13           13  81abd65a-3b76-4574-a0a7-db6bf7184ae2   
14           14  e6ee2244-9382-49a9-8cbf-fa54aaaa2392   
16           16  f332d362-dc48-46c2-a64b-641157c0987e   
17           17  a2e93b29-9259-4092-a4d7-62d5e823bb74   
18           18  9b3a6d30-48e8-476c-82be-9031524bd04d   
19           19  850fe90d-47d6-

In [24]:
# Merge the data sets on the Content ID column
final_df = reactions_df.merge(content_df, on='Content ID')
final_df = final_df.merge(reaction_types_df, on='Type')

In [25]:
final_df.columns

Index(['Unnamed: 0_x', 'Content ID', 'User ID_x', 'Type', 'Unnamed: 0_y',
       'User ID_y', 'Category', 'Unnamed: 0', 'Sentiment'],
      dtype='object')

In [26]:
final_df.head()

Unnamed: 0.1,Unnamed: 0_x,Content ID,User ID_x,Type,Unnamed: 0_y,User ID_y,Category,Unnamed: 0,Sentiment
0,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,0,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,Studying,2,negative
1,4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,0,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,Studying,2,negative
2,35,97522e57-d9ab-4bd6-97bf-c24d952602d2,13c06e7e-833d-47eb-a790-5e09ccfd8d2c,disgust,0,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,Studying,2,negative
3,52,9f737e0a-3cdd-4d29-9d24-753f4e3be810,8b49caad-bcc5-43de-bf40-34a66ff8805c,disgust,1,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,healthy eating,2,negative
4,88,230c4e4d-70c3-461d-b42c-ec09396efb3f,ef147ea5-9696-44d5-b6c2-a43f62fd8ce2,disgust,2,a5c65404-5894-4b87-82f2-d787cbee86b4,healthy eating,2,negative


In [27]:
final_df['Sentiment'].unique()

array(['negative', 'positive', 'neutral'], dtype=object)

In [28]:
# Create a dictionary to map sentiment values to numeric values
sentiment_mapping = {
    'negative': -1,
    'positive': 1,
    'neutral': 0,
}

# Apply the sentiment mapping to the 'Sentiment' column
final_df['Sentiment'] = final_df['Sentiment'].apply(lambda x: sentiment_mapping[x])

In [29]:
# Calculate the total score for each category
final_df['Total Score'] = final_df['Unnamed: 0'] * final_df['Sentiment']

# Sort the data by total score and get the top 5 performing categories
top_5_categories = final_df.groupby('Category')['Total Score'].sum().sort_values(ascending=False).head(5)

# Print the top 5 categories
print(top_5_categories)

Category
healthy eating     1359
science            1244
education          1040
public speaking     969
food                961
Name: Total Score, dtype: int64
