In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the table that assigns every tweet_id to either the train or the test split.
# NOTE: the name `id` shadows Python's builtin id(); it is kept because the merge cell below reads it.
data_id_csv = 'data_identification.csv'
id = pd.read_csv(data_id_csv)  # comma is read_csv's default separator

print(id.shape)
id.head()

(1867535, 2)


Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train


In [3]:
# Load the raw tweets; the file is JSON Lines (one JSON record per line).
tweets_json = 'tweets_DM.json'
tweets = pd.read_json(path_or_buf=tweets_json, lines=True)

print(tweets.shape)
tweets.head()

(1867535, 5)


Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets


In [4]:
def _tweet_field(source, field):
    """Return source["tweet"][field], or None when `source` is not a dict containing a "tweet" key."""
    if isinstance(source, dict) and "tweet" in source:
        return source["tweet"][field]
    return None


# Work on a copy so the raw `tweets` frame stays untouched
df = tweets.copy()

# Lift the nested tweet attributes out of the '_source' dictionaries into their own columns
for field in ("hashtags", "tweet_id", "text"):
    df[field] = df["_source"].apply(_tweet_field, args=(field,))

# The nested dictionaries are no longer needed once flattened
df = df.drop(columns="_source")

df.head()

Unnamed: 0,_score,_index,_crawldate,_type,hashtags,tweet_id,text
0,391,hashtag_tweets,2015-05-23 11:42:47,tweets,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,433,hashtag_tweets,2016-01-28 04:52:09,tweets,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,232,hashtag_tweets,2017-12-25 04:39:20,tweets,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k..."
3,376,hashtag_tweets,2016-01-24 23:53:05,tweets,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,989,hashtag_tweets,2016-01-08 17:18:59,tweets,[],0x2de201,"""Trust is not the same as faith. A friend is s..."


In [5]:
# Combine 'data_identification' and 'tweets_DM' on their shared 'tweet_id'.
# A left join with the identification table on the left keeps every identified tweet.
df = id.merge(df, how="left", on="tweet_id")
df.head()

Unnamed: 0,tweet_id,identification,_score,_index,_crawldate,_type,hashtags,text
0,0x28cc61,test,107,hashtag_tweets,2017-01-17 14:13:32,tweets,[],@Habbo I've seen two separate colours of the e...
1,0x29e452,train,809,hashtag_tweets,2015-01-17 03:07:03,tweets,[],Huge Respect🖒 @JohnnyVegasReal talking about l...
2,0x2b3819,train,808,hashtag_tweets,2016-07-02 09:34:06,tweets,"[spateradio, app]",Yoooo we hit all our monthly goals with the ne...
3,0x2db41f,test,728,hashtag_tweets,2015-10-17 06:46:20,tweets,[],@FoxNews @KellyannePolls No serious self respe...
4,0x2a2acc,train,16,hashtag_tweets,2016-08-15 18:18:39,tweets,[],@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...


In [6]:
# Partition the merged frame by its 'identification' flag
df_train = df.loc[df['identification'].eq("train")]
df_test = df.loc[df['identification'].eq("test")]

# Report the size of each partition (train first, then test)
for partition in (df_train, df_test):
    print(partition.shape)

(1455563, 8)
(411972, 8)


In [7]:
# Write the test tweets (id and text only) to the CSV used for submission
df_test = df_test.loc[:, ['tweet_id', 'text']]
df_test.to_csv(path_or_buf='submission.csv', index=False)

In [8]:
# Load the emotion label for each training tweet
emotion_csv = 'emotion.csv'
emotion = pd.read_csv(emotion_csv)  # comma is read_csv's default separator

print(emotion.shape)
emotion.head()

(1455563, 2)


Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation


In [9]:
# Attach the 'emotion' label to each training tweet via the shared 'tweet_id'.
# Left join: every training row is kept even if it has no matching label.
df_train = df_train.merge(emotion, how="left", on="tweet_id")

print(df_train.shape)
df_train.head()

(1455563, 9)


Unnamed: 0,tweet_id,identification,_score,_index,_crawldate,_type,hashtags,text,emotion
0,0x29e452,train,809,hashtag_tweets,2015-01-17 03:07:03,tweets,[],Huge Respect🖒 @JohnnyVegasReal talking about l...,joy
1,0x2b3819,train,808,hashtag_tweets,2016-07-02 09:34:06,tweets,"[spateradio, app]",Yoooo we hit all our monthly goals with the ne...,joy
2,0x2a2acc,train,16,hashtag_tweets,2016-08-15 18:18:39,tweets,[],@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...,trust
3,0x2a8830,train,768,hashtag_tweets,2017-02-11 08:49:46,tweets,"[PUBG, GamersUnite, twitch, BeHealthy, StayPos...",Come join @ambushman27 on #PUBG while he striv...,joy
4,0x20b21d,train,70,hashtag_tweets,2016-11-23 05:37:10,tweets,"[strength, bones, God]",@fanshixieen2014 Blessings!My #strength little...,anticipation


In [10]:
# Turn the emotion names into integer labels (LabelEncoder numbers the classes in sorted order)
label_encoder = LabelEncoder()
df_train['emotion_enc'] = label_encoder.fit_transform(df_train['emotion'])

# Summary table: one row per (emotion, label) pair plus how many training tweets carry that label
rows_per_label = df_train['emotion_enc'].value_counts()
df_summary = df_train[['emotion', 'emotion_enc']].copy()
df_summary['count'] = df_summary['emotion_enc'].map(rows_per_label)

# Keep the first occurrence of each pair, so the index shows where each emotion first appears
df_summary = df_summary.drop_duplicates(keep='first')

df_summary


Unnamed: 0,emotion,emotion_enc,count
0,joy,4,516017
2,trust,7,205478
4,anticipation,1,248935
8,sadness,5,193437
13,disgust,2,139101
45,fear,3,63999
58,surprise,6,48729
68,anger,0,39867


In [15]:
# Keep only the columns needed to train the model
selected_columns = ['text', 'emotion', 'emotion_enc']
df_selected = df_train.loc[:, selected_columns]

# Sample my data since the dataset is huge (plain random sample, currently disabled)
# data_df = df_selected.sample(n=50000,random_state=42) #random state
# display(data_df)

In [12]:
# Build `data_df`, the class-balanced sample that the train/validation/test split below consumes.
# This cell must stay active: with it commented out, `data_df` is undefined on a fresh kernel.

# Upper bound on the number of rows drawn per emotion label
samples_per_category = 10000


def sample_from_category(group):
    """Return up to `samples_per_category` rows of `group`, drawn with a fixed seed.

    Groups smaller than the cap are returned in full (in shuffled order).
    """
    return group.sample(min(samples_per_category, len(group)), random_state=42)


# Sample each label's rows separately and stack the pieces in label order.
# Iterating the groups (rather than groupby.apply) keeps the 'emotion_enc' column in the
# result regardless of how the installed pandas treats grouping columns inside apply.
data_df = pd.concat(
    [sample_from_category(group) for _, group in df_selected.groupby('emotion_enc')]
)
display(data_df)

Unnamed: 0,text,emotion,emotion_enc
1294668,I'm in that wierd moon that's halfway between ...,anger,0
404235,Overly <LH>,anger,0
1241364,@Muzzaraw Coolest thing that's happened in the...,anger,0
470944,We just threw it away on 4th down.. & Tony Rom...,anger,0
45918,@HouseofCommons Stop arguing and get on with i...,anger,0
...,...,...,...
1111222,Thank you God for the success of the surgery. ...,trust,7
1105198,@cafecoho I'm so <LH> you do <LH> options but...,trust,7
1371290,It’s always a good thing when the heat comes o...,trust,7
1208007,As the end of the <LH> season approaches remem...,trust,7


In [13]:
# Step 1: hold out 10% of the sampled data; the remaining 90% becomes the training set
train_df, temp_df = train_test_split(data_df, test_size=0.1, random_state=42)

# Step 2: cut the held-out part in half to obtain the validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report the size of each split (train, validation, test)
for split in (train_df, val_df, test_df):
    print(split.shape)


(72000, 3)
(4000, 3)
(4000, 3)


In [14]:
# Write the three splits as CSV files (without the index column) into ./datasets.
# to_csv does not create missing parent directories, so make sure the folder exists first;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("./datasets", exist_ok=True)
train_df.to_csv("./datasets/training_set_20231217.csv",index=False)
val_df.to_csv("./datasets/validation_set_20231217.csv",index=False)
test_df.to_csv("./datasets/testing_set_20231217.csv",index=False)