In [20]:
import pandas as pd
import json

# Load the raw posts JSON and flatten it into a two-column DataFrame (id, text).
# The encoding is passed explicitly: the posts contain non-ASCII characters
# (emoji, curly quotes), and open() would otherwise fall back to the platform's
# locale encoding, which is not UTF-8 everywhere.
with open("./dm-lab-2-private-competition/final_posts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Each entry nests the post under root -> _source -> post. Keep only the
# identifier (stored as "id" so it can be joined with the other tables) and
# the post text.
posts = []
for entry in data:
    post = entry["root"]["_source"]["post"]
    posts.append({
        "id": post["post_id"],
        "text": post["text"],
    })

# Convert the list of records to a dataframe in one step
posts_df = pd.DataFrame(posts)

In [38]:
# Build the train and test frames: read the split table and the labels,
# carve out each split, and attach the post text to every row.
df = pd.read_csv("./dm-lab-2-private-competition/data_identification.csv")
y = pd.read_csv("./dm-lab-2-private-competition/emotion.csv")

# Boolean masks selecting the rows that belong to each split
is_train = df["split"].eq("train")
is_test = df["split"].eq("test")

# Left-join on "id" so every row of the split is kept and gains a "text" column
train_df = df.loc[is_train].reset_index(drop=True).merge(posts_df, on="id", how="left")
test_df = df.loc[is_test].reset_index(drop=True).merge(posts_df, on="id", how="left")

print("Train/Validation size:", train_df.shape)
print("Test size:", test_df.shape)
for frame in (train_df, test_df):
    display(frame.head(5))

Train/Validation size: (47890, 3)
Test size: (16281, 3)


Unnamed: 0,id,split,text
0,0x35663e,train,I bet there is an army of married couples who ...
1,0xc78afe,train,This could only end badly.
2,0x90089c,train,My sister squeezed a lime in her milk when she...
3,0x2ffb63,train,Thank you so much❤️
4,0x989146,train,Stinks because ive been in this program for a ...


Unnamed: 0,id,split,text
0,0x61fc95,test,"We got the ranch, loaded our guns and sat up t..."
1,0xaba820,test,and that got my head bobbing a little bit.
2,0x66e44d,test,Same. Glad it's not just out store.
3,0xc03cf5,test,Like always i will wait and see thanks for the...
4,0x02f65a,test,"There's a bit of room between ""not loving sub-..."


In [39]:
# Align the labels with train_df by post id rather than relying on
# emotion.csv and data_identification.csv listing the training posts in the
# same order. Afterwards the "id" column is dropped because it shouldn't be
# used for training.
# The guard keeps this cell safe to re-run once "id" has already been removed.
if "id" in y.columns:
    y = train_df[["id"]].merge(y, on="id", how="left").drop(columns=["id"])

# Every training post must carry exactly one label, in train_df row order;
# otherwise the positional pairing done by the later split would be wrong.
assert len(y) == len(train_df), "labels and training posts differ in length"
assert y["emotion"].notna().all(), "some training posts have no label"

display(y.head(5))

Unnamed: 0,emotion
0,joy
1,fear
2,joy
3,joy
4,joy


In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled posts for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_df["text"],
    y,
    test_size=0.2,
    random_state=0,
)

# Preview the first rows of each piece of the split
for part in (X_train, y_train, X_val, y_val):
    display(part.head(5))

15557    Wow, that really hurts coming from a demented ...
42412    one's a rapist, and the other's a stingy yank ...
47299     They had courts. You sue people who wronged you.
5286           It also happened to be raining on that day.
42066    Isn’t it careless for an official account to p...
Name: text, dtype: object

Unnamed: 0,emotion
15557,anger
42412,joy
47299,joy
5286,anger
42066,anger


6784     Enjoy spending yours oblivious and phony, slow...
44679    now if only there was some sort of *shock them...
19459    You could not see a foot in front of you and I...
47018    More like thinks it's a pipe dream that isn't ...
11631    Levi and I arrived in an eye-capturing city th...
Name: text, dtype: object

Unnamed: 0,emotion
6784,surprise
44679,surprise
19459,fear
47018,disgust
11631,joy


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score


# Baseline text classifier: TF-IDF features fed into a logistic regression.
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=200))
])

# y_train is a single-column frame; flatten it to a 1-d array so the
# estimator receives labels in the shape it expects and does not emit the
# column_or_1d conversion warning.
model.fit(X_train, y_train.to_numpy().ravel())

y_val_pred = model.predict(X_val)

# Score on the held-out validation split. The weighted F1 averages per-class
# F1 by class support.
val_acc = accuracy_score(y_val, y_val_pred)
val_f1w = f1_score(y_val, y_val_pred, average='weighted')

print(f"[Base - Validation] Acc: {val_acc:.4f} | F1-weighted: {val_f1w:.4f}")

  y = column_or_1d(y, warn=True)


[Base - Validation] Acc: 0.6179 | F1-weighted: 0.5805


In [46]:
# Predict an emotion for every test post with the fitted pipeline
y_test_pred = model.predict(test_df["text"])

# Submission table: the original post ids next to the predicted emotions
submission_df = test_df[["id"]].assign(emotion=y_test_pred)

# Write the file without the dataframe index
submission_df.to_csv("submission.csv", index=False)

print("submission.csv created successfully!")

submission.csv created successfully!
