In [1]:
#import libraries 

# Run this cell without changes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, silhouette_score



In [2]:
# Load the raw tweet dataset from disk; the file is decoded as latin1.
csv_path = "../data/judge-1377884607_tweet_product_company.csv"
df = pd.read_csv(csv_path, encoding="latin1")
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
# Rename the verbose source headers to short, code-friendly names.
# An explicit old -> new mapping is used instead of positional assignment so
# that a change in the CSV's column order cannot silently mislabel the data.
column_renames = {
    'tweet_text': 'tweet',
    'emotion_in_tweet_is_directed_at': 'product',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment',
}
df = df.rename(columns=column_renames)

# Fail loudly if the file did not contain exactly the expected columns.
# (Re-running this cell is safe: already-renamed columns pass the check.)
expected_columns = ['tweet', 'product', 'sentiment']
assert sorted(df.columns) == sorted(expected_columns), (
    f"Unexpected columns after rename: {list(df.columns)}"
)
df.head()


Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Summarize the frame: entry count, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      9092 non-null   object
 1   product    3291 non-null   object
 2   sentiment  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
# Count the missing entries in each column
df.isna().sum()

tweet           1
product      5802
sentiment       0
dtype: int64

In [6]:
# Handle missing values in a single chained step:
#   1. Drop the row with missing tweet text (can't analyze what doesn't exist).
#   2. Fill missing products with 'Unknown' rather than dropping them; these
#      tweets still carry a sentiment label and remain useful for analysis.
# assign() builds a new frame, so the fill is not written into the filtered
# result of dropna() (which is what triggers pandas' SettingWithCopyWarning).
df = (
    df.dropna(subset=['tweet'])
      .assign(product=lambda frame: frame['product'].fillna('Unknown'))
)

In [7]:
# Keep only the first occurrence of each fully identical row
is_repeated_row = df.duplicated(keep='first')
df = df[~is_repeated_row]

In [8]:
# Tabulate how often each raw sentiment label occurs, as a count and as a share (%)
raw_sentiment = df['sentiment']
sentiment_counts = raw_sentiment.value_counts()
sentiment_percentages = raw_sentiment.value_counts(normalize=True).mul(100)

# Percentages are rounded only for presentation in the summary table
summary_columns = {
    'Count': sentiment_counts,
    'Percentage': sentiment_percentages.round(2),
}
sentiment_summary = pd.DataFrame(summary_columns)
print(sentiment_summary)

                                    Count  Percentage
sentiment                                            
No emotion toward brand or product   5375       59.26
Positive emotion                     2970       32.75
Negative emotion                      569        6.27
I can't tell                          156        1.72


In [9]:
# Prepare the target for modeling: collapse the four raw labels into three classes.
# Note that "I can't tell" is folded into 'Neutral' together with the
# no-emotion label.
sentiment_mapping = {
    'Positive emotion': 'Positive',
    'Negative emotion': 'Negative',
    'No emotion toward brand or product': 'Neutral',
    "I can't tell": 'Neutral'
}

# assign() returns a new frame, so the new column is not written into a
# filtered slice left over from the earlier dropna / drop_duplicates steps.
df = df.assign(sentiment_clean=df['sentiment'].map(sentiment_mapping))

# Verify the mapping.
# .map() yields NaN for any label missing from the dict, so fail loudly
# instead of letting unlabeled rows slip into modeling.
unmapped_labels = df.loc[df['sentiment_clean'].isna(), 'sentiment'].unique()
assert len(unmapped_labels) == 0, f"Unmapped sentiment labels: {list(unmapped_labels)}"

# Show the class balance after mapping (counts and rounded percentages).
sentiment_counts = df['sentiment_clean'].value_counts()
sentiment_percentages = df['sentiment_clean'].value_counts(normalize=True).mul(100).round(2)
print(pd.DataFrame({'Count': sentiment_counts, 'Percentage': sentiment_percentages}))
df.head()

Unnamed: 0,tweet,product,sentiment,sentiment_clean
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Positive
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Positive
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Positive


In [11]:
# Exploratory data analysis: add simple tweet characteristics
#   tweet_length - number of characters in the tweet
#   word_count   - number of whitespace-separated tokens in the tweet

# Cast once and reuse; the vectorized .str accessor replaces per-row apply calls.
tweet_text = df['tweet'].astype(str)

# assign() returns a new frame rather than writing into a possibly filtered slice.
df = df.assign(
    tweet_length=tweet_text.str.len(),
    word_count=tweet_text.str.split().str.len(),
)
df.head()

Unnamed: 0,tweet,product,sentiment,sentiment_clean,tweet_length,word_count
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Negative,127,23
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Positive,139,22
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Positive,79,15
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Negative,82,15
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Positive,131,17


In [None]:
# Compare mean and median tweet length / word count across the cleaned sentiment classes
length_by_sentiment = df.groupby('sentiment_clean')[['tweet_length', 'word_count']].agg(['mean', 'median'])
print("Tweet Characteristics by Sentiment:")
# Bare last expression so the table is actually rendered under the heading
length_by_sentiment

In [None]:
#product analysis