In [1]:
import pandas as pd

# Location of the labelled emotion dataset, relative to the notebook.
csv_path = 'emotion_data.csv'
df = pd.read_csv(csv_path)

# Quick sanity check of what was loaded: a preview, the schema, and the size.
print("First 5 rows:", df.head(), sep="\n")

print()
print("Columns:")
print(list(df.columns))

sample_count = df.shape[0]
print(f"\nNumber of samples: {sample_count}")


First 5 rows:
                        id                                               text  \
0  eng_train_track_a_00001                       Colorado, middle of nowhere.   
1  eng_train_track_a_00002  This involved swimming a pretty large lake tha...   
2  eng_train_track_a_00003        It was one of my most shameful experiences.   
3  eng_train_track_a_00004  After all, I had vegetables coming out my ears...   
4  eng_train_track_a_00005                        Then the screaming started.   

   anger  fear  joy  sadness  surprise  
0      0     1    0        0         1  
1      0     1    0        0         0  
2      0     1    0        1         0  
3      0     0    0        0         0  
4      0     1    0        1         1  

Columns:
['id', 'text', 'anger', 'fear', 'joy', 'sadness', 'surprise']

Number of samples: 2768


In [2]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the corpora used below (stop-word list first, then WordNet for the
# lemmatizer); the order matches the log output recorded for this cell.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared resources read by clean_text(): built once here rather than per call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Normalise a raw sentence into space-separated, lemmatised tokens.

    Pipeline: lowercase -> drop URLs -> turn every character that is not
    a-z or whitespace into a space -> split on whitespace -> remove entries
    of the module-level ``stop_words`` set -> lemmatise with the module-level
    ``lemmatizer``.

    Args:
        text: Raw input. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing and yield ''.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none remain).
    """
    # Missing values must not be stringified: str(None) / str(nan) would
    # inject the bogus tokens 'none' / 'nan' into the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)
    # Replace (rather than delete) non-letters so neighbouring words are not
    # fused together ("well-known" -> "well known", not "wellknown") and
    # contractions split into fragments ("don't" -> "don t") instead of
    # forming a new pseudo-word the stop-word filter cannot match.
    # NOTE(review): assumes the stop-word list contains such fragments
    # ('don', 't', 's', ...) - confirm against the installed NLTK corpus.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # str.split() with no argument already collapses runs of whitespace.
    words = [w for w in text.split() if w not in stop_words]
    return ' '.join([lemmatizer.lemmatize(w) for w in words])

# Add the normalised text next to the raw column so both stay inspectable.
df['clean_text'] = df['text'].apply(clean_text)

# Model inputs: cleaned sentences, and one 0/1 column per emotion
# (multi-label: a row may have several emotions set, or none).
emotion_columns = ['anger', 'fear', 'joy', 'sadness', 'surprise']
texts = df['clean_text']
labels = df[emotion_columns].values

preview = texts.head()
print(preview)


[nltk_data] Downloading package stopwords to C:\Users\BASAVADEEPTHI H
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\BASAVADEEPTHI H
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0                      colorado middle nowhere
1     involved swimming pretty large lake head
2                      one shameful experience
3    vegetable coming ear benefit young prince
4                            screaming started
Name: clean_text, dtype: object


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration, gathered in one place so it is easy to tune.
tfidf_settings = {
    'max_features': 5000,    # cap on vocabulary size
    'ngram_range': (1, 2),   # unigrams and bigrams
    'sublinear_tf': True,    # use 1 + log(tf) instead of raw term counts
    'min_df': 2,             # ignore terms seen in fewer than 2 documents
    'max_df': 0.95,          # ignore terms seen in more than 95% of documents
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and produce the document-term matrix in one pass.
X = vectorizer.fit_transform(df['clean_text'])


In [4]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Base binary classifier. class_weight='balanced' reweights the classes by
# inverse frequency, which matters because each emotion column is mostly 0s.
# random_state is pinned so that re-running the notebook reproduces the same
# fitted model: LinearSVC uses a random number generator to shuffle the data
# when solving the dual problem, so without a seed the saved model can differ
# from run to run.
svm = LinearSVC(class_weight='balanced', random_state=42)

# One-vs-rest wraps the base estimator so that an independent copy is fitted
# for each column of the multi-label target matrix.
clf = OneVsRestClassifier(svm)

clf.fit(X, labels)
print("Training complete!")


Training complete!


In [5]:
import joblib

# Persist the classifier together with the fitted vectorizer: inference needs
# both, since new text must be transformed with the same vocabulary.
artifacts = (
    (clf, 'svm_model.joblib'),
    (vectorizer, 'vectorizer.joblib'),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

print("Model and vectorizer saved!")


Model and vectorizer saved!
