In [1]:
import pandas as pd

# Load the datasets
training_data_path = 'training.csv'
validation_data_path = 'validation.csv'
test_data_path = 'test.csv'

# Read the CSV files
training_data = pd.read_csv(training_data_path)
validation_data = pd.read_csv(validation_data_path)
test_data = pd.read_csv(test_data_path)

# Display the first few rows of each dataset to understand the structure
(training_data.head(), validation_data.head(), test_data.head())

(                                                text  label
 0                            i didnt feel humiliated      0
 1  i can go from feeling so hopeless to so damned...      0
 2   im grabbing a minute to post i feel greedy wrong      3
 3  i am ever feeling nostalgic about the fireplac...      2
 4                               i am feeling grouchy      3,
                                                 text  label
 0  im feeling quite sad and sorry for myself but ...      0
 1  i feel like i am still looking at a blank canv...      0
 2                     i feel like a faithful servant      2
 3                  i am just feeling cranky and blue      3
 4  i can have for a treat or if i am feeling festive      1,
                                                 text  label
 0  im feeling rather rotten so im not very ambiti...      0
 1          im updating my blog because i feel shitty      0
 2  i never make her separate from me because i do...      0
 3  i left with my bou

In [2]:
training_data["label"].value_counts()

label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

In [3]:
training_data

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [4]:
validation_data

Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,0
1,i feel like i am still looking at a blank canv...,0
2,i feel like a faithful servant,2
3,i am just feeling cranky and blue,3
4,i can have for a treat or if i am feeling festive,1
...,...,...
1995,im having ssa examination tomorrow in the morn...,0
1996,i constantly worry about their fight against n...,1
1997,i feel its important to share this info for th...,1
1998,i truly feel that if you are passionate enough...,1


In [5]:
test_data

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [6]:
training_data = training_data.drop_duplicates()
validation_data = validation_data.drop_duplicates()
test_data = test_data.drop_duplicates()

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import scipy.sparse as sp
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import os

lemmatizer = WordNetLemmatizer()

# Define the preprocessing function including stop words removal
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Lemmatization and handling negations
    prev_word = ""
    processed_tokens = []
    for word in tokens:
        if word in ENGLISH_STOP_WORDS:
            continue
        if word == "not":
            prev_word = "not_"
        else:
            if prev_word == "not_":
                word = prev_word + word
                prev_word = ""
            word = lemmatizer.lemmatize(word)
            # Remove punctuation and numbers
            word = re.sub(r'[^\w\s]', '', word)
            word = re.sub(r'\d+', '', word)
            processed_tokens.append(word)
    return ' '.join(processed_tokens)

# Load the datasets again
training_data = pd.read_csv('training.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

# Apply the preprocessing to the text data
training_data['text'] = training_data['text'].apply(preprocess_text)
validation_data['text'] = validation_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)

# Initialize TF-IDF Vectorizer without max_features to keep all words
# Configure the TF-IDF vectorizer to include bi-grams and tri-grams and to ignore rare words that appear in less than two documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=2)

# Fit the vectorizer on the training text data and transform all datasets
tfidf_vectorizer.fit(training_data['text'])
training_data_tfidf = tfidf_vectorizer.transform(training_data['text'])
validation_data_tfidf = tfidf_vectorizer.transform(validation_data['text'])
test_data_tfidf = tfidf_vectorizer.transform(test_data['text'])

# Save the TF-IDF data as .npz files since they are in sparse format
preprocessed_data_dir = 'Preprocessed Data/'
os.makedirs(preprocessed_data_dir, exist_ok=True)

# Define file paths for the TF-IDF data
training_data_tfidf_file = os.path.join(preprocessed_data_dir, 'training_tfidf.npz')
validation_data_tfidf_file = os.path.join(preprocessed_data_dir, 'validation_tfidf.npz')
test_data_tfidf_file = os.path.join(preprocessed_data_dir, 'test_tfidf.npz')

# Save the TF-IDF data
sp.save_npz(training_data_tfidf_file, training_data_tfidf)
sp.save_npz(validation_data_tfidf_file, validation_data_tfidf)
sp.save_npz(test_data_tfidf_file, test_data_tfidf)

# Return the file paths for confirmation
(training_data_tfidf_file, validation_data_tfidf_file, test_data_tfidf_file)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annarjun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/annarjun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/annarjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


('Preprocessed Data/training_tfidf.npz',
 'Preprocessed Data/validation_tfidf.npz',
 'Preprocessed Data/test_tfidf.npz')

#### SVC Baseline Model

In [8]:
import numpy as np
import os
import scipy.sparse as sp
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# File paths for TF-IDF data
training_data_tfidf_file = 'training_tfidf.npz'
validation_data_tfidf_file = 'validation_tfidf.npz'
test_data_tfidf_file = 'test_tfidf.npz'

# Load the original data with labels
original_training_data = pd.read_csv('training.csv')  
original_validation_data = pd.read_csv('validation.csv') 
original_test_data = pd.read_csv('test.csv')  

# Extract labels from the original data
training_labels = original_training_data['label']
validation_labels = original_validation_data['label']
test_labels = original_test_data['label']

# Load the TF-IDF data
training_data_tfidf = sp.load_npz(training_data_tfidf_file)
validation_data_tfidf = sp.load_npz(validation_data_tfidf_file)
test_data_tfidf = sp.load_npz(test_data_tfidf_file)

# Initialize the Support Vector classifier
svc = SVC()

# Train the classifier
svc.fit(training_data_tfidf, training_labels)

# Predict on validation and test data
validation_predictions = svc.predict(validation_data_tfidf)
test_predictions = svc.predict(test_data_tfidf)

# Evaluate the classifier
print("Validation Set Performance:")
print(classification_report(validation_labels, validation_predictions))
print("Accuracy:", accuracy_score(validation_labels, validation_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(validation_labels, validation_predictions))

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.87      0.94      0.91       550
           1       0.86      0.97      0.91       704
           2       0.92      0.68      0.78       178
           3       0.94      0.83      0.88       275
           4       0.86      0.78      0.82       212
           5       0.96      0.62      0.75        81

    accuracy                           0.88      2000
   macro avg       0.90      0.80      0.84      2000
weighted avg       0.89      0.88      0.88      2000

Accuracy: 0.8815

Confusion Matrix:
[[518  17   2   5   7   1]
 [ 13 680   7   1   3   0]
 [ 10  45 121   2   0   0]
 [ 24  17   1 228   5   0]
 [ 22  17   0   6 166   1]
 [  7  11   0   1  12  50]]


#### SVC Hyperparameter Tuning using GridSearchCV

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load your dataset
original_training_data = pd.read_csv('training.csv')  
original_validation_data = pd.read_csv('validation.csv') 
original_test_data = pd.read_csv('test.csv')  

# Assuming your dataset has 'text' column for the input text and 'label' column for the labels
text_data = original_training_data['text']
labels = original_training_data['label']

# Split the dataset into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(text_data, labels, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  
train_tfidf = vectorizer.fit_transform(train_texts)
val_tfidf = vectorizer.transform(val_texts)
test_tfidf = vectorizer.transform(test_texts)

# Convert TF-IDF matrices to pandas DataFrames
train_tfidf_df = pd.DataFrame(train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
val_tfidf_df = pd.DataFrame(val_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Display the resulting DataFrames
print("Train TF-IDF DataFrame")
display(train_tfidf_df)

print("\nValidation TF-IDF DataFrame")
display(val_tfidf_df)

# Display the Test TF-IDF DataFrame
print("\nTest TF-IDF DataFrame")
display(test_tfidf_df)

# Initialize the Support Vector classifier
svc = SVC()

# Define the parameter grid to search
param_grid = {
    'C' : [1, 10, 100],
    'kernel' : ['linear','rbf']
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(train_tfidf, train_labels)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameters:")
print(best_params)

# Train the classifier with the best hyperparameters
best_svc = SVC(C = best_params['C'], kernel=best_params['kernel'])
best_svc.fit(train_tfidf, train_labels)

# Predict on validation data
val_predictions = best_svc.predict(val_tfidf)

# Evaluate the classifier on the validation set
print("\nValidation Set Performance")
print(classification_report(val_labels, val_predictions))
print("Accuracy:", accuracy_score(val_labels, val_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(val_labels, val_predictions))

# Predict on test data
test_predictions = best_svc.predict(test_tfidf)

# Evaluate the classifier on the test set
print("\nTest Set Performance")
print(classification_report(test_labels, test_predictions))
print("Accuracy:", accuracy_score(test_labels, test_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels, test_predictions))

Train TF-IDF DataFrame


Unnamed: 0,aa,abandon,abandoned,abc,abdomen,abilities,ability,abit,able,about,...,yours,yourself,youth,youtube,youve,zealand,zero,zombie,zone,zumba
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Validation TF-IDF DataFrame


Unnamed: 0,aa,abandon,abandoned,abc,abdomen,abilities,ability,abit,able,about,...,yours,yourself,youth,youtube,youve,zealand,zero,zombie,zone,zumba
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Test TF-IDF DataFrame


Unnamed: 0,aa,abandon,abandoned,abc,abdomen,abilities,ability,abit,able,about,...,yours,yourself,youth,youtube,youve,zealand,zero,zombie,zone,zumba
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Best Hyperparameters:
{'C': 1, 'kernel': 'linear'}

Validation Set Performance
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       458
           1       0.87      0.94      0.90       528
           2       0.85      0.66      0.75       140
           3       0.88      0.88      0.88       221
           4       0.84      0.81      0.83       203
           5       0.82      0.66      0.73        50

    accuracy                           0.88      1600
   macro avg       0.87      0.81      0.84      1600
weighted avg       0.88      0.88      0.88      1600

Accuracy: 0.88

Confusion Matrix:
[[425   7   5  11  10   0]
 [ 14 498   8   3   3   2]
 [  2  42  93   2   1   0]
 [ 12  11   0 194   4   0]
 [  9  12   2  10 165   5]
 [  0   3   1   0  13  33]]

Test Set Performance
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       488
           1       0.86      0.94      0.90      

In [10]:
import joblib
joblib.dump(svc, 'svc_model.joblib')

['svc_model.joblib']