# Building a Custom Classifier

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Toy sentiment corpus: ten positive sentences followed by ten negative ones.
# Stored column-wise, so each text lines up with its label by position.
data = pd.DataFrame({
    'text': [
        "i love spending time with my friends and family",
        "that was the best meal i've ever had in my life",
        "i feel so grateful for everything i have in my life",
        "i received a promotion at work and i couldn't be happier",
        "watching a beautiful sunset always fills me with joy",
        "my partner surprised me with a thoughtful gift and it made my day",
        "i am so proud of my daughter for graduating with honors",
        "listening to my favorite music always puts me in a good mood",
        "i love the feeling of accomplishment after completing a challenging task",
        "i am excited to go on vacation next week",
        "i feel so overwhelmed with work and responsibilities",
        "the traffic during my commute is always so frustrating",
        "i received a parking ticket and it ruined my day",
        "i got into an argument with my partner and we're not speaking",
        "i have a headache and i feel terrible",
        "i received a rejection letter for the job i really wanted",
        "my car broke down and it's going to be expensive to fix",
        "i'm feeling sad because i miss my friends who live far away",
        "i'm frustrated because i can't seem to make progress on my project",
        "i'm disappointed because my team lost the game",
    ],
    'sentiment': ["positive"] * 10 + ["negative"] * 10,
})

In [12]:
# Shuffle the rows so the positive and negative examples are no longer grouped together.
# sample(frac=1) returns 100% of the rows in random order; no rows are dropped.
# random_state pins the permutation: without it every kernel restart produces a
# different row order, so the seeded train/test split further down would still
# select different sentences on each run.
# reset_index(drop=True) renumbers the rows 0..n-1 after shuffling and discards the
# old index (drop=True keeps it from being added back as a new column).
data = data.sample(frac=1, random_state=7).reset_index(drop=True)



In [13]:
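# Oversampling variant (intentionally left disabled): frac=2 draws twice as many rows as
# the frame holds, which only works with replace=True.
# data = data.sample(frac=2, replace=True).reset_index(drop=True)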

In [14]:
# Confirm that shuffling kept every row and both columns.
data.shape

(20, 2)

In [None]:
# If you set frac=2 in data.sample(frac=2), pandas attempts to return 200% of the original dataset,
# i.e. to randomly sample twice as many rows as data contains.

# Without replace=True this raises a ValueError.

# The error occurs because when frac > 1, pandas requires replace=True to allow duplicate sampling. Without it,
# pandas cannot sample more rows than exist in the dataset.

In [None]:
# Shuffling with data.sample(frac=1) is commonly used in machine learning to randomize the row order
# before splitting the data into training and testing sets.

In [15]:
# Preview the first five rows of the shuffled frame.
data.head()

Unnamed: 0,text,sentiment
0,i feel so grateful for everything i have in my...,positive
1,watching a beautiful sunset always fills me wi...,positive
2,the traffic during my commute is always so fru...,negative
3,i am excited to go on vacation next week,positive
4,i love spending time with my friends and family,positive


In [16]:
# Features are the raw sentences; the target is the sentiment label.
X, y = data['text'], data['sentiment']

In [17]:
# Bag-of-words features: CountVectorizer learns the vocabulary from X and counts
# how often each token occurs in every sentence.
# NOTE(review): the vocabulary is fitted on all of X, i.e. before the train/test
# split below, so words that only occur in held-out sentences still get columns.
countvec = CountVectorizer()
countvec_fit = countvec.fit_transform(X)
bag_of_words = pd.DataFrame(
    data=countvec_fit.toarray(),  # densify the sparse count matrix
    columns=countvec.get_feature_names_out(),  # one column per vocabulary token
)

In [18]:
# Inspect the count matrix: one row per sentence, one column per vocabulary token.
bag_of_words

Unnamed: 0,accomplishment,after,always,am,an,and,argument,at,away,be,...,vacation,ve,wanted,was,watching,we,week,who,with,work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Hold out 30% of the rows for testing; the fixed seed makes the partition
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words,
    y,
    test_size=0.3,
    random_state=7,
)

In [20]:
# Training features: (number of training sentences, vocabulary size).
X_train.shape

(14, 118)

In [24]:
# Held-out features: (number of test sentences, vocabulary size).
X_test.shape

(6, 118)

In [21]:
# Training labels: one entry per training sentence.
y_train.shape

(14,)

## Logistic Regression

In [22]:
# Build the classifier first, then train it; fit() hands back the fitted estimator.
lr = LogisticRegression(random_state=1)
lr = lr.fit(X_train, y_train)

In [23]:
# Predict sentiment labels for the held-out rows.
y_pred_lr = lr.predict(X_test)

In [25]:
# accuracy_score expects (y_true, y_pred): ground truth goes first, matching the
# argument order of the classification_report call for this model.
accuracy_score(y_test, y_pred_lr)

0.3333333333333333

In [26]:
# Predicted labels, in the same order as the rows of X_test.
y_pred_lr

array(['positive', 'positive', 'negative', 'negative', 'negative',
       'negative'], dtype=object)

In [27]:
# True labels of the held-out rows, to compare against the predictions.
y_test

1     positive
17    negative
2     negative
5     positive
11    positive
0     positive
Name: sentiment, dtype: object

In [28]:
# Per-class precision / recall / F1 for logistic regression. zero_division=0 reports 0
# (without a warning) for any metric whose denominator is zero.
print(classification_report(y_test, y_pred_lr, zero_division=0))

              precision    recall  f1-score   support

    negative       0.25      0.50      0.33         2
    positive       0.50      0.25      0.33         4

    accuracy                           0.33         6
   macro avg       0.38      0.38      0.33         6
weighted avg       0.42      0.33      0.33         6



## Naive Bayes

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
# Build the multinomial Naive Bayes model, then train it on the token counts.
nb = MultinomialNB()
nb = nb.fit(X_train, y_train)

In [31]:
# Predict sentiment labels for the held-out rows with Naive Bayes.
y_pred_nb = nb.predict(X_test)

In [32]:
# accuracy_score expects (y_true, y_pred): ground truth goes first, matching the
# argument order of the classification_report call for this model.
accuracy_score(y_test, y_pred_nb)

0.6666666666666666

In [33]:
# Per-class precision / recall / F1 for Naive Bayes. zero_division=0 reports 0
# (without a warning) for any metric whose denominator is zero.
print(classification_report(y_test, y_pred_nb, zero_division=0))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
    positive       0.67      1.00      0.80         4

    accuracy                           0.67         6
   macro avg       0.33      0.50      0.40         6
weighted avg       0.44      0.67      0.53         6



## Linear Support Vector Machine

In [34]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [35]:
# SGDClassifier with hinge loss (its default, spelled out here) trains a linear SVM.
# SGD shuffles the training data between epochs, so random_state is pinned to make
# the fitted model, and the metrics computed from it, repeatable across runs.
# Other hyper-parameters worth tuning: loss function, regularization (penalty, alpha).
svm = SGDClassifier(loss="hinge", random_state=1).fit(X_train, y_train)

In [36]:
# Predict sentiment labels for the held-out rows with the SGD-trained classifier.
y_pred_svm = svm.predict(X_test)

In [37]:
# accuracy_score expects (y_true, y_pred): ground truth goes first, matching the
# argument order of the classification_report call for this model.
accuracy_score(y_test, y_pred_svm)

0.3333333333333333

In [38]:
# Per-class precision / recall / F1 for the SGD classifier. zero_division=0 reports 0
# (without a warning) for any metric whose denominator is zero.
print(classification_report(y_test, y_pred_svm, zero_division=0))

              precision    recall  f1-score   support

    negative       0.25      0.50      0.33         2
    positive       0.50      0.25      0.33         4

    accuracy                           0.33         6
   macro avg       0.38      0.38      0.33         6
weighted avg       0.42      0.33      0.33         6

