# Classical Machine Learning on Dataset Using Count Vectorizer

## Libraries Used

In [67]:
# Data engineering libraries
import numpy as np
import pandas as pd

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Spacy libraries
import en_core_web_sm
import spacy
from spacy.lang.en import English

## Loading Data

In [68]:
df = pd.read_csv("general_dataset.csv")
df

Unnamed: 0,Title,Note,Color,Source
0,Delta CEO forecasting highest Q2 earnings ever...,,G,https://www.cnbc.com/2023/06/27/delta-ceo-says...
1,Wise shares spike 16% as higher interest rates...,Wise said in a statement to the stock market t...,G,https://www.cnbc.com/2023/06/27/wise-shares-su...
2,Malaysia’s sovereign wealth fund seeks greater...,"Rising rates will squeeze corporates, particul...",R,https://www.cnbc.com/2023/06/27/malaysias-sove...
3,Uber stock hits highest level since 2022. Here...,,G,https://www.cnbc.com/2023/06/26/uber-stock-hit...
4,Publishers Clearing House to refund customers ...,Publishers Clearing House agreed to settle a l...,R,https://www.cnbc.com/2023/06/26/publishers-cle...
...,...,...,...,...
67,San Francisco Bay Area grocery store takes dra...,Safeway in Vallejo joins other locations in Sa...,R,https://www.foxbusiness.com/retail/san-francis...
68,Fintech firm Klarna halves net loss in first q...,Klarna posted a net loss of 1.3 billion Swedis...,R,https://www.cnbc.com/2023/05/26/klarna-q1-resu...
69,JPMorgan is developing a ChatGPT-like A.I. ser...,JPMorgan Chase is developing a ChatGPT-like so...,G,https://www.cnbc.com/2023/05/25/jpmorgan-devel...
70,Boss of Goldman-backed digital bank Starling t...,"Anne Boden, CEO and co-founder of British digi...",Y,https://www.cnbc.com/2023/05/25/boss-of-goldma...


In [69]:
df_model = df[["Note","Color"]].dropna()

## Spacy feature extraction

In [70]:
nlp = spacy.load('en_core_web_sm')

### Verbs

In [71]:
verbs = []

text = ""

for word in df_model["Note"]:
    text = word
    doc = nlp(text)
    verbs.append([token.lemma_ for token in doc if token.pos_ == "VERB"])

temp_verbs = []
for _list in verbs:
    temp_text = ""
    for item in _list:
        temp_text += item
        temp_text += " "
    
    temp_verbs.append(temp_text)

df_model["Verbs"] = temp_verbs
    

### Nouns

In [72]:
nouns = []

text = ""

for word in df_model["Note"]:
    text = word
    doc = nlp(text)
    nouns.append([chunk.text for chunk in doc.noun_chunks])

temp_nouns = []

for _list in nouns:
    temp_text = ""
    for item in _list:
        temp_text += item
        temp_text += " "
    temp_nouns.append(temp_text)

df_model["Nouns"] = temp_nouns

### Adjectives

In [73]:
adjs = []

text = ""

for word in df_model["Note"]:
    text = word
    doc = nlp(text)
    adjs.append([token.lemma_ for token in doc if token.pos_ == "ADJ"])

temp_adjs = []
for _list in adjs:
    temp_text = ""
    for item in _list:
        temp_text += item
        temp_text += " "
    
    temp_adjs.append(temp_text)

df_model["Adjectives"] = temp_adjs

## Modelling

### Verbs

#### Train-Test Split

In [74]:
x = df_model["Verbs"].to_numpy()
y = df_model["Color"].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25)

#### Model

In [75]:
clf_verbs = LogisticRegression()
vec_verbs = CountVectorizer(stop_words='english', ngram_range=(1,3))

pipe_verbs = Pipeline([('vectorizer',vec_verbs),('clf',clf_verbs)])

pipe_verbs.fit(x_train, y_train)

#### Accuracy

In [76]:
preds = pipe_verbs.predict(x_test)
print("accuracy: ", accuracy_score(y_test, preds))

accuracy:  0.4117647058823529


### Nouns

#### Train-Test Split

In [77]:
x = df_model["Nouns"].to_numpy()
y = df_model["Color"].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25)

#### Model

In [78]:
clf_nouns = LogisticRegression()
vec_nouns = CountVectorizer(stop_words='english', ngram_range=(1,3))

pipe_nouns = Pipeline([('vectorizer',vec_nouns),('clf',clf_verbs)])

pipe_nouns.fit(x_train, y_train)

#### Accuracy

In [79]:
preds = pipe_nouns.predict(x_test)
print("accuracy: ", accuracy_score(y_test, preds))

accuracy:  0.47058823529411764


### Adjectives

#### Train-Test Split

In [80]:
x = df_model["Adjectives"].to_numpy()
y = df_model["Color"].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25)

#### Model

In [81]:
clf_adj = LogisticRegression()
vec_adj = CountVectorizer(stop_words='english', ngram_range=(1,3))

pipe_adj = Pipeline([('vectorizer',vec_adj),('clf',clf_verbs)])

pipe_adj.fit(x_train, y_train)

#### Accuracy

In [82]:
preds = pipe_adj.predict(x_test)
print("accuracy: ", accuracy_score(y_test, preds))

accuracy:  0.5294117647058824


## Observations
adjectives are the best, then nouns, then verbs.