# Classical Machine Learning on Dataset Using Count Vectorizer

## Libraries Used

In [8]:
# Data engineering libraries
import numpy as np
import pandas as pd

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Spacy libraries
import en_core_web_sm
import spacy
from spacy.lang.en import English

## Loading Data

In [9]:
df = pd.read_csv("/Users/rohansingh/Desktop/df_main.csv")

In [10]:
df_model = df[["Title","Color"]]

## Spacy feature extraction

In [14]:
nlp = spacy.load('en_core_web_sm')

### Verbs

In [16]:
verbs = []

text = ""

for word in df_model["Title"]:
    text = word
    doc = nlp(text)
    verbs.append([token.lemma_ for token in doc if token.pos_ == "VERB"])

temp_verbs = []
for _list in verbs:
    temp_text = ""
    for item in _list:
        temp_text += item
        temp_text += " "
    
    temp_verbs.append(temp_text)

df_model["Verbs"] = temp_verbs
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model["Verbs"] = temp_verbs


### Nouns

In [20]:
nouns = []

text = ""

for word in df_model["Title"]:
    text = word
    doc = nlp(text)
    nouns.append([chunk.text for chunk in doc.noun_chunks])

temp_nouns = []

for _list in nouns:
    temp_text = ""
    for item in _list:
        temp_text += item
        temp_text += " "
    temp_nouns.append(temp_text)

df_model["Nouns"] = temp_nouns

### Adjectives

In [24]:
adjs = []

text = ""

for word in df_model["Title"]:
    text = word
    doc = nlp(text)
    adjs.append([token.lemma_ for token in doc if token.pos_ == "ADJ"])

temp_adjs = []
for _list in adjs:
    temp_text = ""
    for item in _list:
        temp_text += item
        temp_text += " "
    
    temp_adjs.append(temp_text)

df_model["Adjectives"] = temp_adjs

## Modelling

### Verbs

#### Train-Test Split

In [35]:
x = df_model["Verbs"].to_numpy()
y = df_model["Color"].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25)

#### Model

In [36]:
clf_verbs = LogisticRegression()
vec_verbs = CountVectorizer(stop_words='english', ngram_range=(1,3))

pipe_verbs = Pipeline([('vectorizer',vec_verbs),('clf',clf_verbs)])

pipe_verbs.fit(x_train, y_train)

#### Accuracy

In [37]:
preds = pipe_verbs.predict(x_test)
print("accuracy: ", accuracy_score(y_test, preds))

accuracy:  0.5555555555555556


### Nouns

#### Train-Test Split

In [29]:
# For Rachel

#### Model

In [31]:
# For Rachel

#### Accuracy

In [30]:
# For Rachel

### Adjectives

#### Train-Test Split

In [32]:
# For Rachel

#### Model

In [33]:
# For Rachel

#### Accuracy

In [34]:
# For Rachel

## Observations