## Minimal example

In [None]:
pip install scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

sentence = "the quick brown fox jumps over the lazy dog"

# Upper bound on the number of distinct tokens kept as features.
VOCAB_SIZE = 10

vectorizer = CountVectorizer(max_features=VOCAB_SIZE)

# A single document goes in, so X has exactly one row of per-token counts.
X = vectorizer.fit_transform([sentence]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() turns the NumPy counts into plain ints so the
# printed mapping reads {'brown': 1, ...} regardless of the NumPy version.
print(dict(zip(vectorizer.get_feature_names_out(), X[0].tolist())))

## Another example

In [None]:
import pandas as pd

# Relative path of the tweets dataset.
TWEETS_CSV = "../data/tweets.csv"

# Load the whole CSV into a DataFrame.
df = pd.read_csv(TWEETS_CSV)

# Preview the first rows.
df.head()

## Extract relevant feature

In [None]:
# Select every row of the "text" column as a Series.
tweets = df.loc[:, "text"]

# Preview the first few tweets.
tweets.head()

## Remove stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources needed below: the stopword corpus, and the Punkt tokenizer model
# that word_tokenize() relies on.
nltk.download("stopwords")
nltk.download("punkt")
# Recent NLTK releases load Punkt from the "punkt_tab" resource instead of
# "punkt"; fetch both so tokenization works on old and new versions alike.
nltk.download("punkt_tab")

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))

In [None]:
# Show the first tweets before stopword filtering; the newline separator
# puts the label and the preview on separate lines.
print("Before removing stopwords:", tweets.head(), sep="\n")

In [None]:
import string

# Tokenize each tweet, keep only purely alphabetic tokens that are not
# stopwords, and join the survivors back into a space-separated string.
# NLTK's English stopword list is lowercase, so each token is lowercased for
# the membership test; otherwise capitalised stopwords such as "The" would
# slip through the filter.
tweets = tweets.apply(
    lambda row: " ".join(
        word
        for word in word_tokenize(row)
        if word.isalpha() and word.lower() not in stop_words
    )
)

In [None]:
# Show the first tweets after stopword filtering; the newline separator
# puts the label and the preview on separate lines.
print("After removing stopwords:", tweets.head(), sep="\n")

## Stemming

In [None]:
# Porter stemmer from NLTK; this single instance is reused by the stemming
# cell further down.
from nltk.stem import PorterStemmer  

stemmer = PorterStemmer()

In [None]:
# Show the first tweets before stemming, label first and preview underneath.
print("Before stemming:\n" + str(tweets.head()))

In [None]:
tweets = tweets.apply(stemmer.stem)

In [None]:
print("After stemming:")
print(tweets.head())

## Bag-of-words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

VOCAB_SIZE = 1000

vectorizer = CountVectorizer(max_features=VOCAB_SIZE)

X = vectorizer.fit_transform(tweets).toarray()

In [None]:
X.shape

In [None]:
X.dtype

In [None]:
print(vectorizer.get_feature_names())

In [None]:
vectorized = pd.DataFrame(X, columns=vectorizer.get_feature_names())

vectorized.head()

In [None]:
type(vectorized)

In [None]:
accident = vectorized.loc[vectorized['accident'] > 1]
accident.head()