## Minimal example

In [None]:
pip install scikit-learn

In [1]:
sentence = "the quick brown fox jumps over the lazy dog"

from sklearn.feature_extraction.text import CountVectorizer

VOCAB_SIZE = 10

vectorizer = CountVectorizer(max_features=VOCAB_SIZE)

X = vectorizer.fit_transform([sentence]).toarray()

print(dict(zip(vectorizer.get_feature_names(), X[0])))

{'brown': 1, 'dog': 1, 'fox': 1, 'jumps': 1, 'lazy': 1, 'over': 1, 'quick': 1, 'the': 2}


## Another example

In [2]:
import pandas as pd

df = pd.read_csv("../data/tweets.csv")

df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


## Extract relevant feature

In [3]:
tweets = df["text"]

tweets.head()

0    Communal violence in Bhainsa, Telangana. "Ston...
1    Telangana: Section 144 has been imposed in Bha...
2    Arsonist sets cars ablaze at dealership https:...
3    Arsonist sets cars ablaze at dealership https:...
4    "Lord Jesus, your love brings freedom and pard...
Name: text, dtype: object

## Remove stopwords

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import nltk

nltk.download("stopwords")
nltk.download('punkt')

stop_words = set(stopwords.words("english"))  

[nltk_data] Downloading package stopwords to C:\Users\opell.DESKTOP-
[nltk_data]     UEQ8DPV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\opell.DESKTOP-
[nltk_data]     UEQ8DPV\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
print("Before removing stopwords:")
print(tweets.head())

Before removing stopwords:
0    Communal violence in Bhainsa, Telangana. "Ston...
1    Telangana: Section 144 has been imposed in Bha...
2    Arsonist sets cars ablaze at dealership https:...
3    Arsonist sets cars ablaze at dealership https:...
4    "Lord Jesus, your love brings freedom and pard...
Name: text, dtype: object


In [6]:
import string

tweets = tweets.apply(lambda row: " ".join([word for word in word_tokenize(row) if not word in stop_words and word.isalpha()]))

In [7]:
print("After removing stopwords:")
print(tweets.head())

After removing stopwords:
0    Communal violence Bhainsa Telangana Stones pel...
1    Telangana Section imposed Bhainsa January clas...
2           Arsonist sets cars ablaze dealership https
3     Arsonist sets cars ablaze dealership https https
4    Lord Jesus love brings freedom pardon Fill Hol...
Name: text, dtype: object


## Stemming

In [8]:
from nltk.stem import PorterStemmer  

stemmer = PorterStemmer()

In [9]:
print("Before stemming:")
print(tweets.head())

Before stemming:
0    Communal violence Bhainsa Telangana Stones pel...
1    Telangana Section imposed Bhainsa January clas...
2           Arsonist sets cars ablaze dealership https
3     Arsonist sets cars ablaze dealership https https
4    Lord Jesus love brings freedom pardon Fill Hol...
Name: text, dtype: object


In [10]:
tweets = tweets.apply(stemmer.stem)

In [11]:
print("After stemming:")
print(tweets.head())

After stemming:
0    communal violence bhainsa telangana stones pel...
1    telangana section imposed bhainsa january clas...
2            arsonist sets cars ablaze dealership http
3      arsonist sets cars ablaze dealership https http
4    lord jesus love brings freedom pardon fill hol...
Name: text, dtype: object


## Bag-of-words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

VOCAB_SIZE = 1000

vectorizer = CountVectorizer(max_features=VOCAB_SIZE)

X = vectorizer.fit_transform(tweets).toarray()

In [13]:
X.shape

(11370, 1000)

In [14]:
X.dtype

dtype('int64')

In [15]:
print(vectorizer.get_feature_names())



In [17]:
vectorized = pd.DataFrame(X, columns=vectorizer.get_feature_names())

vectorized.head()

Unnamed: 0,able,absolutely,accident,account,across,act,action,activity,actually,affected,...,yeah,year,years,yes,yesterday,yet,you,young,your,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
type(vectorized)

pandas.core.frame.DataFrame

In [19]:
accident = vectorized.loc[vectorized['accident'] > 1]
accident.head()

Unnamed: 0,able,absolutely,accident,account,across,act,action,activity,actually,affected,...,yeah,year,years,yes,yesterday,yet,you,young,your,zone
135,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
