In [1]:
import pandas as pd

# Read the raw posts from the CSV file that sits next to this notebook.
csv_path = 'text_data.csv'
txt_data = pd.read_csv(csv_path)

## Normalize data

In [2]:
# Preview the first five rows of the freshly loaded frame.
txt_data.head(n=5)

Unnamed: 0,sentiment,post
0,positive,"""I love technology, especially data science!"""
1,neutral,"""Data science is a part of newer technologies"""
2,negative,"""Technology is a hindrance to our development"""


In [3]:
# Lowercase every post so the later substring match and word counts are
# case-insensitive.
# `str.capitalize()` upper-cases the first character and lowercases the rest;
# it only yielded all-lowercase text here because each post starts with a
# quote character. `str.lower()` states the intent directly and gives the
# same result on this data while also being correct for unquoted posts.
txt_data['post'] = txt_data['post'].str.lower()
txt_data['post']

0     "i love technology, especially data science!"
1    "data science is a part of newer technologies"
2    "technology is a hindrance to our development"
Name: post, dtype: object

In [4]:
# Remove every character that is neither a word character nor whitespace
# (quotes, commas, exclamation marks, ...).
# - The pattern is a raw string so `\w` and `\s` reach the regex engine
#   unchanged instead of being parsed as (invalid) string escapes.
# - `regex=True` is passed explicitly: relying on the default triggers a
#   warning on older pandas, and on pandas >= 2.0 the default is a literal
#   match, which would leave the punctuation in place.
txt_data["post"] = txt_data['post'].str.replace(r'[^\w\s]', '', regex=True)
txt_data["post"]

  txt_data["post"] = txt_data['post'].str.replace('[^\w\s]','')


0       i love technology especially data science
1    data science is a part of newer technologies
2    technology is a hindrance to our development
Name: post, dtype: object

In [5]:
# Flag the posts that mention "data science" and record that as their topic;
# rows that do not match keep a missing value in the new 'Topic' column.
mentions_data_science = txt_data['post'].str.contains('data science')
txt_data.loc[mentions_data_science, 'Topic'] = 'data science'
txt_data.head()

Unnamed: 0,sentiment,post,Topic
0,positive,i love technology especially data science,data science
1,neutral,data science is a part of newer technologies,data science
2,negative,technology is a hindrance to our development,


## Tokenize data
Note: nltk is not installed in the workspace, so the following code won't execute successfully.

In [None]:
import nltk
# Run NLTK's word tokenizer over each post and keep the result in a new
# 'token' column (requires nltk to be importable in this environment).
post_tokens = txt_data["post"].apply(nltk.word_tokenize)
txt_data["token"] = post_tokens
txt_data

## Vectorize data

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vectorizer with default settings on the cleaned posts.
vectorizer = CountVectorizer()

# One row per post, one column per vocabulary term; the result is sparse,
# so convert it to a dense array for display.
post_texts = txt_data["post"].values
words_matrix = vectorizer.fit_transform(post_texts)
words_matrix.toarray()

array([[1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0],
       [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1]])

In [7]:
# Put the term counts in a DataFrame whose columns are labelled with the
# vocabulary terms learned by the vectorizer.
feature_names = vectorizer.get_feature_names_out()
counts = pd.DataFrame(words_matrix.toarray(), columns=feature_names)
counts

Unnamed: 0,data,development,especially,hindrance,is,love,newer,of,our,part,science,technologies,technology,to
0,1,0,1,0,0,1,0,0,0,0,1,0,1,0
1,1,0,0,0,1,0,1,1,0,1,1,1,0,0
2,0,1,0,1,1,0,0,0,1,0,0,0,1,1


In [8]:
# Show the term -> column-index mapping learned during fitting.
vocabulary = vectorizer.vocabulary_
print(vocabulary)

{'love': 5, 'technology': 12, 'especially': 2, 'data': 0, 'science': 10, 'is': 4, 'part': 9, 'of': 7, 'newer': 6, 'technologies': 11, 'hindrance': 3, 'to': 13, 'our': 8, 'development': 1}
