# Imports + Background

In [1]:
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Text classification and sentiment analysis form a very large subfield of NLP with numerous applications. In this notebook we will see how to use the Count Vectorizer and the TF-IDF Vectorizer to extract features from text. In subsequent notebooks we will create models using these features and examine better ways to extract features from text.

# Loading The Data

In [7]:
# Read the financial-news CSV. Because `names` is given without `header`,
# pandas treats the first line of the file as data rather than as a header row.
df = pd.read_csv(
    "Data/FinancialNewsData.csv",
    encoding="Windows-1252",
    names=["label", "headline"],
)

In [None]:
# Preview the first three rows.
df.head(n=3)

In [9]:
# Encode the sentiment strings as integer class ids.
# Guarded so the cell is safe to re-run: once the column is numeric, mapping it
# again would find no matching keys and turn every label into NaN.
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
if not pd.api.types.is_numeric_dtype(df["label"]):
    df["label"] = df["label"].map(label_to_id)

In [11]:
# Count how many headlines fall into each label to check the class balance.
df["label"].value_counts()

1    2879
2    1363
0     604
Name: label, dtype: int64

In [None]:
# Preview the first three rows again, after the label column was re-encoded.
df.head(n=3)

There are many more neutral examples than positive and negative examples. This class imbalance may be something I have to deal with during the modeling process.

# Bag of Words and TFIDF Approaches

In [13]:
# Two text vectorizers, both with default settings:
#   cv    -> bag-of-words token counts
#   tfidf -> TF-IDF weighted term features
cv, tfidf = CountVectorizer(), TfidfVectorizer()

In [16]:
# Fit the count vectorizer on the headlines and materialise the
# resulting sparse matrix as a dense NumPy array.
bag_of_words = (
    cv.fit_transform(df["headline"])
    .toarray()
)

In [23]:
# Wrap the dense count matrix in a DataFrame whose columns are the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
bow_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
bag_of_words = pd.DataFrame(bag_of_words, columns=bow_terms)

In [24]:
# Fit the TF-IDF vectorizer on the headlines and materialise the
# resulting sparse matrix as a dense NumPy array.
tfidf_features = (
    tfidf.fit_transform(df["headline"])
    .toarray()
)

In [28]:
# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the vocabulary terms.
# The vectorizer is named `tfidf`; the previous `tfsidf` was a typo that raised
# NameError on a fresh kernel.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
tfidf_terms = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
tfidf_features = pd.DataFrame(tfidf_features, columns=tfidf_terms)

In [31]:
# Spot-check the TF-IDF table: every 5th row among the first 50,
# and every 400th column among the first 10,000.
tfidf_features.iloc[:50:5, :10000:400]

Unnamed: 0,00,3mn,accounts,art,board,chart,convergent,dinner,engineer,explained,...,monate,oka,pet,protocol,replaces,scantrack,soared,surpassing,tornstrom,vauramo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Spot-check the bag-of-words table: every 400th row,
# and every 500th column among the first 10,000.
bag_of_words.iloc[::400, :10000:500]

Unnamed: 0,00,565,allowed,benefits,chart,crew,dynamic,euro603,fulfilled,housewares,juha,located,monate,originally,postel,registers,scantrack,split,telko,usa
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1600,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Saving the Data

In [40]:
# Persist both feature tables to disk with joblib.
# The return value of dump() is bound to `_` so it does not show up as cell output.
_ = dump(value=bag_of_words, filename="Objects/bow.joblib")
_ = dump(value=tfidf_features, filename="Objects/tfidf.joblib")