# Handling and Processing a Text Dataset
Data: US Presidential Inaugural Speeches

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the inaugural-speech CSV, relative to the notebook's working directory.
path = 'data/inaugural_speeches.csv'

# Load the CSV into a DataFrame and preview its first five rows.
speech_df = pd.read_csv(filepath_or_buffer=path)
speech_df.head(n=5)

Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t..."
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica..."


In [3]:
# Removing unwanted characters
# Every character that is not an ASCII letter (digits, punctuation, line breaks, ...)
# is replaced by a single space.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text unchanged here.
speech_df['text'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
speech_df.head()

Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow Citizens of the Senate and of the House...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens I AM again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797",WHEN it was first perceived in early times t...
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow Citizens CALLED upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805",PROCEEDING fellow citizens to that qualifica...


In [4]:
# Standardize the text by converting every speech to lower case.
lowercase_text = speech_df['text'].str.lower()
speech_df['text'] = lowercase_text
speech_df.head(n=5)

Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",fellow citizens of the senate and of the house...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",fellow citizens i am again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797",when it was first perceived in early times t...
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",friends and fellow citizens called upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805",proceeding fellow citizens to that qualifica...


In [5]:
# Add a column with the length of each cleaned speech in characters.
# The length is taken over the whole string, so spaces between words are counted too.
text_lengths = speech_df['text'].str.len()
speech_df['char_count'] = text_lengths
speech_df['char_count'].head(n=5)

0     8616
1      787
2    13871
3    10144
4    12902
Name: char_count, dtype: int64

In [7]:
# Add a column with the number of whitespace-separated tokens in each speech.
tokens_per_speech = speech_df['text'].str.split()
speech_df['word_count'] = tokens_per_speech.str.len()
speech_df['word_count'].head(n=5)

0    1432
1     135
2    2323
3    1736
4    2169
Name: word_count, dtype: int64

In [8]:
# Average word length in characters: character count divided by word count.
# NOTE(review): char_count is the length of the full cleaned string, spaces
# included, so this ratio is higher than the mean length of the words alone.
speech_df['avg_word_len'] = speech_df['char_count'].div(speech_df['word_count'])
speech_df['avg_word_len'].head(n=5)

0    6.016760
1    5.829630
2    5.971158
3    5.843318
4    5.948363
Name: avg_word_len, dtype: float64

## Word Count

In [18]:
# import library
from sklearn.feature_extraction.text import CountVectorizer

# Document-frequency bounds are given as proportions: a term is kept only if it
# occurs in at least 20% and at most 80% of the speeches.
cv = CountVectorizer(max_df=0.8, min_df=0.2)

# Learn the vocabulary from the cleaned speeches and build the term-count matrix.
speech_text = speech_df['text']
text_transform = cv.fit_transform(speech_text)

In [19]:
# cv dataframe
# Densify the sparse count matrix and label each column with its vocabulary term,
# prefixed with 'Counts_' so the columns stay identifiable after merging.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
cv_df = pd.DataFrame(
    text_transform.toarray(),
    columns=cv.get_feature_names_out(),
).add_prefix('Counts_')

cv_df.head()

Unnamed: 0,Counts_abiding,Counts_ability,Counts_able,Counts_about,Counts_above,Counts_abroad,Counts_accept,Counts_accomplished,Counts_achieve,Counts_across,...,Counts_women,Counts_words,Counts_work,Counts_wrong,Counts_year,Counts_years,Counts_yet,Counts_you,Counts_young,Counts_your
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,5,0,9
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,2,3,0,0,0,1
3,0,0,0,1,1,1,0,0,0,0,...,0,0,1,2,0,0,2,7,0,7
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,4,0,4


In [20]:
# Join the speech-level columns and the word-count features side by side;
# rows are matched on the index of the two frames.
frames_to_join = [speech_df, cv_df]
combine_df = pd.concat(frames_to_join, axis='columns', sort=False)
combine_df.shape

(58, 825)

In [22]:
# Preview the first five rows of the combined frame.
combine_df.head(n=5)

Unnamed: 0,Name,Inaugural Address,Date,text,char_count,word_count,avg_word_len,Counts_abiding,Counts_ability,Counts_able,...,Counts_women,Counts_words,Counts_work,Counts_wrong,Counts_year,Counts_years,Counts_yet,Counts_you,Counts_young,Counts_your
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",fellow citizens of the senate and of the house...,8616,1432,6.01676,0,0,0,...,0,0,0,0,0,1,0,5,0,9
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",fellow citizens i am again called upon by th...,787,135,5.82963,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,John Adams,Inaugural Address,"Saturday, March 4, 1797",when it was first perceived in early times t...,13871,2323,5.971158,0,0,0,...,0,0,0,0,2,3,0,0,0,1
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",friends and fellow citizens called upon to u...,10144,1736,5.843318,0,0,0,...,0,0,1,2,0,0,2,7,0,7
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805",proceeding fellow citizens to that qualifica...,12902,2169,5.948363,0,0,1,...,0,0,0,0,2,2,2,4,0,4


## Term frequency-inverse document frequency (Tf-idf)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with English stop words removed and the vocabulary capped
# at 100 features.
tv = TfidfVectorizer(stop_words='english', max_features=100)

# Learn the vocabulary and IDF weights, then build the TF-IDF matrix.
speech_text = speech_df['text']
tv_transformed = tv.fit_transform(speech_text)

In [31]:
# Densify the TF-IDF matrix and label each column with its term, prefixed with 'TFIDF_'.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
tv_df = pd.DataFrame(
    tv_transformed.toarray(),
    columns=tv.get_feature_names_out(),
).add_prefix('TFIDF_')

tv_df.head()

Unnamed: 0,TFIDF_action,TFIDF_administration,TFIDF_america,TFIDF_american,TFIDF_americans,TFIDF_believe,TFIDF_best,TFIDF_better,TFIDF_change,TFIDF_citizens,...,TFIDF_things,TFIDF_time,TFIDF_today,TFIDF_union,TFIDF_united,TFIDF_war,TFIDF_way,TFIDF_work,TFIDF_world,TFIDF_years
0,0.0,0.133415,0.0,0.105388,0.0,0.0,0.0,0.0,0.0,0.229644,...,0.0,0.045929,0.0,0.136012,0.203593,0.0,0.060755,0.0,0.045929,0.052694
1,0.0,0.261016,0.266097,0.0,0.0,0.0,0.0,0.0,0.0,0.179712,...,0.0,0.0,0.0,0.0,0.199157,0.0,0.0,0.0,0.0,0.0
2,0.0,0.092436,0.157058,0.073018,0.0,0.0,0.026112,0.06046,0.0,0.106072,...,0.03203,0.021214,0.0,0.062823,0.070529,0.024339,0.0,0.0,0.063643,0.073018
3,0.0,0.092693,0.0,0.0,0.0,0.090942,0.117831,0.045471,0.053335,0.223369,...,0.048179,0.0,0.0,0.094497,0.0,0.03661,0.0,0.039277,0.095729,0.0
4,0.041334,0.039761,0.0,0.031408,0.0,0.0,0.067393,0.039011,0.091514,0.27376,...,0.082667,0.164256,0.0,0.121605,0.030338,0.094225,0.0,0.0,0.054752,0.062817


In [34]:
# The ten terms with the highest TF-IDF score in the first speech (row 0).
# This ranks terms within that single speech, not across the whole corpus.
first_speech_scores = tv_df.iloc[0]
first_speech_scores.sort_values(ascending=False).head(n=10)

TFIDF_government    0.367430
TFIDF_public        0.333237
TFIDF_present       0.315182
TFIDF_duty          0.238637
TFIDF_citizens      0.229644
TFIDF_country       0.229644
TFIDF_united        0.203593
TFIDF_far           0.178978
TFIDF_people        0.174590
TFIDF_good          0.147528
Name: 0, dtype: float64

## Bag of words and N-grams

In [35]:
# Count two-word sequences (bigrams) only, with English stop words removed and
# the vocabulary capped at 100 features.
bigrams_model = CountVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    max_features=100,
)

speech_text = speech_df['text']
bigrams = bigrams_model.fit_transform(speech_text)

In [38]:
# Densify the bigram count matrix and label each column with its bigram, prefixed with 'Counts_'.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
bigrams_df = pd.DataFrame(
    bigrams.toarray(),
    columns=bigrams_model.get_feature_names_out(),
).add_prefix('Counts_')

# most used bigrams: total count of each bigram summed over all speeches
bigrams_df.sum().sort_values(ascending=False).head(10)

Counts_united states          157
Counts_fellow citizens        111
Counts_american people         41
Counts_federal government      35
Counts_self government         30
Counts_men women               28
Counts_years ago               26
Counts_general government      24
Counts_constitution united     20
Counts_government people       19
dtype: int64

In [39]:
# Count three-word sequences (trigrams) only, with English stop words removed
# and the vocabulary capped at 100 features.
trigrams_model = CountVectorizer(
    ngram_range=(3, 3),
    stop_words='english',
    max_features=100,
)

speech_text = speech_df['text']
trigrams = trigrams_model.fit_transform(speech_text)

In [40]:
# Densify the trigram count matrix and label each column with its trigram, prefixed with 'Counts_'.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
trigrams_df = pd.DataFrame(
    trigrams.toarray(),
    columns=trigrams_model.get_feature_names_out(),
).add_prefix('Counts_')

# most used trigrams: total count of each trigram summed over all speeches
trigrams_df.sum().sort_values(ascending=False).head(10)

Counts_constitution united states     20
Counts_people united states           13
Counts_preserve protect defend        10
Counts_mr chief justice               10
Counts_president united states         8
Counts_defend constitution united      7
Counts_civil religious liberty         6
Counts_protect defend constitution     6
Counts_thank god bless                 6
Counts_united states america           6
dtype: int64