#### Sociology 128D: Mining Culture Through Text Data: Introduction to Social Data Science

# Notebook 3: Stylometry

In this notebook, we're going to take our first step toward vector semantics, which is one of the main approaches we'll use in this class and which has had an enormous influence in cultural sociology! Specifically, we are going to build on Notebook 2 by using word and document frequencies to visualize how similar or dissimilar documents are.

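Please download the [State of the Union Corpus (1790-2018)](https://www.kaggle.com/rtatman/state-of-the-union-corpus-1989-2017), which was posted to Kaggle by Rachael Tatman and Liling Tan, and place the `.txt` files in a folder named `sotu` inside this notebook's working directory (the code below reads the addresses from there).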

In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from collections import Counter
from scipy.stats import pearsonr, spearmanr
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

sns.set_theme(style="darkgrid")

In [None]:
sorted(os.listdir("sotu"))

In [None]:
# os.listdir returns entries in arbitrary order; sorting the listing keeps
# address_paths[0] (inspected in the next cells) the same on every run and platform.
address_paths = [os.path.join("sotu", f) for f in sorted(os.listdir("sotu")) if f.endswith(".txt")]

In [None]:
# Read through a context manager so the file handle is closed once the text is printed.
with open(address_paths[0], "r") as first_address:
    print(first_address.read())

In [None]:
def return_sotu_name_year_text(f: str):
    """Return the name, year, and text of a SOTU.

    Parameters
    ----------
    f : str
        Path to an address file whose base name looks like ``<president>_<year>.txt``.

    Returns
    -------
    tuple
        ``(president, year, text)``. ``year`` is returned as a string, and ``text`` is
        the file's contents with leading and trailing whitespace stripped.

    Raises
    ------
    ValueError
        If the base name contains no underscore.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(f, "r") as handle:
        doc = handle.read().strip()
    # Keep only the file name (no directories), then drop its extension.
    stem, _extension = os.path.splitext(os.path.basename(f))
    # Split on the last underscore so a multi-part name still yields exactly two fields.
    pres, year = stem.rsplit("_", 1)
    return pres, year, doc

In [None]:
return_sotu_name_year_text(address_paths[0])

In [None]:
# Parse every address once; each record is the (president, year, text) tuple
# returned by return_sotu_name_year_text.
data = [return_sotu_name_year_text(address_path) for address_path in address_paths]

# Pull the three fields back out into parallel lists.
presidents = [record[0] for record in data]
years = [record[1] for record in data]
docs = [record[2] for record in data]

pd.DataFrame(data, columns = ["president", "year", "text"]).head()

In [None]:
df = pd.DataFrame(address_paths, columns = ["file_path"])
df[["president", "year", "text"]] = df.file_path.apply(lambda x: pd.Series(return_sotu_name_year_text(x)))
df.drop(columns = ["file_path"], inplace = True)

In [None]:
df.sort_values(by="year", inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
df.head()

In [None]:
# Drop the row labelled 0, i.e. the first row after the sort-by-year and index reset above.
# Re-running this cell raises KeyError because label 0 no longer exists.
# NOTE(review): the reason this address is excluded is not stated here — confirm.
df.drop(index=0, inplace = True)

In [None]:
df.head()

In [None]:
df[df.president=="Adams"]

In [None]:
df.year = df.year.apply(int)

In [None]:
df.president = np.where(df.president.eq("Adams") & df["year"].gt(1800), "Adams2", df.president)

In [None]:
df[df.president=="Adams"]

In [None]:
df[df.president=="Adams2"]

In [None]:
# Same surname, two presidents: addresses dated after the cutoff year get a "2" suffix.
for surname, cutoff_year in [("Bush", 2000), ("Johnson", 1900), ("Roosevelt", 1930)]:
    is_later_president = df.president.eq(surname) & df["year"].gt(cutoff_year)
    df.president = np.where(is_later_president, surname + "2", df.president)

In [None]:
df.president.unique()

In [None]:
len(df.president.unique())

In [None]:
df.text = df.text.apply(str.lower)

In [None]:
df.head()

In [None]:
# help() prints ord's documentation using plain Python, so this cell also runs
# outside IPython (the `?name` form is IPython-only syntax).
help(ord)

In [None]:
print(f'a = {ord("a")}, z = {ord("z")}, and space = {ord(" ")}')

In [None]:
s = "This is a test string, and it has some punctuation--not a lot, but some--that we're going to remove."

# Walk the lowercased string, keeping a-z and spaces and blanking out everything else.
kept_chars = []
for char in s.lower():
    is_letter = "a" <= char <= "z"  # same test as 97 <= ord(char) <= 122
    kept_chars.append(char if (is_letter or char == " ") else " ")
s2 = "".join(kept_chars)

s2

In [None]:
def keep_alphabetical(text: str) -> str:
    """Replace every character other than lowercase a-z or a space with a space."""
    def _keep_or_blank(char: str) -> str:
        # "a" <= char <= "z" is the same test as 97 <= ord(char) <= 122.
        return char if ("a" <= char <= "z" or char == " ") else " "

    return "".join(map(_keep_or_blank, text))


df.text = df.text.apply(lambda x: keep_alphabetical(x))

In [None]:
df.head()

In [None]:
all_text = " ".join(df.text)

# Tally every whitespace-separated token across the whole corpus.
token_counter = Counter(all_text.split())
word_frequencies = dict(token_counter)

# most_common() with no argument lists every (type, count) pair, highest count first;
# equal counts keep the order in which the words were first seen.
types_and_counts = token_counter.most_common()
print(types_and_counts[:100])

In [None]:
print(f"The corpus has {sum(word_frequencies.values()):,} words.") 

In [None]:
types_, token_counts = zip(*types_and_counts)

In [None]:
plt.figure(figsize=(14, 8))
plt.bar(x = range(100), height = token_counts[:100])
plt.title("Frequencies of Top 100 Terms in Corpus")
plt.show()

In [None]:
plt.figure(figsize=(14, 8))
plt.bar(x = types_[:20], height = token_counts[:20])
plt.xticks(rotation = 90)
plt.title("Frequencies of Top 20 Terms in Corpus")
plt.show()

In [None]:
log_rank = np.log(range(1, len(token_counts)+1))
log_frequencies = np.log(token_counts)

plt.figure(figsize=(14, 8))
plt.plot(log_rank, log_frequencies)
plt.ylabel("ln(word frequency)")
plt.xlabel("ln(word rank)")
plt.title("Word Rank versus Frequency (log-log)")
plt.show()

In [None]:
def set_of_types(document: str) -> str:
    """Return each distinct whitespace-separated token of ``document`` exactly once.

    The result is a single space-joined string. Tokens are kept in order of first
    appearance, so the output is the same from run to run (iterating a ``set`` of
    strings gives no such guarantee).
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return " ".join(dict.fromkeys(document.split()))

In [None]:
s = "this is a string that repeats some words, like string and words and some"

print(Counter(s.split())) # three types occur twice

In [None]:
s2 = set_of_types(s)

print(Counter(s2.split())) # each type occurs only once

In [None]:
df["types"] = df.text.apply(set_of_types)

In [None]:
df.head()

In [None]:
document_frequencies = dict(Counter(" ".join(df.types).split()))

In [None]:
df.drop(columns=["types"], inplace=True)

In [None]:
vocabulary = sorted(list(word_frequencies.keys()))

x = [word_frequencies[word] for word in vocabulary]
y = [document_frequencies[word] for word in vocabulary]

print("Correlation between each word's frequency in the overall corpus and its document frequency:")
print(f"Pearson's correlation coefficient: {pearsonr(x, y)[0]:.2f}")
print(f"Spearman's rank-order correlation: {spearmanr(x, y)[0]:.2f}")

In [None]:
print(len(vocabulary))

If we are interested in analyzing meaning from a corpus, in practice we will often remove words that appear only once or in only one document (which aren't the same thing!). Words that occur only once are called [hapaxes](https://en.wikipedia.org/wiki/Hapax_legomenon); below, we use the term loosely for words that appear in only one document. We can't say that two documents have a word in common if only one document in the entire corpus has the word!

In [None]:
hapaxes = [word for word in vocabulary if document_frequencies[word] == 1]
print(len(hapaxes))

We may often exclude words that appear in *every* document for similar reasons.

Let's remove hapaxes.

In [None]:
# Testing membership against a list scans the whole list for every word;
# a set makes each lookup constant time.
hapax_set = set(hapaxes)

word_frequencies = {key: value for key, value in word_frequencies.items() if key not in hapax_set}
document_frequencies = {key: value for key, value in document_frequencies.items() if key not in hapax_set}

# Sanity check: after filtering, both dictionaries should cover exactly the same words.
assert word_frequencies.keys() == document_frequencies.keys()

# Re-rank the surviving words by corpus frequency, most frequent first.
types_and_counts = sorted(list(word_frequencies.items()), reverse = True, key = lambda x: x[1])
# Keep only the words (as a tuple), without rebinding `_`, which IPython uses for the last output.
vocabulary = tuple(word for word, _count in types_and_counts)

In [None]:
print(len(vocabulary))

In [None]:
# Title each speech "<president in lowercase>_<year>", built column-wise.
df["speech_title"] = df.president.str.lower() + "_" + df.year.astype(str)
# Word count = number of whitespace-separated tokens in the cleaned text.
df["wordcount"] = df.text.str.split().str.len()

df.head()

In [None]:
# Scatter of each address's word count against the year it was delivered.
plt.figure(figsize=(14, 8))
sns.scatterplot(x = "year", y = "wordcount", data = df)
plt.title("Wordcount of State of the Union Address by Year")
plt.xlabel("Year")
plt.ylabel("Words")
# plt.show() renders the figure; plt.plot() with no arguments only returns an empty list.
plt.show()

In [None]:
df.wordcount.max()

In [None]:
df[df.wordcount.eq(df.wordcount.max())]

## Document-Term Matrix

In [None]:
# Take an explicit copy of just the two needed columns, so the term columns added
# later are written to an independent frame (no SettingWithCopyWarning, df untouched).
dtm = df[["speech_title", "text"]].copy()
# Tokenize: each text becomes a list of whitespace-separated words.
dtm["text"] = dtm["text"].apply(str.split)
dtm.head()

In [None]:
def term_frequency(doc, vocab):
    """Return how often each term of ``vocab`` occurs in ``doc``.

    Parameters
    ----------
    doc : list or str
        Normally a list of tokens. A plain string is also accepted, in which case
        occurrences are counted with ``str.count`` (substring matches).
    vocab : iterable of str
        Terms to count, in the order the counts should be returned.

    Returns
    -------
    list of int
        One count per term in ``vocab``; terms absent from ``doc`` get 0.
    """
    if isinstance(doc, str):
        # Substring counting cannot be reproduced by a token tally; keep str.count.
        return [doc.count(term) for term in vocab]
    # Tally the tokens once instead of rescanning the document for every term.
    token_counts = Counter(doc)
    return [token_counts[term] for term in vocab]

In [None]:
s = ["the", "cat", "in", "the", "hat"]

term_frequency(s, vocabulary[:10])

In [None]:
for idx, row in dtm.iterrows():
    print(vocabulary[:10])
    print(term_frequency(row.text, vocabulary[:10]))
    break

In [None]:
sub_voc = vocabulary[:3000]

dtm[list(sub_voc)] = dtm.text.apply(lambda x: pd.Series(term_frequency(x, sub_voc))) # this takes a moment

In [None]:
dtm.head()

In [None]:
dtm.drop(columns="text", inplace=True)
dtm.set_index("speech_title", inplace=True)

In [None]:
dtm.head()

In [None]:
dtm.shape

## Plotting Speeches in a 2D Space using Principal Component Analysis

In [None]:
titles = dtm.index
raw_counts = dtm.to_numpy()

# A single sample standard deviation (ddof=1) taken over every cell of the matrix.
sd = np.std(raw_counts, ddof = 1, axis = None)

# Centre on the matrix-wide mean, then scale by the matrix-wide standard deviation.
dtm_std = (raw_counts - raw_counts.mean()) / sd

In [None]:
dtm_std

In [None]:
dtm_std.mean()

In [None]:
pca = PCA(n_components=2)
components = pca.fit_transform(dtm_std)

pca_df = pd.DataFrame(data = components, columns = ["orig_component1", "orig_component2"])

In [None]:
pca_df["title"] = titles
pca_df[["president", "year"]] = pca_df.title.apply(lambda x: pd.Series(x.split("_")))
pca_df.year = pca_df.year.apply(int)
pca_df

In [None]:
mask = pca_df["year"] > 2000

label_points = False

plt.figure(figsize=(14, 8))
sns_plot = sns.scatterplot(x = "orig_component1", y = "orig_component2", data = pca_df[mask], hue="president")
plt.title("Distribution of Speeches According to First Two Components")
if label_points:
    for idx, row in pca_df[mask].iterrows():
        sns_plot.text(x = row["orig_component1"], y = row["orig_component2"], s = row["title"])
plt.show()

In [None]:
def return_decade(year):
    """Return the decade label for ``year``, e.g. 1990 -> "1990s"."""
    # Replace the final character of the year's string form with "0", then append "s".
    leading_digits = str(year)[:-1]
    return f"{leading_digits}0s"

In [None]:
return_decade(1990)

In [None]:
pca_df["decade"] = pca_df.year.apply(return_decade)

In [None]:
pca_df.head()

In [None]:
plt.figure(figsize=(14, 8))
sns.scatterplot(x = "orig_component1", y = "orig_component2", data = pca_df, hue="decade")
plt.title("Distribution of State of the Union Addresses\nAccording to First Two Components")
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

## Using TF-IDF to Compare Documents

Let's see if things improve if we use [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) weighting.

In [None]:
dtm.head()

In [None]:
def return_idf(N: int, df: int) -> float:
    """Return the smoothed inverse document frequency log10(N / (1 + df)).

    ``N`` is the number of documents and ``df`` is the number of documents that
    contain the term (inside this function ``df`` is a count, not the DataFrame
    of the same name used elsewhere in the notebook). Because of the +1 in the
    denominator, the result is 0 when df == N - 1 and negative when df == N.
    """
    return np.log10(N/(1 + df))


def _tfidf_from_count(count: int, word: str) -> float:
    """Weight a raw term count: ln(1 + count) times the word's entry in ``idf_dict``."""
    tf = np.log(1 + count)
    idf = idf_dict[word]
    return tf * idf


def tfidf_ind(doc, word: str) -> float:
    """Return the tf-idf weight of ``word`` in ``doc``.

    ``doc`` is normally a list of tokens; a plain string also works, but then
    ``doc.count`` matches substrings rather than whole words. The idf value is
    looked up in the module-level ``idf_dict``.

    Raises
    ------
    KeyError
        If ``word`` has no entry in ``idf_dict``.
    """
    return _tfidf_from_count(doc.count(word), word)


def tfidf_vocab(doc, vocab: list) -> list:
    """Return the tf-idf weight of every word of ``vocab`` in ``doc``, in ``vocab`` order.

    ``doc`` is normally a list of tokens and is tallied once, rather than being
    rescanned for every vocabulary word. A plain string is handled word by word
    through ``tfidf_ind``.

    Raises
    ------
    KeyError
        If any word of ``vocab`` has no entry in ``idf_dict``.
    """
    if isinstance(doc, str):
        # str.count matches substrings, which a token tally cannot reproduce.
        return [tfidf_ind(doc, word) for word in vocab]
    token_counts = Counter(doc)
    return [_tfidf_from_count(token_counts[word], word) for word in vocab]
    
N = dtm.shape[0]
    
idf_dict = {word: return_idf(N, frequency) for word, frequency in document_frequencies.items()}

In [None]:
print([key for key, value in idf_dict.items() if value == 0])

In [None]:
# Take an explicit copy of just the two needed columns, so the tf-idf columns are
# written to an independent frame (no SettingWithCopyWarning, df untouched).
tfidf_mat = df[["speech_title", "text"]].copy()
# Tokenize: each text becomes a list of whitespace-separated words.
tfidf_mat["text"] = tfidf_mat["text"].apply(str.split)
# One tf-idf column per word of sub_voc.
tfidf_mat[list(sub_voc)] = tfidf_mat.text.apply(lambda x: pd.Series(tfidf_vocab(x, sub_voc))) # this takes a moment
# The token lists are no longer needed once the weights are computed.
tfidf_mat = tfidf_mat.drop(columns="text")
tfidf_mat.head()

In [None]:
tfidf_mat.set_index("speech_title", inplace=True)
titles = tfidf_mat.index
tfidf_mat = tfidf_mat.to_numpy()

sd = np.std(tfidf_mat, ddof = 1, axis = None)

tfidf_mat = tfidf_mat - tfidf_mat.mean()
tfidf_mat = tfidf_mat/sd

In [None]:
tfidf_pca = PCA(n_components=2)
components = tfidf_pca.fit_transform(tfidf_mat)

tfidf_pca_df = pd.DataFrame(data = components, columns = ["tfidf_component1", "tfidf_component2"])
tfidf_pca_df["title"] = titles
tfidf_pca_df[["president", "year"]] = tfidf_pca_df.title.apply(lambda x: pd.Series(x.split("_")))
tfidf_pca_df.year = tfidf_pca_df.year.apply(int)
tfidf_pca_df

In [None]:
# Restrict the plot to addresses delivered after 2000.
mask = tfidf_pca_df["year"] > 2000

# Set to True to print each speech title next to its point.
label_points = False

plt.figure(figsize=(14, 8))
sns_plot = sns.scatterplot(x = "tfidf_component1", y = "tfidf_component2", data = tfidf_pca_df[mask], hue="president")
plt.title("Distribution of State of the Union Addresses\nAccording to First Two Components")
if label_points:
    for idx, row in tfidf_pca_df[mask].iterrows():
        sns_plot.text(x = row["tfidf_component1"], y = row["tfidf_component2"], s = row["title"])
plt.show()

In [None]:
tfidf_pca_df["decade"] = tfidf_pca_df.year.apply(return_decade)

In [None]:
tfidf_pca_df

In [None]:
plt.figure(figsize=(14, 8))
sns.scatterplot(x = "tfidf_component1", y = "tfidf_component2", data = tfidf_pca_df, hue="decade")
plt.title("Distribution of State of the Union Addresses\nAccording to First Two Components")
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

In [None]:
mask = tfidf_pca_df.decade.isin(["1790s", "1890s", "1990s"])

label_points = False

plt.figure(figsize=(14, 8))
sns_plot = sns.scatterplot(x = "tfidf_component1", y = "tfidf_component2", data = tfidf_pca_df[mask], hue="decade")
plt.title("Distribution of State of the Union Addresses\nAccording to First Two Components")
plt.legend(bbox_to_anchor=(1.25, 1))
if label_points:
    for idx, row in tfidf_pca_df[mask].iterrows():
        sns_plot.text(x = row["tfidf_component1"], y = row["tfidf_component2"], s = row["title"])
plt.show()

## Sparse versus Dense Vectors

In [None]:
nonzero_entries = np.count_nonzero(dtm)
total_entries = dtm.size
# Share of cells that are zero = 1 - share of cells that are non-zero.
zero_share = 1 - nonzero_entries / total_entries

print(f"Number of non-zero values in (truncated) document-term matrix: {nonzero_entries}")
print(f"Number of entries in (truncated) document-term matrix: {total_entries}")
print(f"{zero_share * 100:.0f}% of entries are zeros, and that's based on "
      "the 3,000 most frequent words.")