In [443]:
# standard libraries
import sys
import time
import re

# external libraries
import pandas as pd
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer

# custom libraries
sys.path.append("../")
from YouReader.Reader import Reader



# pandas settings
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)
pd.set_option('max_rows', None)

# Constants
TOKEN_PATTERN = r"[^\s]+"

## Loading Data from Save

In [445]:
reader = Reader()
count = reader.load_captions("../data/dataset.json")
df = reader.to_dataframe()
total_sums = df.groupby("subject")["link"].count()

print("Loaded", count, "captions from dataset.json")
print("Data Distribution:", total_sums)

Loaded 2600 captions from dataset.json
Data Distribution: subject
BIOL    200
BUS     200
CHE     200
CHEM    200
CS      200
ECON    200
ENGL    200
HIST    200
MATH    200
PHIL    200
PHYS    200
POSC    200
PSYC    200
Name: link, dtype: int64


<h2>Optional to use an even Dataset (do not run generally)

In [62]:
""""
# erase any unneeded
erase_keys = [key for key in reader.data if reader.data[key]["clean"] == ""]
for key in erase_keys:
    del reader.data[key]
    
# erase keys after the first 200 for each category 
subjects = df["subject"].unique()
dataset = {}
for subject in subjects:
    dataset[subject] = [key for key in reader.data if reader.data[key]["subject"] == subject]
    for key in dataset[subject][200:]:
        del reader.data[key]
"""

'"\n# erase any unneeded\nerase_keys = [key for key in reader.data if reader.data[key]["clean"] == ""]\nfor key in erase_keys:\n    del reader.data[key]\n    \n# erase keys after the first 200 for each category \nsubjects = df["subject"].unique()\ndataset = {}\nfor subject in subjects:\n    dataset[subject] = [key for key in reader.data if reader.data[key]["subject"] == subject]\n    for key in dataset[subject][200:]:\n        del reader.data[key]\n'

## Preparation for Bag of Words Model

In [462]:
# count word frequency of documents based on the subject
def word_frequency_by_subject(subject: str, vectorizer: CountVectorizer) -> pd.DataFrame:
    
    # generates data and labels for subject
    data = df.loc[df["subject"] == subject]["clean"]
    labels = df.loc[df["subject"] == subject]["subject"]
    
    # fits data for vector (generates features)
    data_fitter = vectorizer.fit(data)
    
    # generates vocabulary
    vocab = data_fitter.vocabulary_
    inv_vocab = {v: k for k,v in vocab.items()}
    
    # transform data into a matrix
    data_transform = data_fitter.transform(data)
    data_matrix = data_transform.toarray()
    data_df = pd.DataFrame(data_matrix)
    data_df.rename(columns = inv_vocab, inplace=True)
    
    # get frequency count
    data_sum = data_df.sum(axis=0)
    data_sum.sort_values(inplace=True, ascending=False)
    
    return data_sum

In [467]:
start_time = time.time()

word_frequency = {}
subjects = df["subject"].unique()
vectorizer = CountVectorizer(stop_words="english", token_pattern=TOKEN_PATTERN)

for subject in subjects:
    word_frequency[subject] = word_frequency_by_subject(subject, vectorizer)
    
print("This takes:", "{:.2f}".format(time.time() - start_time), "seconds")

This took: 37.79 seconds


<h1>Exploratory Data Analysis: Overview</h1>
<ol>
    <li>Average Word Length</li>
    <li>Most Diverse Vocabulary</li>
    <li>Most Filler Words</li>
    <li>Most Explicit Words</li>
    <li>First Digit Distribution</li>
    <li>Most Numbers</li>
    <li>Most Dates</li>
</ol>

<h3>1. Average Word Length</h3>

In [477]:
# returns the average word length for a frequency counter
def get_average_word_length(freq_counts: pd.Series) -> int:
    keys = freq_counts.keys()
    word_count = sum([len(key) for key in keys] * freq_counts) / freq_counts.sum()
    return word_count

In [479]:
word_length_table = {}

for subject in subjects:
    word_length_table[subject] = get_average_word_length(word_frequency[subject])
    
word_length_df = pd.DataFrame.from_dict(word_length_table, orient="index")
word_length_df

Unnamed: 0,0
BIOL,6.318926
BUS,6.400382
CHEM,5.978319
CS,5.71272
ECON,5.944188
ENGL,6.151897
HIST,6.359022
MATH,5.274696
PHIL,6.47245
PHYS,5.660883


<h3>2. Most Diverse Vocabulary</h3>

In [480]:
# returns length of frequency series
def get_diversity_count(freq_counts: pd.Series) -> int:
    return len(freq_counts)

In [481]:
diversity_count_table = {}

for subject in subjects:
    diversity_count_table[subject] = get_diversity_count(word_frequency[subject])

diversity_count_df = pd.DataFrame.from_dict(diversity_count_table, orient='index')
diversity_count_df

Unnamed: 0,0
BIOL,24559
BUS,27001
CHEM,20232
CS,21020
ECON,22478
ENGL,39740
HIST,42916
MATH,18282
PHIL,26885
PHYS,18658


<h3>3. Most Filler Words</h3>

In [484]:
with open("../data/analysis/filler.txt") as inFile:
    filler_words = inFile.read().split("\n")

# returns frequencies of filler words from facebook filter list
def get_filler_freq(freq_counts: pd.Series) -> dict:
    freq_counts_dict = freq_counts.to_dict()
    filler_measure = {k:v for k,v in freq_counts_dict.items() if k in filler_words}
    return filler_measure

In [487]:
filler_freq_table = {}
filler_sum_table = {}

for subject in subjects:
    filler_freq_table[subject] = get_filler_freq(word_frequency[subject])
    filler_sum_table[subject] = sum(filler_freq_table[subject].values())
    
filler_count_df = pd.DataFrame.from_dict(filler_sum_table, orient='index')
filler_count_df

Unnamed: 0,0
BIOL,5194
BUS,3614
CHEM,7930
CS,18196
ECON,8194
ENGL,3626
HIST,1971
MATH,9111
PHIL,2425
PHYS,4813


<h3>4. Most Explicit Words</h3>

In [489]:
with open("../data/analysis/swear.txt") as inFile:
    swear_words = inFile.read().split("\n")
    
# returns frequencies of swear words from facebook filter list
def get_swear_freq(freq_counts: pd.Series) -> dict:
    freq_counts_dict = freq_counts.to_dict()
    swear_measure = {k:v for k,v in freq_counts_dict.items() if k in swear_words}
    return swear_measure

In [490]:
swear_freq_table = {}
swear_sum_table = {}

for subject in subjects:
    swear_freq_table[subject] = get_swear_freq(word_frequency[subject])
    swear_sum_table[subject] = sum(swear_freq_table[subject].values())

swear_count_df = pd.DataFrame.from_dict(swear_sum_table, orient='index')
swear_count_df

Unnamed: 0,0
BIOL,708
BUS,162
CHEM,254
CS,152
ECON,151
ENGL,1028
HIST,845
MATH,197
PHIL,429
PHYS,229


<h3>5. First Digit Distribution</h3>

In [522]:
digit_regexes = [r"\b0.+", r"\b1.+", r"\b2.+", r"\b3.+", r"\b4.+", r"\b5.+", r"\b6.+", r"\b7.+", r"\b8.+", r"\b9.+"]
first_digit_columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, "total", "subject"]

# total counts for starting digits 0-9
def get_first_digits_freq(freq_counts: pd.Series, subject: str) -> dict:
    digit_freq = []
    digit_sums = {}
    keys = freq_counts.keys()
    
    for i in range(len(digit_regexes)):
        digit_freq = [freq_counts[key] for key in keys if re.match(digit_regexes[i], key)]
        digit_sums[i] = sum(digit_freq)
    
    digit_sums["total"] = sum(digit_sums.values())
    digit_sums["subject"] = subject
    return digit_sums

In [523]:
first_digits_freq_table = {}
first_digits_df = pd.DataFrame(columns=first_digit_columns)

# first digits
for subject in subjects:
    
    first_digits_freq_table[subject] = get_first_digits_freq(word_frequency[subject], subject)  
    first_digits_df = first_digits_df.append(first_digits_freq_table[subject], ignore_index=True)

first_digits_df.set_index("subject", inplace=True)
first_digits_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,total
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BIOL,149,1392,801,443,276,347,182,118,115,146,3969
BUS,124,2027,1316,617,381,516,219,197,248,270,5915
CHEM,183,2401,1058,538,209,345,132,143,92,212,5313
CS,392,2442,941,475,315,404,403,143,227,166,5908
ECON,378,3732,1864,647,484,616,350,257,328,337,8993
ENGL,3,1751,646,203,196,131,104,78,61,95,3268
HIST,55,5184,663,440,458,285,257,194,188,128,7852
MATH,293,3463,2052,1081,649,506,366,191,218,247,9066
PHIL,73,1017,408,188,146,128,109,80,69,95,2313
PHYS,189,2249,1517,436,433,201,90,52,96,156,5419


<h3>6. Most Numbers</h3>

<h3>7. Most Dates</h3>

In [526]:
date_pattern = r"\b[0-9]{3,4}s\b|\b[0-9]{4}\b"

def get_date_freq(freq_counts: pd.Series) -> dict:
    freq_counts_dict = freq_counts.to_dict()
    date_measure = {k:v for k,v in freq_counts_dict.items() if re.match(date_pattern, k)}
    return date_measure


In [529]:
date_freq_table = {}
date_sum_table = {}

# most dates
for subject in subjects:
    date_freq_table[subject] = get_date_freq(word_frequency[subject])
    date_sum_table[subject] = sum(date_freq_table[subject].values())
    
date_count_df = pd.DataFrame.from_dict(date_sum_table, orient='index')
date_count_df

Unnamed: 0,0
BIOL,432
BUS,879
CHEM,475
CS,462
ECON,1977
ENGL,895
HIST,4311
MATH,286
PHIL,377
PHYS,187
