In [49]:
# imports
import sys
import time
from pprint import pprint

import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

# the local YouReader package lives one directory up; extend the path before importing it
sys.path.append("../")
from YouReader.Reader import Reader

# pandas settings
pd.set_option('display.max_colwidth', 50)
# Show every row of a frame/series. Always use the fully qualified option key:
# the short form 'max_rows' is matched as a pattern and becomes ambiguous on
# newer pandas (it also matches 'styler.render.max_rows'), raising OptionError.
# It only duplicated this setting anyway, so it has been dropped.
pd.set_option('display.max_rows', None)

In [50]:
# Constants
# Token regex passed to CountVectorizer: a token is any maximal run of
# non-whitespace characters, so contractions such as "it's" and bare symbols
# such as "-" are kept as single tokens.
TOKEN_PATTERN = r"[^\s]+"

## Loading Data from Save

In [51]:
# Time the whole load so the cost of reading the save file is visible.
start_time = time.time()

# Read the saved captions back in, then expose them as a DataFrame.
reader = Reader()
count = reader.load_captions("../data/save.json")
df = reader.to_dataframe()

print("Loaded", count, "captions from save.json")
elapsed_seconds = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed_seconds), "seconds")

Loaded 1151 captions from save.json
This took 1.33 seconds


In [52]:
# Number of captions per subject label: groups the rows by "subject" and
# counts the non-null "link" values in each group.
df.groupby("subject")["link"].count()

subject
BIOL      82
BUS      135
CS       239
ENGL      65
MATH     269
POLI     137
PSYCH     70
Name: link, dtype: int64

## Preparation for Bag of Words Model

In [60]:
start_time = time.time()


# Only the "clean" text column and the "subject" label are used from here on.
transcripts = df[["clean", "subject"]]
# Three bag-of-words configurations to compare:
#   vectorizer0 - TOKEN_PATTERN tokenization, no stop-word removal
#   vectorizer1 - TOKEN_PATTERN tokenization, English stop words removed
#   vectorizer3 - scikit-learn's default token pattern, English stop words removed
# NOTE(review): no `vectorizer2` is defined here, although later cells mention
# it in commented-out lines - confirm the intended numbering.
vectorizer0 = CountVectorizer(token_pattern=TOKEN_PATTERN)
vectorizer1 = CountVectorizer(stop_words="english", token_pattern=TOKEN_PATTERN)
vectorizer3 = CountVectorizer(stop_words="english")


print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 0.00 seconds


## Subject: BIOL

In [61]:
start_time = time.time()


# Select the BIOL rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
BIOL_mask = transcripts["subject"] == "BIOL"
BIOL_data = transcripts.loc[BIOL_mask, "clean"]
BIOL_labels = transcripts.loc[BIOL_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
BIOL_fit = clone(vectorizer1).fit(BIOL_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
BIOL_vocab = BIOL_fit.vocabulary_
BIOL_inv_vocab = {index: token for token, index in BIOL_vocab.items()}

# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
BIOL_transform = BIOL_fit.transform(BIOL_data)
BIOL_matrix = BIOL_transform.toarray()
BIOL_df = pd.DataFrame(BIOL_matrix).rename(columns=BIOL_inv_vocab)


print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 0.94 seconds


In [62]:
# Total occurrences of each token over all BIOL transcripts, most frequent first.
BIOL_sum = BIOL_df.sum(axis=0).sort_values(ascending=False)
BIOL_sum

it's                     3435
right                    3058
that's                   2498
okay                     2179
just                     2112
cells                    2001
cell                     1878
going                    1761
called                   1643
like                     1412
there's                  1395
know                     1307
blood                    1299
dna                      1011
body                      957
i'm                       940
really                    913
we're                     911
they're                   887
look                      873
want                      872
you're                    870
let's                     862
muscle                    853
different                 841
don't                     840
say                       782
page                      747
means                     696
what's                    685
actually                  679
protein                   660
make                      645
little    

## Subject: BUS

In [63]:
start_time = time.time()

# Select the BUS rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
BUS_mask = transcripts["subject"] == "BUS"
BUS_data = transcripts.loc[BUS_mask, "clean"]
BUS_labels = transcripts.loc[BUS_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
BUS_fit = clone(vectorizer1).fit(BUS_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
BUS_vocab = BUS_fit.vocabulary_
BUS_inv_vocab = {index: token for token, index in BUS_vocab.items()}


# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
BUS_transform = BUS_fit.transform(BUS_data)
BUS_matrix = BUS_transform.toarray()
BUS_df = pd.DataFrame(BUS_matrix).rename(columns=BUS_inv_vocab)

print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")


This took 2.58 seconds


In [64]:
# Total occurrences of each token over all BUS transcripts, most frequent first.
BUS_sum = BUS_df.sum(axis=0).sort_values(ascending=False)
BUS_sum


it's                               7916
going                              7211
that's                             5250
just                               4417
people                             4018
think                              3743
right                              3587
you're                             3580
know                               3539
like                               3353
i'm                                3349
don't                              3288
company                            3013
say                                2667
money                              2662
want                               2646
market                             2492
there's                            2345
rate                               2340
time                               2302
percent                            2270
1                                  2234
price                              2227
make                               2203
they're                            2199


## Subject: CS

In [54]:
start_time = time.time()


# Select the CS rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
CS_mask = transcripts["subject"] == "CS"
CS_data = transcripts.loc[CS_mask, "clean"]
CS_labels = transcripts.loc[CS_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
CS_fit = clone(vectorizer1).fit(CS_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
CS_vocab = CS_fit.vocabulary_
CS_inv_vocab = {index: token for token, index in CS_vocab.items()}

# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
CS_transform = CS_fit.transform(CS_data)
CS_matrix = CS_transform.toarray()
CS_df = pd.DataFrame(CS_matrix).rename(columns=CS_inv_vocab)


print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 1.21 seconds


In [55]:
# Total occurrences of each token over all CS transcripts, most frequent first.
CS_sum = CS_df.sum(axis=0).sort_values(ascending=False)
CS_sum

going                    5867
it's                     5461
right                    4506
that's                   3743
just                     3426
i'm                      3246
like                     2888
n                        2744
time                     2652
want                     2577
know                     2413
we're                    2362
you're                   2125
way                      2073
1                        1987
there's                  1975
don't                    1948
let's                    1924
ok                       1828
okay                     1772
machine                  1737
say                      1711
problem                  1585
look                     1529
think                    1490
number                   1446
really                   1431
make                     1392
thing                    1279
things                   1253
does                     1227
yeah                     1213
algorithm                1196
order     

## Subject: ENGL

In [65]:
start_time = time.time()

# Select the ENGL rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
ENGL_mask = transcripts["subject"] == "ENGL"
ENGL_data = transcripts.loc[ENGL_mask, "clean"]
ENGL_labels = transcripts.loc[ENGL_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
ENGL_fit = clone(vectorizer1).fit(ENGL_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
ENGL_vocab = ENGL_fit.vocabulary_
ENGL_inv_vocab = {index: token for token, index in ENGL_vocab.items()}


# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
ENGL_transform = ENGL_fit.transform(ENGL_data)
ENGL_matrix = ENGL_transform.toarray()
ENGL_df = pd.DataFrame(ENGL_matrix).rename(columns=ENGL_inv_vocab)

print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 1.15 seconds


In [66]:
# Total occurrences of each token over all ENGL transcripts, most frequent first.
ENGL_sum = ENGL_df.sum(axis=0).sort_values(ascending=False)
ENGL_sum

it's                       3447
know                       2292
like                       2282
say                        2105
going                      1893
way                        1834
just                       1765
think                      1478
kind                       1445
things                     1414
that's                     1336
play                       1305
right                      1268
really                     1225
people                     1204
time                       1177
actually                   1149
want                       1138
writing                    1052
i'm                        1045
don't                      1035
there's                    1024
mean                       1023
words                       958
make                        952
you're                      948
he's                        931
thing                       901
says                        892
character                   858
story                       848
does    

## Subject: MATH

In [67]:
start_time = time.time()

# Select the MATH rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
MATH_mask = transcripts["subject"] == "MATH"
MATH_data = transcripts.loc[MATH_mask, "clean"]
MATH_labels = transcripts.loc[MATH_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
MATH_fit = clone(vectorizer1).fit(MATH_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
MATH_vocab = MATH_fit.vocabulary_
MATH_inv_vocab = {index: token for token, index in MATH_vocab.items()}


# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
MATH_transform = MATH_fit.transform(MATH_data)
MATH_matrix = MATH_transform.toarray()
MATH_df = pd.DataFrame(MATH_matrix).rename(columns=MATH_inv_vocab)

print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 3.64 seconds


In [68]:
# Total occurrences of each token over all MATH transcripts, most frequent first.
MATH_sum = MATH_df.sum(axis=0).sort_values(ascending=False)
MATH_sum

x                         21557
going                     16629
that's                    12292
it's                      11754
right                     11207
just                      10589
1                         10252
negative                   9073
okay                       8668
plus                       8582
minus                      8399
i'm                        8038
like                       7734
we're                      7596
2                          6986
squared                    6856
know                       6455
times                      6103
want                       6096
y                          5863
let's                      5745
0                          5186
point                      5083
function                   5010
equals                     4548
don't                      4504
3                          4399
zero                       4261
you're                     4248
way                        4186
square                     4108
root    

## Subject: POLI

In [69]:
start_time = time.time()

# Select the POLI rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
POLI_mask = transcripts["subject"] == "POLI"
POLI_data = transcripts.loc[POLI_mask, "clean"]
POLI_labels = transcripts.loc[POLI_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
POLI_fit = clone(vectorizer1).fit(POLI_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
POLI_vocab = POLI_fit.vocabulary_
POLI_inv_vocab = {index: token for token, index in POLI_vocab.items()}


# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
POLI_transform = POLI_fit.transform(POLI_data)
POLI_matrix = POLI_transform.toarray()
POLI_df = pd.DataFrame(POLI_matrix).rename(columns=POLI_inv_vocab)

print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 2.45 seconds


In [70]:
# Total occurrences of each token over all POLI transcripts, most frequent first.
POLI_sum = POLI_df.sum(axis=0).sort_values(ascending=False)
POLI_sum

it's                              3268
right                             3215
people                            3198
think                             2716
going                             2062
just                              2046
say                               1943
like                              1851
that's                            1753
way                               1674
know                              1604
really                            1536
said                              1515
okay                              1468
want                              1345
don't                             1339
time                              1329
different                         1320
kind                              1303
idea                              1143
political                         1109
good                              1094
actually                          1052
government                        1002
important                          996
-                        

## Subject: PSYCH

In [58]:
start_time = time.time()


# Select the PSYCH rows once and reuse the mask for both the text and the labels
# (a single .loc[rows, col] lookup instead of chained indexing).
PSYCH_mask = transcripts["subject"] == "PSYCH"
PSYCH_data = transcripts.loc[PSYCH_mask, "clean"]
PSYCH_labels = transcripts.loc[PSYCH_mask, "subject"]

# Fit an independent, unfitted copy of the shared vectorizer. `vectorizer1.fit(...)`
# returns `vectorizer1` itself, so fitting it directly would make every subject's
# `*_fit` the very same object, silently refit by whichever subject cell ran last.
# The result is bound to PSYCH_fit (it was previously assigned to the misspelled
# PSYC_fit while the lines below read PSYCH_fit, which raised NameError).
# Swap in `vectorizer0` or `vectorizer3` here to compare configurations.
PSYCH_fit = clone(vectorizer1).fit(PSYCH_data)

# vocabulary_ maps token -> column index; the inverse maps column index -> token
PSYCH_vocab = PSYCH_fit.vocabulary_
PSYCH_inv_vocab = {index: token for token, index in PSYCH_vocab.items()}

# Document-term counts as a dense DataFrame: one row per transcript and one
# column per token (integer column labels are renamed to their tokens).
PSYCH_transform = PSYCH_fit.transform(PSYCH_data)
PSYCH_matrix = PSYCH_transform.toarray()
PSYCH_df = pd.DataFrame(PSYCH_matrix).rename(columns=PSYCH_inv_vocab)


print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 1.31 seconds


In [59]:
# Total occurrences of each token over all PSYCH transcripts, most frequent first.
# Reads PSYCH_df, the frame built by the PSYCH fit cell; the old PSYC_df name is
# not defined by any cell shown in this notebook, so the cell raised NameError.
PSYCH_sum = PSYCH_df.sum(axis=0).sort_values(ascending=False)
PSYCH_sum

it's                           3428
people                         3144
like                           2131
just                           1936
you're                         1655
things                         1653
there's                        1627
that's                         1620
right                          1611
really                         1594
know                           1590
going                          1472
think                          1359
don't                          1310
they're                        1218
job                            1094
here's                         1041
way                            1000
say                             988
different                       987
brain                           971
make                            900
good                            878
kind                            875
want                            862
i'm                             854
work                            852
person                      