README

This Python file includes code that finds the TF-IDF of five different texts. The first four texts are used for comparison purposes in my project, while the last one answers an additional open-ended question. Each cell's output is a table with the relevance of terms in each specific ebook. This project incorporates Pandas and regex methods to analyze the authors' word choices in their texts.

In [59]:
import re
import pandas as pd
import numpy as np
import math

with open("tesla.txt", "r", encoding="utf-8") as file:
    content = file.read()

content = content.lower()
content = re.sub(r'[^\w\s]', '', content)

patterns = [
    r'\b(?:license|copyright|disclaimer|terms|conditions|legal|acknowledgments)\b',
    r'\b(?:table of contents|contents|index)\b',
    r'\b(?:author|editor|publisher|contributor|credits)\b',
    r'\b(?:dedication|foreword|preface|introduction)\b',
    r'\b(?:epilogue|afterword|appendix)\b'
]
combined_pattern = '|'.join(patterns)
content = re.sub(combined_pattern, '', content, flags=re.IGNORECASE)
content = re.sub(r'\n\s*\n', '\n\n', content).strip()
words = content.split()

counts = {}
for word in words:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf = {word: count / len(words) for word, count in counts.items()}
idf = {word: math.log10(1/(1+1)) for word in counts}
tfidf = {word: tf[word] * idf[word] for word in tf}

df = pd.DataFrame(list(tf.items()), columns=['Word', 'TF'])
df2 = pd.DataFrame(list(idf.items()), columns= ['Word', 'IDF'])
df3 = pd.DataFrame(list(tfidf.items()), columns= ['Word', 'TF-IDF'])


dfsort = df.sort_values(by='TF', ascending=True).reset_index(drop=True)
df2sort = df2.sort_values(by='IDF', ascending=True).reset_index(drop=True)

final = df3.sort_values(by='TF-IDF', ascending=True).reset_index(drop=True)
print(final)



            Word    TF-IDF
0            the -0.028723
1             of -0.014440
2              a -0.008327
3             in -0.007807
4             to -0.007477
...          ...       ...
7361     usedmay -0.000002
7362       fault -0.000002
7363    wrapping -0.000002
7364       clean -0.000002
7365  newsletter -0.000002

[7366 rows x 2 columns]


In [89]:
import re
import pandas as pd
import numpy as np
import math

with open("inventors.txt", "r", encoding="utf-8") as file:
    content = file.read()

content = content.lower()
content = re.sub(r'[^\w\s]', '', content)

patterns = [
    r'\b(?:license|copyright|disclaimer|terms|conditions|legal|acknowledgments)\b',
    r'\b(?:table of contents|contents|index)\b',
    r'\b(?:author|editor|publisher|contributor|credits)\b',
    r'\b(?:dedication|foreword|preface|introduction)\b',
    r'\b(?:epilogue|afterword|appendix)\b'
]
combined_pattern = '|'.join(patterns)
content = re.sub(combined_pattern, '', content, flags=re.IGNORECASE)
content = re.sub(r'\n\s*\n', '\n\n', content).strip()
words = content.split()

counts = {}
for word in words:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf = {word: count / len(words) for word, count in counts.items()}
idf = {word: math.log10(1/(1+1)) for word in counts}
tfidf = {word: tf[word] * idf[word] for word in tf}

df = pd.DataFrame(list(tf.items()), columns=['Word', 'TF'])
df2 = pd.DataFrame(list(idf.items()), columns= ['Word', 'IDF'])
df3 = pd.DataFrame(list(tfidf.items()), columns= ['Word', 'TF-IDF'])


dfsort = df.sort_values(by='TF', ascending=True).reset_index(drop=True)
df2sort = df2.sort_values(by='IDF', ascending=True).reset_index(drop=True)

final = df3.sort_values(by='TF-IDF', ascending=True).reset_index(drop=True)
print(final)





            Word    TF-IDF
0            the -0.017564
1            and -0.011232
2             of -0.010402
3             to -0.009424
4             he -0.008105
...          ...       ...
5399       dance -0.000009
5400        reel -0.000009
5401     curious -0.000009
5402    stubborn -0.000009
5403  newsletter -0.000009

[5404 rows x 2 columns]


In [87]:
import re
import pandas as pd
import numpy as np
import math

with open("football.txt", "r", encoding="utf-8") as file:
    content = file.read()

content = content.lower()
content = re.sub(r'[^\w\s]', '', content)

patterns = [
    r'\b(?:license|copyright|disclaimer|terms|conditions|legal|acknowledgments)\b',
    r'\b(?:table of contents|contents|index)\b',
    r'\b(?:author|editor|publisher|contributor|credits)\b',
    r'\b(?:dedication|foreword|preface|introduction)\b',
    r'\b(?:epilogue|afterword|appendix)\b'
]
combined_pattern = '|'.join(patterns)
content = re.sub(combined_pattern, '', content, flags=re.IGNORECASE)
content = re.sub(r'\n\s*\n', '\n\n', content).strip()
words = content.split()

counts = {}
for word in words:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf = {word: count / len(words) for word, count in counts.items()}
idf = {word: math.log10(1/(1+1)) for word in counts}
tfidf = {word: tf[word] * idf[word] for word in tf}

df = pd.DataFrame(list(tf.items()), columns=['Word', 'TF'])
df2 = pd.DataFrame(list(idf.items()), columns= ['Word', 'IDF'])
df3 = pd.DataFrame(list(tfidf.items()), columns= ['Word', 'TF-IDF'])


dfsort = df.sort_values(by='TF', ascending=True).reset_index(drop=True)
df2sort = df2.sort_values(by='IDF', ascending=True).reset_index(drop=True)

final = df3.sort_values(by='TF-IDF', ascending=True).reset_index(drop=True)
print(final)





            Word    TF-IDF
0            the -0.022164
1             of -0.009991
2            and -0.008552
3             to -0.008250
4              a -0.007844
...          ...       ...
3626     finance -0.000012
3627     solvent -0.000012
3628   pertinent -0.000012
3629         300 -0.000012
3630  newsletter -0.000012

[3631 rows x 2 columns]


In [63]:
import re
import pandas as pd
import numpy as np
import math

with open("athletics.txt", "r", encoding="utf-8") as file:
    content = file.read()

content = content.lower()
content = re.sub(r'[^\w\s]', '', content)

patterns = [
    r'\b(?:license|copyright|disclaimer|terms|conditions|legal|acknowledgments)\b',
    r'\b(?:table of contents|contents|index)\b',
    r'\b(?:author|editor|publisher|contributor|credits)\b',
    r'\b(?:dedication|foreword|preface|introduction)\b',
    r'\b(?:epilogue|afterword|appendix)\b'
]
combined_pattern = '|'.join(patterns)
content = re.sub(combined_pattern, '', content, flags=re.IGNORECASE)
content = re.sub(r'\n\s*\n', '\n\n', content).strip()
words = content.split()

counts = {}
for word in words:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf = {word: count / len(words) for word, count in counts.items()}
idf = {word: math.log10(1/(1+1)) for word in counts}
tfidf = {word: tf[word] * idf[word] for word in tf}

df = pd.DataFrame(list(tf.items()), columns=['Word', 'TF'])
df2 = pd.DataFrame(list(idf.items()), columns= ['Word', 'IDF'])
df3 = pd.DataFrame(list(tfidf.items()), columns= ['Word', 'TF-IDF'])


dfsort = df.sort_values(by='TF', ascending=True).reset_index(drop=True)
df2sort = df2.sort_values(by='IDF', ascending=True).reset_index(drop=True)

final = df3.sort_values(by='TF-IDF', ascending=True).reset_index(drop=True)
print(final)





            Word    TF-IDF
0            the -0.029075
1             of -0.008395
2             is -0.008104
3             to -0.007238
4              a -0.006908
...          ...       ...
4951    strideis -0.000006
4952     sideleg -0.000006
4953  resistance -0.000006
4954      streak -0.000006
4955        hear -0.000006

[4956 rows x 2 columns]


In [97]:
import re
import pandas as pd
import numpy as np
import math

with open("basketball.txt", "r", encoding="utf-8") as file:
    content = file.read()

content = content.lower()
content = re.sub(r'[^\w\s]', '', content)

patterns = [
    r'\b(?:license|copyright|disclaimer|terms|conditions|legal|acknowledgments)\b',
    r'\b(?:table of contents|contents|index)\b',
    r'\b(?:author|editor|publisher|contributor|credits)\b',
    r'\b(?:dedication|foreword|preface|introduction)\b',
    r'\b(?:epilogue|afterword|appendix)\b'
]
combined_pattern = '|'.join(patterns)
content = re.sub(combined_pattern, '', content, flags=re.IGNORECASE)
content = re.sub(r'\n\s*\n', '\n\n', content).strip()
words = content.split()

counts = {}
for word in words:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf = {word: count / len(words) for word, count in counts.items()}
idf = {word: math.log10(1/(1+1)) for word in counts}
tfidf = {word: tf[word] * idf[word] for word in tf}

df = pd.DataFrame(list(tf.items()), columns=['Word', 'TF'])
df2 = pd.DataFrame(list(idf.items()), columns= ['Word', 'IDF'])
df3 = pd.DataFrame(list(tfidf.items()), columns= ['Word', 'TF-IDF'])


dfsort = df.sort_values(by='TF', ascending=True).reset_index(drop=True)
df2sort = df2.sort_values(by='IDF', ascending=True).reset_index(drop=True)

final = df3.sort_values(by='TF-IDF', ascending=True).reset_index(drop=True)
print(final)
print(final.head(50))






             Word    TF-IDF
0             the -0.013043
1              to -0.010635
2              of -0.006678
3               i -0.006479
4               a -0.005948
...           ...       ...
6008      advance -0.000005
6009   proclaimed -0.000005
6010       errand -0.000005
6011  conspirator -0.000005
6012   newsletter -0.000005

[6013 rows x 2 columns]
      Word    TF-IDF
0      the -0.013043
1       to -0.010635
2       of -0.006678
3        i -0.006479
4        a -0.005948
5      she -0.005784
6       it -0.005164
7      you -0.005040
8      her -0.005015
9      and -0.004901
10    that -0.003888
11      in -0.003654
12     was -0.002855
13     had -0.002746
14    jane -0.002661
15    with -0.002473
16     for -0.002319
17      be -0.002180
18      as -0.002001
19    have -0.001802
20    this -0.001787
21     not -0.001768
22      at -0.001678
23      on -0.001658
24      me -0.001614
25  judith -0.001450
26      we -0.001415
27    miss -0.001321
28  marian -0.001271
29      i

In [105]:
import re
import pandas as pd
import math

with open("basketball.txt", "r", encoding="utf-8") as file:
    content = file.read()

content = content.lower()
content = re.sub(r'[^\w\s]', '', content)

patterns = [
    r'\b(?:license|copyright|disclaimer|terms|conditions|legal|acknowledgments)\b',
    r'\b(?:table of contents|contents|index)\b',
    r'\b(?:author|editor|publisher|contributor|credits)\b',
    r'\b(?:dedication|foreword|preface|introduction)\b',
    r'\b(?:epilogue|afterword|appendix)\b'
]
combined_pattern = '|'.join(patterns)
content = re.sub(combined_pattern, '', content, flags=re.IGNORECASE)
content = re.sub(r'\n\s*\n', '\n\n', content).strip()

words = content.split()

bigrams = [' '.join([words[i], words[i+1]]) for i in range(len(words)-1)]
trigrams = [' '.join([words[i], words[i+1], words[i+2]]) for i in range(len(words)-2)]

total = words + bigrams + trigrams

counts = {}
for i in total:
    if i in counts:
        counts[i] += 1
    else:
        counts[i] = 1

tf = {i: count / len(total) for i, count in counts.items()}

idf = {i: math.log10(1 / (1 + 1)) for i in counts}

tfidf = {i: tf[i] * idf[i] for i in tf}

df = pd.DataFrame(list(tf.items()), columns=['Ngram', 'TF'])
df2 = pd.DataFrame(list(idf.items()), columns=['Ngram', 'IDF'])
df3 = pd.DataFrame(list(tfidf.items()), columns=['Ngram', 'TF-IDF'])

final = df3.sort_values(by='TF-IDF', ascending=True).reset_index(drop=True)

print(final.tail(20))


                      Ngram    TF-IDF
94728         suddenly that -0.000002
94729         injustice mrs -0.000002
94730       about injustice -0.000002
94731          speaking you -0.000002
94732        plain speaking -0.000002
94733         cousins plain -0.000002
94734           merited her -0.000002
94735         fully merited -0.000002
94736             cloth now -0.000002
94737           whole cloth -0.000002
94738              of whole -0.000002
94739           trouble out -0.000002
94740  manufactured trouble -0.000002
94741      had manufactured -0.000002
94742       arraignment she -0.000002
94743      this arraignment -0.000002
94744         deserved this -0.000002
94745       marian deserved -0.000002
94746          letter after -0.000002
94747      about new ebooks -0.000002
