In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the tokenizer data (only needed once per environment).
# Both resource names are requested so this cell works on a fresh kernel
# without relying on a later cell: 'punkt' is the legacy name, 'punkt_tab'
# is the name newer NLTK releases look up when tokenizing.
nltk.download('punkt')
nltk.download('punkt_tab')

block = "CSI DYPIEMR is the student chapter of the Computer Society of India in Dr. D. Y. Patil Pratishthan's Dr. D. Y. Patil Institute of Engineering Management and Research. Computer Society of India is a body of computer professionals in India. It was started on 6 March 1965 by a few computer professionals and has now grown to be the national body representing computer professionals. It has 72 chapters across India, and 511 student branches."

# Word-wise tokenization: split the text into individual word/punctuation tokens.
print("This is word-wise tokenization:\n")
print(word_tokenize(block))
print("\n------------------------------------------------------")

# Sentence-wise tokenization: split the text into sentences.
print("\nThis is sentence-wise tokenization:\n")
print(sent_tokenize(block))


In [None]:
import nltk

# Fetch the 'punkt_tab' tokenizer resource through the NLTK downloader.
nltk.download("punkt_tab")

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')      # tokenizer data used by word_tokenize
nltk.download('stopwords')  # stop-word lists read by stopwords.words

block = "CSI DYPIEMR is the student chapter of the Computer Society of India in Dr. D. Y. Patil Pratishthan's Dr. D. Y. Patil Institute of Engineering Management and Research. Computer Society of India is a body of computer professionals in India. It was started on 6 March 1965 by a few computer professionals and has now grown to be the national body representing computer professionals. It has 72 chapters across India, and 511 student branches."
stop_words = stopwords.words('english')
print("Stopwords:", stop_words)

# Membership is tested once per token, so use a set for the lookups.
# `stop_words` itself stays a list so the printout above is unchanged.
stop_word_set = set(stop_words)

token = word_tokenize(block)

# Keep every token whose lower-cased form is not a stop word; the original
# casing of the surviving tokens is preserved.
cleaned_token = [word for word in token if word.lower() not in stop_word_set]

print("This is the unclean version:\n", token, '\n')
print("-------------------------------------------------------\n")
print("This is the cleaned version:\n", cleaned_token, '\n')


In [None]:
from nltk.stem import PorterStemmer
# Stem several inflected forms of the same verb with the Porter stemmer.
stemmer = PorterStemmer()
words = ['rain', 'rained', 'raining', 'rains']
stemmed = list(map(stemmer.stem, words))
print(stemmed)
         

In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the 'wordnet' and 'omw-1.4' resources before building the lemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Lemmatize every token that survived stop-word removal (default settings).
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, cleaned_token))
print(lemmatized)

In [None]:
from nltk import pos_tag
# The perceptron tagger model is requested under both resource names:
# 'averaged_perceptron_tagger' is the legacy name, while newer NLTK releases
# load 'averaged_perceptron_tagger_eng' and raise LookupError without it.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Tag each stop-word-filtered token with its part of speech.
tagged = pos_tag(cleaned_token)
print(tagged)

In [None]:
import pandas as pd
import sklearn as sk
import math

In [None]:
block_1 = "Our aim is to develop a good work culture among students, a culture where students from various technical backgrounds"
block_2 = "Keeping in mind the interest of the IT professionals and computer enthusiasts, CSI works towards making the profession"

# Break each text into tokens on single spaces (punctuation stays attached
# to the neighbouring word, e.g. "students,").
first_block, second_block = (text.split(" ") for text in (block_1, block_2))

# Vocabulary: every distinct token that occurs in either text.
total = set(first_block) | set(second_block)
print(total)


In [None]:
# One counter per document, each pre-seeded with every vocabulary word at 0
# so both dictionaries share the same keys.
wordDictA = {vocab_word: 0 for vocab_word in total}
wordDictB = {vocab_word: 0 for vocab_word in total}

# Tally how often each token occurs in its own document.
for counts, tokens in ((wordDictA, first_block), (wordDictB, second_block)):
    for word in tokens:
        counts[word] = counts[word] + 1

# One row per document, one column per vocabulary word.
pd.DataFrame([wordDictA, wordDictB])


In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Compare the lower-cased word against the stop-word set, the same way the
# earlier stop-word removal cell does (`word.lower() not in stop_words`), so
# capitalised stop words such as "Our" are filtered too.
# NOTE: this also drops all-caps tokens whose lower-cased form is a stop word.
filtered_sentence = [w for w in wordDictA if w.lower() not in stop_words]
print(filtered_sentence)


In [None]:
def computeTF(wordDict, doc):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> number of occurrences.
        doc: the tokenized document; only its length is used, as the
            denominator of each frequency.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(doc)``
        as a float, in the same key order.

    Raises:
        ZeroDivisionError: if ``doc`` is empty while ``wordDict`` is not.
    """
    total_terms = float(len(doc))
    return {term: occurrences / total_terms for term, occurrences in wordDict.items()}

# Term frequencies for each document, computed from its own word counts
# and its own token list.
tfFirst, tfSecond = (
    computeTF(counts, tokens)
    for counts, tokens in ((wordDictA, first_block), (wordDictB, second_block))
)

# One row per document, one column per vocabulary word.
tf = pd.DataFrame([tfFirst, tfSecond])
print(tf)


In [None]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word.

    Args:
        docList: list of dicts, one per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        dict mapping each word to ``log10(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents in which the word's
        count is greater than zero. A word that occurs in no document gets
        ``0.0``. An empty ``docList`` yields an empty dict.
    """
    N = len(docList)
    if N == 0:
        return {}

    # Start from the first document's vocabulary with a frequency of zero.
    docFreq = dict.fromkeys(docList[0].keys(), 0)

    # Count, for each word, how many documents actually contain it. The
    # previous version skipped this step and fed the zero placeholders
    # straight into the logarithm, so every word received the same value.
    for doc in docList:
        for word, count in doc.items():
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, frequency in docFreq.items():
        # Guard the division explicitly instead of adding 1 to the denominator.
        idfDict[word] = math.log10(N / float(frequency)) if frequency else 0.0

    return idfDict

idfs = computeIDF([wordDictA, wordDictB])

# Show the computed IDF values (one column per word). The table was previously
# built from the raw count dictionaries, which only repeated the word counts.
idf1 = pd.DataFrame([idfs])
print(idf1)


In [None]:
def computeTFIDF(tfBow, idfs):
    """Multiply each term frequency by that word's IDF weight.

    Args:
        tfBow: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idfs[word]``.

    Raises:
        KeyError: if a word of ``tfBow`` is missing from ``idfs``.
    """
    return {term: frequency * idfs[term] for term, frequency in tfBow.items()}

# Weight each document's term frequencies by the shared IDF values.
idfFirst, idfSecond = (computeTFIDF(tf_row, idfs) for tf_row in (tfFirst, tfSecond))

# One row per document, one column per vocabulary word.
idf = pd.DataFrame([idfFirst, idfSecond])
print(idf)
