- Imdb Movie sentiment analysis
- Text Generation with Long Short-Term Memory (LSTM) Networks: Create a text generation system using LSTM-based RNNs that 
    can generate coherent and contextually relevant text 
    based on a given input or prompt.
- Language Translation with Sequence-to-Sequence Models: Implement a language translation system using RNN-based 
sequence-to-sequence models, such as Encoder-Decoder architecture with Long Short-Term Memory (LSTM) cells, 
capable of translating text between different languages.
- Sequence-to-Sequence with Attention Model: Implement a sequence-to-sequence model with attention mechanisms for abstractive text summarization. This model serves as the base architecture for generating summaries from the input text, allowing the system to focus on relevant parts of the document while generating summaries.

# NLP imdb Sentiment Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn


In [2]:
df=pd.read_csv("IMDB Dataset.csv")

In [3]:
df['sentiment'].value_counts() # Balanced dataset

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
df.duplicated().sum()

418

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().sum()

0

In [8]:
df.shape

(49582, 2)

In [9]:
# df=df.sample(5000)

In [10]:
# df

# Text Preprocessing

# Lower Casing

In [11]:
# df['review']

In [12]:
# df['review'][3].lower()

In [13]:
# df['review'].str.lower() # for entire dataset

In [14]:
df['review']=df['review'].str.lower()

# Remove html tag

In [15]:
# regex (regular expression)

In [16]:
import re
def removal_html_tag(text):
    fun=re.compile('<.*?>')
    return fun.sub(r'',text)

In [17]:
# df['review'].apply(removal_html_tag)

In [18]:
df['review']=df['review'].apply(removal_html_tag)

In [19]:
# df['review'][3]

 ## url removal

In [20]:
import re
def removal_url(text):
    fun=re.compile(r'https?://\S+|www\.\S+')
    return fun.sub(r'',text)

In [21]:
df['review']=df['review'].apply(removal_url)

## remove punctuation


In [22]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
exclude=string.punctuation

In [24]:
def removal_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [25]:
df['review']=df['review'].apply(removal_punc1)

In [26]:
df.sample(5)

Unnamed: 0,review,sentiment
28304,the tempest has been interpreted in many diffe...,positive
48477,young couple on the road minding their own bus...,negative
29822,at least for me and rather unexpected as subje...,positive
46982,i agree with the above comment i love the real...,positive
24613,great job was very exciting and had great stun...,positive


## Spelling correction

In [27]:
from textblob import TextBlob # dont do inefficent solution

incorrect='The Car is on The bkside of rood'
word=TextBlob(incorrect)
word.correct().string



'The War is on The beside of room'

## removing stop word
# we cant remove stop word in pos tagging

In [28]:
from nltk.corpus import stopwords
sw_list=stopwords.words("english")

In [29]:


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK stopwords data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Sagar
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sagar
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# def remove_stopwords(text):
#     new_text=[]
#     for word in text.split():
#         if word in stopwords.words('english'):
#             new_text.append('')
#         else:
#             new_text.append(word)
#     x=new_text[:]
#     new_text.clear()
#     return " ".join(x)
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in stop_words:
            new_text.append(word)
    return " ".join(new_text)


In [31]:
# Remove stopwords


In [32]:
df['review']=df['review'].apply(remove_stopwords)

In [33]:
# import emoji

# def convert_emojis(text):
#     def replace_emoji(match):
#         emoji_code = match.group(0)
#         emoji_text = emoji.demojize(emoji_code)
#         return emoji_text

#     # Regular expression to match emojis
#     emoji_regex = r'[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U00002640\U00002648\U0000264F\U00002653\U0001F170-\U0001F251\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U00002640\U00002648\U0000264F\U00002653\U0001F170-\U0001F251]+'

#     # Replace emojis with their text representations
#     result_text = emoji.demojize(text)
#     return result_text

# # Example usage

# df['review']=df['review'].apply(convert_emojis)
import emoji
import re

# Compile regex pattern for emojis
emoji_regex = re.compile(r'[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U00002640\U00002648\U0000264F\U00002653\U0001F170-\U0001F251\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U00002639\U00002640\U00002648\U0000264F\U00002653\U0001F170-\U0001F251]+')

def convert_emojis(text):
    # Replace emojis with their text representations
    result_text = emoji.demojize(text)
    return result_text

# Example usage
df['review'] = df['review'].apply(convert_emojis)


# Remove emoji

In [34]:
# import re

# def remove_emojis(text):
#     """
#     Function to remove emojis from the input text.
    
#     Parameters:
#         text (str): The input text from which emojis will be removed.
    
#     Returns:
#         str: The text with emojis removed.
#     """
#     # Define regex pattern to match emojis
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                u"\U00002500-\U00002BEF"  # chinese char
#                                u"\U00002702-\U000027B0"
#                                u"\U00002702-\U000027B0"
#                                u"\U000024C2-\U0001F251"
#                                u"\U0001f926-\U0001f937"
#                                u"\U00010000-\U0010ffff"
#                                u"\u2640-\u2642"
#                                u"\u2600-\u2B55"
#                                u"\u200d"
#                                u"\u23cf"
#                                u"\u23e9"
#                                u"\u231a"
#                                u"\ufe0f"  # dingbats
#                                u"\u3030"
#                                "]+", flags=re.UNICODE)
    
#     # Remove emojis from the text
#     text_without_emojis = emoji_pattern.sub(r'', text)
    
#     return text_without_emojis

# # Example usage:
# text_with_emojis = "Hello! 😀 This is a sample text with emojis! 🚀🎉"
# text_without_emojis = remove_emojis(text_with_emojis)
# print(text_without_emojis)


In [35]:
# df['review']=df['review'].apply(remove_emojis)

## chat word treatment

In [36]:
slang=pd.read_csv("slangs.csv")

In [37]:
import csv

def csv_to_dict(csv_file):
    data_dict = {}
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header if present
        for row in reader:
            data_dict[row[0]] = row[1]
    return data_dict

# Example usage
chat_words = csv_to_dict('slangs.csv')



In [38]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in  chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text) 

In [39]:
df['review']=df['review'].apply(chat_conversion)

In [40]:
# df['review'][9]

In [41]:
y=df['sentiment']
x=df.iloc[:,0:1]

In [42]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()
y=label_encoder.fit_transform(y)

In [43]:
# stemming

In [44]:
# import pandas as pd
# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize



# # Initialize the Porter Stemmer
# stemmer = PorterStemmer()

# # Define a function to apply stemming to a sentence
# def stem_sentence(sentence):
#     tokens = word_tokenize(sentence)  # Tokenize the sentence
#     stemmed_tokens = [stemmer.stem(token) for token in tokens]  # Stem each token
#     return " ".join(stemmed_tokens)  # Join stemmed tokens back into a sentence

# # Apply stemming to the 'text' column using the apply function
# x = x['review'].apply(stem_sentence)

import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Define a function to apply stemming to a sentence
def stem_sentence(sentence):
    return " ".join([stemmer.stem(word) for word in word_tokenize(sentence)])

# Apply stemming to the 'review' column using the apply function
x['review'] = x['review'].apply(stem_sentence)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['review'] = x['review'].apply(stem_sentence)


In [45]:
x=pd.DataFrame(x,columns=['review'])

In [46]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [47]:
X_train.shape

(39665, 1)

In [49]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()
X_train_bow.shape

(39665, 10000)

In [50]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)
GaussianNB()
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.7010184531612382

In [51]:
confusion_matrix(y_test,y_pred)

array([[4356,  677],
       [2288, 2596]], dtype=int64)

In [None]:
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier

# # Initialize classifiers
# classifiers = {
#     "Naive Bayes": MultinomialNB(),
#     "Logistic Regression": LogisticRegression(),
#     "Random Forest": RandomForestClassifier(),
    
# }

# # Calculate and print accuracy scores for each classifier
# for name, clf in classifiers.items():
#     clf.fit(X_train_bow, y_train)
#     y_pred = clf.predict(X_test_bow)
#     accuracy = accuracy_score(y_test, y_pred)
#     print(f"{name} Accuracy: {accuracy}")
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from multiprocessing import Process, Manager

# Function to train and predict for a classifier
def train_and_predict(clf, X_train, y_train, X_test, y_test, result_dict):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    result_dict[clf.__class__.__name__] = accuracy

# Initialize classifiers
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(),
]

# Dictionary to store accuracy scores
manager = Manager()
accuracy_scores = manager.dict()

# Train and predict for each classifier concurrently
processes = []
for clf in classifiers:
    p = Process(target=train_and_predict, args=(clf, X_train_bow, y_train, X_test_bow, y_test, accuracy_scores))
    p.start()
    processes.append(p)

# Wait for all processes to finish
for p in processes:
    p.join()

# Print accuracy scores
for clf_name, accuracy in accuracy_scores.items():
    print(f"{clf_name} Accuracy: {accuracy}")



In [None]:
# Featurees Reduction

In [None]:
cv = CountVectorizer(max_features=10000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
cv = CountVectorizer(max_features=10000)

# Transform the training and testing data into bag-of-words representations
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])

# Initialize and train Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_bow, y_train)

# Predict using Naive Bayes classifier
nb_pred = nb.predict(X_test_bow)

# Calculate accuracy for Naive Bayes classifier
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)

# Initialize and train Logistic Regression classifier
lr = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
lr.fit(X_train_bow, y_train)

# Predict using Logistic Regression classifier
lr_pred = lr.predict(X_test_bow)

# Calculate accuracy for Logistic Regression classifier
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

# Initialize and train Random Forest classifier
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

# Predict using Random Forest classifier
rf_pred = rf.predict(X_test_bow)

# Calculate accuracy for Random Forest classifier
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)



In [None]:

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
cv = CountVectorizer(max_features=12000)

# Transform the training and testing data into bag-of-words representations
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])

# Initialize classifiers
nb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier()

# Fit classifiers
nb.fit(X_train_bow, y_train)
lr.fit(X_train_bow, y_train)
rf.fit(X_train_bow, y_train)

# Create VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('nb', nb), ('lr', lr), ('rf', rf)],
    voting='hard'  # Use majority voting
)

# Fit VotingClassifier
voting_clf.fit(X_train_bow, y_train)

# Predict using VotingClassifier
voting_pred = voting_clf.predict(X_test_bow)

# Calculate accuracy for VotingClassifier
voting_accuracy = accuracy_score(y_test, voting_pred)
print("Voting Classifier Accuracy:", voting_accuracy)


# Calculate accuracy f



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
cv = CountVectorizer(ngram_range=(1,2), max_features=5000)

# Transform the training and testing data into bag-of-words representations
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Initialize and train Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_bow, y_train)

# Predict using Naive Bayes classifier
nb_pred = nb.predict(X_test_bow)

# Calculate accuracy for Naive Bayes classifier
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)

# Initialize and train Logistic Regression classifier
lr = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
lr.fit(X_train_bow, y_train)

# Predict using Logistic Regression classifier
lr_pred = lr.predict(X_test_bow)

# Calculate accuracy for Logistic Regression classifier
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

# Initialize and train Random Forest classifier
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

# Predict using Random Forest classifier
rf_pred = rf.predict(X_test_bow)

# Calculate accuracy for Random Forest classifier
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
cv = CountVectorizer(ngram_range=(1,2), max_features=5000)

# Transform the training and testing data into bag-of-words representations
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])

# Initialize classifiers
nb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier()

# Fit classifiers
nb.fit(X_train_bow, y_train)
lr.fit(X_train_bow, y_train)
rf.fit(X_train_bow, y_train)

# Create VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('nb', nb), ('lr', lr), ('rf', rf)],
    voting='hard'  # Use majority voting
)

# Fit VotingClassifier
voting_clf.fit(X_train_bow, y_train)

# Predict using VotingClassifier
voting_pred = voting_clf.predict(X_test_bow)

# Calculate accuracy for VotingClassifier
voting_accuracy = accuracy_score(y_test, voting_pred)
print("Voting Classifier Accuracy:", voting_accuracy)


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
# X_test_tfidf = tfidf.transform(X_test['review'])
# rf = RandomForestClassifier()

# rf.fit(X_train_tfidf,y_train)
# y_pred = rf.predict(X_test_tfidf)

# accuracy_score(y_test,y_pred)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize TfidfVectorizer with n_jobs parameter
tfidf = TfidfVectorizer(n_jobs=-1)
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])

# Initialize RandomForestClassifier with n_jobs parameter
rf = RandomForestClassifier(n_jobs=-1)

# Fit the model and make predictions
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:


from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Initialize classifiers
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
}

# Calculate and print accuracy scores for each classifier
for name, clf in classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")



In [None]:
# Voting sTacking Technique

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming you have X_train_tfidf, X_test_tfidf, y_train, and y_test defined

# Initialize base classifiers
base_classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
}

# Initialize meta-classifier
meta_classifier = KNeighborsClassifier()

# Train base classifiers and make predictions
predictions = {}
for name, clf in base_classifiers.items():
    clf.fit(X_train_tfidf, y_train)
    predictions[name] = clf.predict(X_test_tfidf).reshape(-1, 1)  # Reshape predictions

# Stack predictions horizontally to create new features
X_stacked = np.hstack(list(predictions.values()))

# Train meta-classifier using stacked predictions
meta_classifier.fit(X_stacked, y_test)

# Make stacked predictions for test data
stacked_predictions = np.hstack(list(predictions.values()))

# Evaluate meta-classifier accuracy
meta_accuracy = accuracy_score(y_test, meta_classifier.predict(stacked_predictions))
print(f"Stacked Classifier Accuracy: {meta_accuracy}")


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Initialize classifiers
classifiers = [
    ("Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier())
]

# Create VotingClassifier
voting_clf = VotingClassifier(estimators=classifiers, voting='hard', n_jobs=-1)

# Train VotingClassifier
voting_clf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test_tfidf)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Voting Classifier Accuracy:", accuracy)


# Tokeniazation

In [None]:
# import pandas as pd
# import spacy

# # Load the English tokenizer
# nlp = spacy.load("en_core_web_sm")

# # Define a function to tokenize text
# def tokenize_text(text):
#     doc = nlp(text)
#     return [token.text for token in doc]

# # Apply tokenization on the 'text' column using the apply function
# x = x.apply(tokenize_text)

# # Display the DataFrame with tokenized text
# print(x)


In [None]:
# !python -m spacy download en_core_web_sm

# Text Transformation

In [None]:

# from sklearn.feature_extraction.text import TfidfVectorizer


# # Initialize the TF-IDF vectorizer
# tfidf_vectorizer = TfidfVectorizer()

# # Fit and transform the documents to TF-IDF matrix
# tfidf_matrix = tfidf_vectorizer.fit_transform(X)

# # Convert the TF-IDF matrix to a DataFrame (optional)
# import pandas as pd
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# # Display the TF-IDF matrix
# print(tfidf_df)
