# Importing the modules

In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
df = pd.read_csv("/kaggle/input/customer-support-on-twitter/twcs/twcs.csv")

In [3]:
df.shape

(2811774, 7)

In [4]:
df = df.head(100000)

In [5]:
df["text"]

0        @115712 I understand. I would like to assist y...
1            @sprintcare and how do you propose we do that
2        @sprintcare I have sent several private messag...
3        @115712 Please send us a Private Message so th...
4                                       @sprintcare I did.
                               ...                        
99995    @144297 We understand you have received a diff...
99996    @AmazonHelp I have already raised d issue ther...
99997    @144297 If you've shared your details via the ...
99998    @AmazonHelp It was a great help from ur side w...
99999    @144297 Thank you for your understanding. As i...
Name: text, Length: 100000, dtype: object

# Lowercasing

In [6]:
def lower_case(text):
    return text.lower()

In [7]:
df["text"] = df["text"].apply(lower_case)

In [8]:
df["text"]

0        @115712 i understand. i would like to assist y...
1            @sprintcare and how do you propose we do that
2        @sprintcare i have sent several private messag...
3        @115712 please send us a private message so th...
4                                       @sprintcare i did.
                               ...                        
99995    @144297 we understand you have received a diff...
99996    @amazonhelp i have already raised d issue ther...
99997    @144297 if you've shared your details via the ...
99998    @amazonhelp it was a great help from ur side w...
99999    @144297 thank you for your understanding. as i...
Name: text, Length: 100000, dtype: object

# Remove HTML tags

In [9]:
def remove_html_tags(text):
    pattern = re.compile("<.*?>")
    return pattern.sub("",text)

In [10]:
df["text"] = df["text"].apply(remove_html_tags)

# Removing URL

In [11]:
def remove_url(text):
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub("",text)

In [12]:
df["text"] = df["text"].apply(remove_url)

In [13]:
df["text"]

0        @115712 i understand. i would like to assist y...
1            @sprintcare and how do you propose we do that
2        @sprintcare i have sent several private messag...
3        @115712 please send us a private message so th...
4                                       @sprintcare i did.
                               ...                        
99995    @144297 we understand you have received a diff...
99996    @amazonhelp i have already raised d issue ther...
99997    @144297 if you've shared your details via the ...
99998    @amazonhelp it was a great help from ur side w...
99999    @144297 thank you for your understanding. as i...
Name: text, Length: 100000, dtype: object

# Remove Emails

In [14]:
def remove_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.sub(email_pattern, '', text)

In [15]:
df["text"] = df["text"].apply(remove_emails)

# Contractions To Expansion

In [16]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [17]:
def cont_to_exp(text):
    if type(text) is str:
        for key in contractions:
            value = contractions[key]
            text = text.replace(key,value)
        return text
    else:
        return text

In [18]:
df["text"] = df["text"].apply(cont_to_exp)

In [19]:
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 i understand. i would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare i have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 please send us a private message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare i did.,4.0,6.0


# Remove Emoji

In [20]:
import emoji

In [21]:
def remove_emoji(text):
    return emoji.demojize(text)

In [22]:
df["text"] = df["text"].apply(remove_emoji)

# Remove numbers

In [23]:
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

In [24]:
df["text"] = df["text"].apply(remove_numbers)

In [25]:
df["text"]

0        @ i understand. i would like to assist you. we...
1            @sprintcare and how do you propose we do that
2        @sprintcare i have sent several private messag...
3        @ please send us a private message so that we ...
4                                       @sprintcare i did.
                               ...                        
99995    @ we understand you have received a different ...
99996    @amazonhelp i have already raised d issue ther...
99997    @ if you've shared your details via the above ...
99998    @amazonhelp it was a great help from your side...
99999    @ thank you for your understanding. as informe...
Name: text, Length: 100000, dtype: object

# Remove Multiple Spaces

In [26]:
df["text"]

0        @ i understand. i would like to assist you. we...
1            @sprintcare and how do you propose we do that
2        @sprintcare i have sent several private messag...
3        @ please send us a private message so that we ...
4                                       @sprintcare i did.
                               ...                        
99995    @ we understand you have received a different ...
99996    @amazonhelp i have already raised d issue ther...
99997    @ if you've shared your details via the above ...
99998    @amazonhelp it was a great help from your side...
99999    @ thank you for your understanding. as informe...
Name: text, Length: 100000, dtype: object

In [27]:
df["text"] = df["text"].apply(lambda x: " ".join(x.split()))

In [28]:
df["text"]

0        @ i understand. i would like to assist you. we...
1            @sprintcare and how do you propose we do that
2        @sprintcare i have sent several private messag...
3        @ please send us a private message so that we ...
4                                       @sprintcare i did.
                               ...                        
99995    @ we understand you have received a different ...
99996    @amazonhelp i have already raised d issue ther...
99997    @ if you've shared your details via the above ...
99998    @amazonhelp it was a great help from your side...
99999    @ thank you for your understanding. as informe...
Name: text, Length: 100000, dtype: object

# Remove Punctuations

In [29]:
def remove_punc(text):
    return text.translate(str.maketrans("","",string.punctuation))

In [30]:
df["text"] = df["text"].apply(remove_punc)

In [31]:
df["text"]

0         i understand i would like to assist you we wo...
1             sprintcare and how do you propose we do that
2        sprintcare i have sent several private message...
3         please send us a private message so that we c...
4                                         sprintcare i did
                               ...                        
99995     we understand you have received a different p...
99996    amazonhelp i have already raised d issue there...
99997     if youve shared your details via the above li...
99998    amazonhelp it was a great help from your side ...
99999     thank you for your understanding as informed ...
Name: text, Length: 100000, dtype: object

# Removal Of Stop Words

In [32]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [33]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [34]:
nlp = spacy.load("en_core_web_sm")

In [35]:
def remove_stopwords(text):
    new_text = []
    for word in word_tokenize(text):
        if word not in stopwords.words("english"):
            new_text.append(word)
    return " ".join(new_text)

In [36]:
# def remove_stopwords(text):
#     doc = nlp(text)
#     return " ".join([x.text for x in doc if x not in STOP_WORDS])

In [37]:
df["text"] = df["text"].apply(remove_stopwords)

In [38]:
df["text"]

0        understand would like assist would need get pr...
1                                       sprintcare propose
2        sprintcare sent several private messages one r...
3        please send us private message assist click ‘ ...
4                                               sprintcare
                               ...                        
99995    understand received different product ordered ...
99996    amazonhelp already raised issue therelets see ...
99997    youve shared details via link well certainly r...
99998    amazonhelp great help side pre scripted repeat...
99999    thank understanding informed earlier ensuring ...
Name: text, Length: 100000, dtype: object

### Stemming

In [39]:
from nltk.stem import PorterStemmer

In [40]:
# def stemming(text):
#     ps = PorterStemmer()
#     return " ".join([ps.stem(word) for word in word_tokenize(text)])

In [41]:
def stemming(text):
    doc = nlp(text)
    return " ".join([x.lemma_ for x in doc])

In [42]:
df["text"] = df["text"].apply(stemming)

In [43]:
df["text"]

0        understand would like assist would need get pr...
1                                       sprintcare propose
2        sprintcare send several private message one re...
3        please send we private message assist click ' ...
4                                               sprintcare
                               ...                        
99995    understand receive different product order pub...
99996    amazonhelp already raise issue therelet see ha...
99997    you ve share detail via link well certainly re...
99998    amazonhelp great help side pre script repeat a...
99999    thank understand informed early ensure securit...
Name: text, Length: 100000, dtype: object

### One Hot Encoding

**Advantages of One-Hot Encoding in NLP:**

1. **Simplicity and Intuitiveness:** One-hot encoding is a simple and intuitive method to represent categorical data, including text. Each word in the vocabulary is represented as a unique binary vector, making it easy to understand and implement.

2. **Preservation of Word Relationships:** One-hot encoding preserves the relationship between words in the vocabulary. Each word is represented as a unique vector, and the absence or presence of each word is explicitly captured.

3. **Compatibility with Machine Learning Models:** One-hot encoded vectors are easily fed into machine learning models that expect fixed-size input features. Many traditional machine learning algorithms and neural network architectures can work directly with one-hot encoded data.

4. **Insensitivity to Word Order:** One-hot encoding treats words as independent entities, making it insensitive to the order of words in a sentence. This can be an advantage in scenarios where word order is less important, such as in bag-of-words models.

5. **Sparse Representation:** One-hot encoding results in sparse vectors, where most elements are zero. This can be memory-efficient, especially when dealing with large vocabularies, as only the non-zero elements need to be stored.

**Disadvantages of One-Hot Encoding in NLP:**

1. **High Dimensionality:** One-hot encoding can lead to high-dimensional feature vectors, especially when dealing with large vocabularies. This high dimensionality can pose challenges in terms of computational efficiency and memory requirements.

2. **Loss of Semantic Information:** One-hot encoding does not capture semantic relationships between words. Each word is treated as a unique entity, and the model is unaware of similarities or relationships between words.

3. **Inefficiency in Handling Out-of-Vocabulary Words:** One-hot encoding struggles with out-of-vocabulary words, i.e., words that were not present in the training data. Handling such words efficiently may require additional techniques, such as using an "unknown" token.

4. **Lack of Contextual Information:** One-hot encoding does not encode any contextual information about the words, such as their position in a sentence or their relationship with neighboring words. This limitation can be critical for tasks that require understanding context, like sentiment analysis or machine translation.

5. **Not Suitable for Continuous Representations:** One-hot encoding is a discrete representation, and it cannot capture continuous relationships between words. More advanced embeddings like word embeddings (e.g., Word2Vec, GloVe) are designed to address this limitation by representing words in a continuous vector space.

In [44]:
import tensorflow 
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

2024-02-08 17:06:26.797302: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-08 17:06:26.797400: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-08 17:06:26.799254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [45]:
text_one = df["text"].tolist()
print(text_one[:20])

['understand would like assist would need get private secure link assist', 'sprintcare propose', 'sprintcare send several private message one respond usual', "please send we private message assist click ' message ' top profile", 'sprintcare', 'please send we private message gain detail account', 'sprintcare bad customer service', 'saddening hear please shoot we dm look kc', 'sprintcare go to magically change connectivity whole family lyingface hundredpoint', 'understand concern we d like please send we direct message assist aa', 'sprintcare since sign yousince day', 'h we d definitely like work long experience issue aa', '\' lie " great " connection bar lte still \' load something smh', 'please send private message send link access account fr', 'whenever contact customer support tell shortcode enable account never year try', 'information incorrect jk', 'askspectrum would like email copy one since spectrum update training', 'department part corporate office you re particular area go for

In [46]:
tokenizer = Tokenizer()

In [47]:
tokenizer.fit_on_texts(text_one)

In [48]:
one_hot_results = tokenizer.texts_to_matrix(text_one, mode='binary')
one_hot_results

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

### Bag Of Words

**Advantages of Bag-of-Words (BoW) Model:**

1. **Simplicity:** The Bag-of-Words model is a straightforward and easy-to-understand representation of text. It simplifies complex textual data into a simple vector of word frequencies, making it accessible for various machine learning algorithms.

2. **Computational Efficiency:** BoW models are computationally efficient, especially when dealing with large datasets. The simplicity of the model allows for faster processing and training times compared to more complex models.

3. **Language Agnosticism:** The BoW model is language-agnostic, meaning it can be applied to texts in different languages without requiring language-specific adaptations. This makes it versatile for tasks involving multilingual data.

4. **Interpretability:** The resulting feature vectors from BoW models are interpretable. Each element in the vector corresponds to the frequency of a specific word in the document, making it easy to interpret and analyze the importance of individual words.

5. **Versatility in Downstream Tasks:** BoW representations can be effectively used in various natural language processing (NLP) tasks such as text classification, sentiment analysis, and information retrieval. They serve as a foundational representation that works well in many scenarios.

**Disadvantages of Bag-of-Words (BoW) Model:**

1. **Loss of Word Order:** BoW discards the order of words in a document, treating each document as an unordered set of words. This limitation is significant for tasks where word order and context play a crucial role, such as language modeling and machine translation.

2. **Sparsity:** BoW representations can lead to sparse high-dimensional vectors, especially when dealing with large vocabularies. This sparsity can affect the efficiency of machine learning algorithms and increase the need for more advanced techniques for handling sparse data.

3. **Lack of Semantic Information:** BoW models do not capture the semantic relationships between words. Words with similar meanings may be treated as unrelated if they don't co-occur frequently in the same context.

4. **Limited Contextual Understanding:** BoW models ignore the context in which words appear within a document. They don't consider the relationships between words or their positions in sentences, which is a limitation for tasks requiring contextual understanding.

5. **Inability to Handle Synonyms:** BoW treats synonyms as distinct words, and it does not inherently capture their semantic equivalence. This limitation can impact the model's ability to generalize across different ways of expressing the same concept.

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
cv = CountVectorizer()

In [51]:
text_cv = df["text"].tolist()
print(text_cv[:20])

['understand would like assist would need get private secure link assist', 'sprintcare propose', 'sprintcare send several private message one respond usual', "please send we private message assist click ' message ' top profile", 'sprintcare', 'please send we private message gain detail account', 'sprintcare bad customer service', 'saddening hear please shoot we dm look kc', 'sprintcare go to magically change connectivity whole family lyingface hundredpoint', 'understand concern we d like please send we direct message assist aa', 'sprintcare since sign yousince day', 'h we d definitely like work long experience issue aa', '\' lie " great " connection bar lte still \' load something smh', 'please send private message send link access account fr', 'whenever contact customer support tell shortcode enable account never year try', 'information incorrect jk', 'askspectrum would like email copy one since spectrum update training', 'department part corporate office you re particular area go for

In [52]:
text_cv = cv.fit_transform(text_cv)
text_cv

<100000x48605 sparse matrix of type '<class 'numpy.int64'>'
	with 998819 stored elements in Compressed Sparse Row format>

In [53]:
cv.vocabulary_

{'understand': 42054,
 'would': 45028,
 'like': 23075,
 'assist': 3012,
 'need': 26857,
 'get': 16521,
 'private': 31512,
 'secure': 35574,
 'link': 23152,
 'sprintcare': 37844,
 'propose': 31771,
 'send': 35760,
 'several': 36005,
 'message': 25177,
 'one': 28477,
 'respond': 33886,
 'usual': 42707,
 'please': 30576,
 'we': 44017,
 'click': 7590,
 'top': 40859,
 'profile': 31657,
 'gain': 16103,
 'detail': 10789,
 'account': 289,
 'bad': 3795,
 'customer': 9541,
 'service': 35891,
 'saddening': 34890,
 'hear': 18096,
 'shoot': 36273,
 'dm': 11607,
 'look': 23577,
 'kc': 21654,
 'go': 16834,
 'to': 40644,
 'magically': 24109,
 'change': 6851,
 'connectivity': 8474,
 'whole': 44450,
 'family': 14496,
 'lyingface': 23927,
 'hundredpoint': 19065,
 'concern': 8276,
 'direct': 11211,
 'aa': 0,
 'since': 36595,
 'sign': 36481,
 'yousince': 45624,
 'day': 9974,
 'definitely': 10273,
 'work': 44918,
 'long': 23554,
 'experience': 14013,
 'issue': 20602,
 'lie': 23004,
 'great': 17098,
 'connec

In [54]:
text_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### N-Gram

**Advantages of N-grams in NLP:**

1. **Capturing Local Context:** N-grams capture local context by considering sequences of adjacent words. This helps in preserving some level of word order information, making them useful for tasks where local context matters, such as part-of-speech tagging or named entity recognition.

2. **Improved Feature Representation:** N-grams provide a more comprehensive feature representation compared to Bag-of-Words models. By considering sequences of words, they capture some syntactic and semantic information, addressing the limitation of word independence in Bag-of-Words.

3. **Enhanced Performance in Some Tasks:** N-grams can be beneficial for certain NLP tasks where the order of words is critical. For example, in tasks like machine translation or sentiment analysis, considering word sequences can lead to improved performance compared to models that ignore word order.

4. **Contextual Understanding:** N-grams contribute to a better understanding of the context in which words appear. This is particularly valuable for tasks requiring a nuanced comprehension of language, such as sentiment analysis or text summarization.

5. **Flexibility in Model Complexity:** The use of n-grams allows for flexibility in adjusting the model complexity. By choosing the appropriate n (the size of the n-gram), one can control the level of context considered, tailoring the model to the specific requirements of the task.

**Disadvantages of N-grams in NLP:**

1. **Increased Dimensionality:** N-grams can lead to a significant increase in the dimensionality of the feature space, especially when considering higher-order n-grams. This may result in sparsity issues and increased computational requirements.

2. **Data Sparsity:** Generating n-grams from limited data can lead to data sparsity issues, especially for higher-order n-grams. Some n-grams may occur infrequently or not at all, making it challenging for models to learn meaningful associations.

3. **Loss of Generalization:** Including higher-order n-grams may lead to overfitting, especially when working with limited training data. The model might memorize specific n-gram combinations that are present in the training set but do not generalize well to unseen data.

4. **Limited Semantic Understanding:** While n-grams capture local context, they may not capture long-range dependencies or complex semantic relationships between distant words. This limitation is more pronounced in tasks requiring a deep understanding of meaning and context.

5. **Increased Model Complexity:** Including higher-order n-grams can significantly increase the complexity of models, making them harder to interpret and potentially reducing their generalization to new data. Striking a balance between complexity and performance is essential.

In [55]:
cv13 = CountVectorizer(ngram_range=(1,3))

In [56]:
text_cv13 = df["text"].tolist()
print(text_cv13[:20])

['understand would like assist would need get private secure link assist', 'sprintcare propose', 'sprintcare send several private message one respond usual', "please send we private message assist click ' message ' top profile", 'sprintcare', 'please send we private message gain detail account', 'sprintcare bad customer service', 'saddening hear please shoot we dm look kc', 'sprintcare go to magically change connectivity whole family lyingface hundredpoint', 'understand concern we d like please send we direct message assist aa', 'sprintcare since sign yousince day', 'h we d definitely like work long experience issue aa', '\' lie " great " connection bar lte still \' load something smh', 'please send private message send link access account fr', 'whenever contact customer support tell shortcode enable account never year try', 'information incorrect jk', 'askspectrum would like email copy one since spectrum update training', 'department part corporate office you re particular area go for

In [57]:
text_cv13 = cv13.fit_transform(text_cv13)
text_cv13

<100000x1159156 sparse matrix of type '<class 'numpy.int64'>'
	with 2778535 stored elements in Compressed Sparse Row format>

In [58]:
text_cv13.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### TF-IDF

**Advantages of TF-IDF (Term Frequency-Inverse Document Frequency):**

1. **Term Importance:** TF-IDF assigns weights to terms based on their importance in a document relative to the entire corpus. This helps highlight key terms that are discriminative for a particular document but not necessarily common across all documents.

2. **Dimensionality Reduction:** TF-IDF can be seen as a form of dimensionality reduction compared to the Bag-of-Words model. By assigning weights to terms, it focuses on the most relevant features, reducing the impact of common but less informative words.

3. **Language Agnosticism:** Like Bag-of-Words, TF-IDF is language-agnostic and can be applied to texts in different languages without requiring language-specific modifications.

4. **Versatility in Information Retrieval:** TF-IDF is widely used in information retrieval systems. It helps rank documents based on their relevance to a query by emphasizing terms that are both frequent in the document and relatively rare in the entire corpus.

5. **Normalization of Term Frequencies:** TF-IDF normalizes term frequencies by taking into account the inverse document frequency, addressing issues associated with the varying lengths of documents and ensuring that common terms across the entire corpus don't dominate the representation.

**Disadvantages of TF-IDF (Term Frequency-Inverse Document Frequency):**

1. **Lack of Semantic Understanding:** Like Bag-of-Words, TF-IDF doesn't capture the semantic relationships between words. It treats words independently, which can be a limitation for tasks requiring a deeper understanding of language semantics.

2. **Loss of Word Order:** Similar to Bag-of-Words, TF-IDF discards word order information, treating each document as an unordered set of terms. This limits its effectiveness in tasks where word order and context are crucial.

3. **Sparse Representations:** TF-IDF vectors can still be sparse, especially in the presence of a large vocabulary. This sparsity may pose challenges in terms of computational efficiency and memory requirements.

4. **Sensitivity to Stopwords:** TF-IDF is sensitive to stopwords, which are common words that may not contribute much to the meaning of a document. While stopwords can be filtered out, the choice of which words to consider as stopwords can be subjective.

5. **Difficulty Handling Synonyms:** TF-IDF may struggle to handle synonyms effectively since it relies on exact term matches. Synonyms might be treated as distinct terms, and the model may not inherently capture their semantic equivalence.

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
tfidf = TfidfVectorizer()

In [61]:
text_tf = df["text"].tolist()
print(text_tf[:20])

['understand would like assist would need get private secure link assist', 'sprintcare propose', 'sprintcare send several private message one respond usual', "please send we private message assist click ' message ' top profile", 'sprintcare', 'please send we private message gain detail account', 'sprintcare bad customer service', 'saddening hear please shoot we dm look kc', 'sprintcare go to magically change connectivity whole family lyingface hundredpoint', 'understand concern we d like please send we direct message assist aa', 'sprintcare since sign yousince day', 'h we d definitely like work long experience issue aa', '\' lie " great " connection bar lte still \' load something smh', 'please send private message send link access account fr', 'whenever contact customer support tell shortcode enable account never year try', 'information incorrect jk', 'askspectrum would like email copy one since spectrum update training', 'department part corporate office you re particular area go for

In [62]:
text_tf = tfidf.fit_transform(text_tf)
text_tf

<100000x48605 sparse matrix of type '<class 'numpy.float64'>'
	with 998819 stored elements in Compressed Sparse Row format>

In [63]:
text_tf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
tfidf.idf_

array([ 7.46307946, 11.81978828, 11.81978828, ..., 11.41432318,
       11.81978828, 11.81978828])