In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load CSV Data
df = pd.read_csv("/content/Sample data.csv")

# Preprocess Text Data
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to all columns
for col in df.columns:
    df[col] = df[col].apply(preprocess_text)

# Concatenate all text columns into one
# Concatenate all text columns into one
df['all_text'] = df.astype(str).apply(' '.join, axis=1)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['all_text'])

# Define the search similar words function
def search_similar_words(input_word, top_n=5):
    input_word_vector = tfidf_vectorizer.transform([input_word])
    cosine_similarities = cosine_similarity(input_word_vector, tfidf_matrix).flatten()
    related_indices = cosine_similarities.argsort()[::-1][:top_n]
    return df.iloc[related_indices]

# Example usage
similar_words_df = search_similar_words("Urbanic")
print(similar_words_df)


   ï»¿Respondent ID Collector ID       Start Date         End Date  \
91      13257735481    413955084  20220113 191020  20220113 192457   
56      13257291158    413905455  20220113 142919  20220113 154227   
7       13257059830    413905455  20220113 105722  20220113 111905   
50      13257398803    413905455  20220113 154500  20220113 162429   
72      13257228982    413905455  20220113 134051  20220113 134734   

      IP Address Email Address First Name Last Name Custom Data 1  \
91     493687176           nan        nan       nan           nan   
56   11799169203           nan        nan       nan           nan   
7    11799169203           nan        nan       nan           nan   
50   11799169203           nan        nan       nan           nan   
72  122166119123           nan        nan       nan           nan   

   When was the last time you ordered apparel online?  ...  \
91                               in the last 6 months  ...   
56                               in the 

  df['all_text'] = df.astype(str).apply(' '.join, axis=1)


**important code**

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load CSV Data
df = pd.read_csv("/content/Sample data.csv")

# Preprocess Text Data
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to all columns
for col in df.columns:
    df[col] = df[col].apply(preprocess_text)

# Concatenate all text columns into one
df['all_text'] = pd.concat([df[col].astype(str) for col in df.columns], axis=1).apply(lambda row: ' '.join(row), axis=1)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['all_text'])

# Define the search similar words function
def search_similar_words(input_word):
    input_word_vector = tfidf_vectorizer.transform([input_word])
    cosine_similarities = cosine_similarity(input_word_vector, tfidf_matrix).flatten()
    related_indices = cosine_similarities.argsort()[::-1]
    similar_rows = df.iloc[related_indices]
    return similar_rows[cosine_similarities > 0]  # Return all matches

# Example usage
similar_words_df = search_similar_words("male")
print(similar_words_df)


   ï»¿Respondent ID Collector ID       Start Date         End Date  \
8       13257054510    413905455  20220113 105145  20220113 111428   
65      13257254534    413905455  20220113 135918  20220113 142033   
71      13257228073    413905455  20220113 134005  20220113 135004   
73      13257218552    413905455  20220113 133213  20220113 134110   
98      13259764922    413956607  20220114 140826  20220114 141941   
28      13254635013    413905455  20220112 150210  20220112 150839   
67      13257241476    413905455  20220113 135129  20220113 140940   
35      13254475727    413905455  20220112 130805  20220112 132737   
69      13257233604    413905455  20220113 134448  20220113 140034   
15      13254821748    413905455  20220112 170000  20220112 170656   
51      13257443454    413905455  20220113 161546  20220113 162106   
79      13257185311    413905455  20220113 130238  20220113 130851   
68      13257245385    413905455  20220113 135439  20220113 140116   
37      13254428638 

  df['all_text'] = pd.concat([df[col].astype(str) for col in df.columns], axis=1).apply(lambda row: ' '.join(row), axis=1)


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# TF-IDF Vectorization with optimized parameters
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(df['all_text'])

# Dimensionality reduction with Truncated SVD
svd = TruncatedSVD(n_components=100)
tfidf_matrix_svd = svd.fit_transform(tfidf_matrix)

# Approximate Nearest Neighbors search
ann_model = NearestNeighbors(n_neighbors=100, algorithm='auto')
ann_model.fit(tfidf_matrix_svd)

def search_similar_words(input_word):
    input_word_vector = tfidf_vectorizer.transform([input_word])
    input_word_vector_svd = svd.transform(input_word_vector)
    distances, indices = ann_model.kneighbors(input_word_vector_svd)
    all_similar_rows = df.iloc[indices.flatten()]
    # Filter rows with non-zero cosine similarity
    similar_rows = all_similar_rows[distances.flatten() > 0]
    return similar_rows


# Example usage
similar_words_df = search_similar_words("female")
print(similar_words_df)


   ï»¿Respondent ID Collector ID       Start Date         End Date  \
84      13258103821    413956607  20220113 214549  20220113 214756   
63      13257277703    413905455  20220113 141930  20220113 142353   
38      13254416448    413905455  20220112 121710  20220112 122631   
2       13257116601    413905455  20220113 115757  20220113 120429   
99      13259759044    413905455  20220114 140318  20220114 141125   
..              ...          ...              ...              ...   
18      13254797080    413905455  20220112 164628  20220112 165705   
73      13257218552    413905455  20220113 133213  20220113 134110   
11      13254869551    413905455  20220112 172536  20220112 172938   
89      13257688706    413955084  20220113 184738  20220113 193729   
83      13258113170    413956607  20220113 214927  20220113 215154   

     IP Address Email Address First Name Last Name Custom Data 1  \
84  12217724739           nan        nan       nan           nan   
63    493612757        

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# Load CSV Data
df = pd.read_csv("/content/Sample data.csv")

# Preprocess Text Data
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to all columns
for col in df.columns:
    df[col] = df[col].apply(preprocess_text)

# Concatenate all text columns into one
df['all_text'] = pd.concat([df[col].astype(str) for col in df.columns], axis=1).apply(lambda row: ' '.join(row), axis=1)

# TF-IDF Vectorization with optimized parameters
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(df['all_text'])

# Dimensionality reduction with Truncated SVD
svd = TruncatedSVD(n_components=100)
tfidf_matrix_svd = svd.fit_transform(tfidf_matrix)

# Approximate Nearest Neighbors search
ann_model = NearestNeighbors(n_neighbors=len(df), algorithm='auto')
ann_model.fit(tfidf_matrix_svd)

def search_similar_words(input_word):
    input_word_vector = tfidf_vectorizer.transform([input_word])
    input_word_vector_svd = svd.transform(input_word_vector)
    distances, indices = ann_model.kneighbors(input_word_vector_svd)
    all_similar_rows = df.iloc[indices.flatten()]
    # Filter rows with non-zero cosine similarity
    similar_rows = all_similar_rows[distances.flatten() > 0]
    return similar_rows

# Example usage
similar_words_df = search_similar_words("13257387743")
print(similar_words_df)


  df['all_text'] = pd.concat([df[col].astype(str) for col in df.columns], axis=1).apply(lambda row: ' '.join(row), axis=1)


   ï»¿Respondent ID Collector ID       Start Date         End Date  \
27      13254646588    413905455  20220112 150935  20220112 152255   
22      13254699723    413905455  20220112 154513  20220112 155400   
52      13257412998    413905455  20220113 155434  20220113 161142   
41      13254298362    413905455  20220112 103851  20220112 105148   
55      13257387743    413905455  20220113 153740  20220113 154641   
..              ...          ...              ...              ...   
88      13257771019    413956607  20220113 192724  20220113 194728   
33      13254501520    413905455  20220112 132941  20220112 133649   
4       13257080485    413905455  20220113 111949  20220113 114121   
58      13257347719    413905455  20220113 150857  20220113 151726   
83      13258113170    413956607  20220113 214927  20220113 215154   

      IP Address Email Address First Name Last Name Custom Data 1  \
27  122161252193           nan        nan       nan           nan   
22  122161252193     

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load CSV Data
df = pd.read_csv("/content/Sample data.csv")

# Preprocess Text Data
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to all columns
for col in df.columns:
    df[col] = df[col].apply(preprocess_text)

# Concatenate all text columns into one
df['all_text'] = pd.concat([df[col].astype(str) for col in df.columns], axis=1).apply(lambda row: ' '.join(row), axis=1)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(df['all_text'])

# Approximate Nearest Neighbors search with a different algorithm
ann_model = NearestNeighbors(n_neighbors=len(df), algorithm='auto')
ann_model.fit(tfidf_matrix)

def search_similar_words(input_word):
    input_word_vector = tfidf_vectorizer.transform([input_word])
    distances, indices = ann_model.kneighbors(input_word_vector)
    all_similar_rows = df.iloc[indices.flatten()]
    # Filter rows with non-zero cosine similarity
    similar_rows = all_similar_rows[distances.flatten() > 0]
    return similar_rows

# Example usage
similar_words_df = search_similar_words("zara")
print(similar_words_df)


   ï»¿Respondent ID Collector ID       Start Date         End Date  \
66      13257250421    413905455  20220113 135828  20220113 141022   
1       13257124027    413905455  20220113 120549  20220113 121649   
76      13257181321    413905455  20220113 125904  20220113 133652   
81      13257149831    413905455  20220113 123043  20220113 124219   
0       13257101478    413905455  20220113 114214  20220113 122227   
..              ...          ...              ...              ...   
59      13257291711    413905455  20220113 142911  20220113 144622   
58      13257347719    413905455  20220113 150857  20220113 151726   
33      13254501520    413905455  20220112 132941  20220112 133649   
88      13257771019    413956607  20220113 192724  20220113 194728   
30      13254577828    413905455  20220112 142505  20220112 144629   

      IP Address Email Address First Name Last Name Custom Data 1  \
66    1573718052           nan        nan       nan           nan   
1     1573718052     

  df['all_text'] = pd.concat([df[col].astype(str) for col in df.columns], axis=1).apply(lambda row: ' '.join(row), axis=1)


In [None]:
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [None]:
import pandas as pd

# Load your CSV file
df = pd.read_csv('/content/Sample data.csv')

# Define your keyword
keyword = 'zara'

# Use the apply function to check each cell for the keyword
mask = df.applymap(lambda x: keyword.lower() in str(x).lower())

# Get the rows where the keyword is found
result = df[mask.any(axis=1)]

# Print the result
print(result)


    ï»¿Respondent ID  Collector ID           Start Date             End Date  \
0        13257101478     413905455  2022-01-13 11:42:14  2022-01-13 12:22:27   
1        13257124027     413905455  2022-01-13 12:05:49  2022-01-13 12:16:49   
4        13257080485     413905455  2022-01-13 11:19:49  2022-01-13 11:41:21   
7        13257059830     413905455  2022-01-13 10:57:22  2022-01-13 11:19:05   
8        13257054510     413905455  2022-01-13 10:51:45  2022-01-13 11:14:28   
25       13254668858     413905455  2022-01-12 15:24:48  2022-01-12 15:31:47   
27       13254646588     413905455  2022-01-12 15:09:35  2022-01-12 15:22:55   
40       13254307047     413905455  2022-01-12 10:46:07  2022-01-12 10:57:47   
50       13257398803     413905455  2022-01-13 15:45:00  2022-01-13 16:24:29   
56       13257291158     413905455  2022-01-13 14:29:19  2022-01-13 15:42:27   
66       13257250421     413905455  2022-01-13 13:58:28  2022-01-13 14:10:22   
75       13257215194     413905455  2022

In [None]:
/content/Sample data.csv

In [None]:
!pip install transformers




In [None]:
import pandas as pd
from transformers import BertForMaskedLM, BertTokenizer

# Load your CSV file
df = pd.read_csv('/content/Sample data.csv')

# Define your keyword
keyword = 'frui'

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Prepare the inputs for the model
inputs = tokenizer.encode(f'{keyword} {tokenizer.mask_token}', return_tensors='pt')

# Get the prediction from the model
prediction = model(inputs)[0]

# Get the index of the masked token
masked_index = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

# Get the top 5 predictions
top_5_predictions = prediction[0, masked_index].topk(5).indices.tolist()

# Get the corrected keyword
corrected_keyword = tokenizer.decode(top_5_predictions[0])

# Use the apply function to check each cell for the keyword
mask = df.applymap(lambda x: corrected_keyword.lower() in str(x).lower())

# Get the rows where the keyword is found
result = df[mask.any(axis=1)]

# Print the result
print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


IndexError: index 103 is out of bounds for dimension 1 with size 5