In [5]:
import pandas as pd
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer


In [1]:
# Directory that holds the review dataset files. This is an absolute,
# machine-specific path, so adjust it before running elsewhere. The trailing
# slash matters: file names are appended to it by plain string concatenation.
dataset_path = "/home/samuel/NYU/BDS/project/review_dataset/"

def read_tsv_file(file_path):
    """Load a tab-separated file into a pandas DataFrame of raw string fields.

    The first row of the file is used as the column headers (and echoed to
    stdout); every following row becomes one DataFrame row. Field values are
    kept exactly as ``csv.reader`` yields them, i.e. as strings, with no type
    inference.

    Args:
        file_path: Path of the TSV file to read. The file is opened as UTF-8.

    Returns:
        A ``pandas.DataFrame`` with one column per header, or ``None`` when the
        file is missing, empty, or cannot be parsed. In those cases a message
        describing the problem is printed instead of raising.
    """
    try:
        with open(file_path, 'r', newline='', encoding='utf-8') as file:
            # Using the CSV reader with the tab delimiter
            reader = csv.reader(file, delimiter='\t')

            # A completely empty file has no header row. Report that explicitly
            # rather than letting StopIteration surface as a blank error message.
            headers = next(reader, None)
            if headers is None:
                print(f"Error: File at '{file_path}' is empty")
                return None
            print("Headers:", headers)

            # Remaining rows are the data records.
            data = list(reader)

        # Create a Pandas DataFrame using the headers and data
        return pd.DataFrame(data, columns=headers)

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except Exception as e:
        # Best-effort loader: report the problem (e.g. rows whose field count
        # does not match the header) and fall through to the None return.
        print(f"An error occurred: {e}")
    return None
# Build the path of the training split and load it into a DataFrame.
file_path = dataset_path + 'drugsComTrain_raw.tsv'
dataset = read_tsv_file(file_path)

# read_tsv_file reports problems by printing a message and returning None.
# Stop here in that case so the failure is not rediscovered in a later cell as
# an AttributeError on None.
if dataset is None:
    raise RuntimeError(f"Could not load dataset from '{file_path}'")

Headers: ['', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount']


In [4]:
def process_dataset_df(dataset):
    """Clean every review and store the result in a 'processed_review' column.

    Each value of the 'review' column goes through these steps:
      1. markup is stripped with BeautifulSoup's ``get_text()``;
      2. the text is split on whitespace and lower-cased;
      3. English stop words are dropped;
      4. the remaining words are stemmed with the Porter stemmer;
      5. the stems are joined back together with single spaces.

    A progress counter is printed every 50000 reviews.

    Args:
        dataset: DataFrame with a 'review' column holding the raw review text.

    Returns:
        The same DataFrame object that was passed in, with the new
        'processed_review' column added in place.
    """
    # Loop-invariant resources: build the stop-word set and the stemmer once
    # instead of once per review.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    processed_reviews = []

    # Iterate over the column directly; only the review text is needed, and a
    # positional counter keeps progress reporting independent of index labels.
    for position, review_text in enumerate(dataset['review']):
        if position % 50000 == 0:
            print(position)

        # HTML decoding
        text_html = BeautifulSoup(review_text, 'html.parser').get_text()

        # Tokenization and removal of stop words
        lowered = (word.lower() for word in text_html.split())
        words = [word for word in lowered if word not in stop_words]

        # Stemming, then join the processed words back into a single string
        processed_reviews.append(' '.join(stemmer.stem(word) for word in words))

    # Add a new column 'processed_review' to the DataFrame
    dataset['processed_review'] = processed_reviews

    return dataset
# Run the cleaning pipeline over every review. process_dataset_df adds the
# 'processed_review' column to `dataset` itself and returns that same object,
# so `processed_dataset` and `dataset` refer to the same DataFrame.
processed_dataset = process_dataset_df(dataset)

0


  text_html = BeautifulSoup(review_text, 'html.parser').get_text()


50000
100000
150000


In [30]:
# Review counts per drug, most-reviewed first.
drug_names = processed_dataset['drugName'].value_counts()

# `tfidf_results` is created empty and is not filled in this cell; the per-drug
# tables are collected in `tfidf_results_dict`, keyed by drug name.
tfidf_results = pd.DataFrame()
tfidf_results_dict = {}

# One vectorizer, limited to 100 features, re-fitted for each drug below.
vectorizer = TfidfVectorizer(max_features=100)

# Compute TF-IDF separately for each of the ten most-reviewed drugs.
for drug_name in drug_names.head(10).index:
    # Keep only the reviews written about the current drug.
    drug_dataset = processed_dataset.loc[processed_dataset['drugName'] == drug_name]

    # Fit on this drug's processed reviews and transform them in one step.
    tfidf_matrix = vectorizer.fit_transform(drug_dataset['processed_review'])

    # Dense table: one column per learned term plus a trailing 'drugName' label.
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
    ).assign(drugName=drug_name)

    # Store the table for this drug.
    tfidf_results_dict[drug_name] = tfidf_df

# Show everything that was collected.
print("TF-IDF Results:")
print(tfidf_results_dict)


TF-IDF Results:
{'Levonorgestrel':             10       ago    almost      also  back       bad     birth  \
0     0.000000  0.000000  0.000000  0.000000   0.0  0.000000  0.000000   
1     0.000000  0.181273  0.410771  0.000000   0.0  0.000000  0.000000   
2     0.000000  0.000000  0.000000  0.000000   0.0  0.000000  0.000000   
3     0.279302  0.000000  0.284351  0.000000   0.0  0.000000  0.229721   
4     0.213656  0.191981  0.000000  0.395117   0.0  0.171630  0.000000   
...        ...       ...       ...       ...   ...       ...       ...   
3652  0.000000  0.000000  0.000000  0.000000   0.0  0.000000  0.273556   
3653  0.000000  0.000000  0.000000  0.199731   0.0  0.000000  0.000000   
3654  0.000000  0.000000  0.000000  0.000000   0.0  0.000000  0.000000   
3655  0.164207  0.147549  0.000000  0.303671   0.0  0.000000  0.135057   
3656  0.174998  0.000000  0.178161  0.000000   0.0  0.140576  0.000000   

         bleed  boyfriend     came  ...       use      want      week  \
0  

In [31]:
# Display the TF-IDF table stored for a single drug (one row per review of it).
tfidf_results_dict["Levonorgestrel"]

Unnamed: 0,10,ago,almost,also,back,bad,birth,bleed,boyfriend,came,...,use,want,week,weight,went,work,worri,would,year,drugName
0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,...,0.000000,0.000000,0.206600,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,Levonorgestrel
1,0.000000,0.181273,0.410771,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,...,0.000000,0.194217,0.000000,0.000000,0.000000,0.000000,0.0,0.161105,0.471408,Levonorgestrel
2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,Levonorgestrel
3,0.279302,0.000000,0.284351,0.000000,0.0,0.000000,0.229721,0.000000,0.0,0.00000,...,0.000000,0.268888,0.000000,0.267086,0.000000,0.000000,0.0,0.000000,0.000000,Levonorgestrel
4,0.213656,0.191981,0.000000,0.395117,0.0,0.171630,0.000000,0.000000,0.0,0.00000,...,0.000000,0.000000,0.135828,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,Levonorgestrel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3652,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.273556,0.000000,0.0,0.00000,...,0.000000,0.000000,0.000000,0.318051,0.000000,0.000000,0.0,0.000000,0.000000,Levonorgestrel
3653,0.000000,0.000000,0.000000,0.199731,0.0,0.000000,0.000000,0.000000,0.0,0.21068,...,0.000000,0.000000,0.137322,0.000000,0.000000,0.337452,0.0,0.000000,0.000000,Levonorgestrel
3654,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,...,0.000000,0.204766,0.000000,0.000000,0.367012,0.000000,0.0,0.000000,0.165671,Levonorgestrel
3655,0.164207,0.147549,0.000000,0.303671,0.0,0.000000,0.135057,0.136245,0.0,0.00000,...,0.150337,0.000000,0.000000,0.157025,0.000000,0.000000,0.0,0.000000,0.000000,Levonorgestrel
