In [None]:
  # Mount Google Drive at /content/drive so the notebook can reach the project files stored there.
  from google.colab import drive
  drive.mount('/content/drive')
  # Folder that holds the project's data files.
  # NOTE(review): the other visible cells spell this path out literally instead of using this variable.
  shared_folder_path = '/content/drive/My Drive/Colab Notebooks/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


!pip install swifter
!pip install dask[dataframe]
!pip install dask
!pip install spacy
!python -m spacy download en_core_web_sm



In [None]:
#this cell is going to contain all necessary libraries
import ast   # to parse stringified lists
import glob
import os
import re
import string

import dask.dataframe as dd
import nltk
import pandas as pd
import spacy

In [None]:
#download the NLTK resources the notebook relies on
#punkt / punkt_tab: tokenizer models used to split a text into words or sentences
nltk.download("punkt")
nltk.download("punkt_tab")
#stopwords: the stopword lists
nltk.download("stopwords")
#wordnet: maps words to their base form (lemmatization)
nltk.download("wordnet")
#POS tagger models: NLTK releases that use punkt_tab look the tagger up under the "_eng" name,
#so both the legacy and the current resource are fetched (an unknown id only reports an error)
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
#imports that give access to the tokenization, stopword, lemmatization, tagging and modelling functions

from nltk.tokenize import word_tokenize    #to split the texts into tokens
from nltk.corpus import stopwords           #corpus reader used to obtain the stopword list
from nltk.stem import WordNetLemmatizer     #to lemmatize
from nltk.tokenize import sent_tokenize     #to split the sentences from each other
from collections import Counter
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk import pos_tag                 #to be able to tag part-of-speech
from nltk.corpus import wordnet
from dask import delayed
#load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")



In [None]:
#english stopwords
stop_words = set(stopwords.words('english'))


In [None]:
!ls "/content/drive/My Drive/Colab Notebooks/"


In [None]:

train_file = '/content/drive/My Drive/Colab Notebooks/train_data.csv'

train_df = pd.read_csv(train_file)



In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
#first the train dataset should be cleaned
#the first step: convert everything in the review column into strings and lower-case it
#missing reviews become empty strings, so the later text cleaning never receives NaN
train_df["Review"]=train_df["Review"].fillna("").astype(str).str.lower()

In [None]:
train_df["Review"].head()

In [None]:
#there might be duplicates in the train data that need to be handled
train_duplicates = train_df.duplicated(subset=['Review']).sum()



In [None]:
print(train_duplicates)

In [None]:
#to eliminate the duplicates and just keep the first one
train_df = train_df.drop_duplicates(subset=['Review'], keep='first').reset_index(drop=True)


In [None]:
train_df["Label"].value_counts()

In [None]:
#for the rest, a function would help, the function should do everything
def preprocess_text(text, stopword_set=None):
    """Clean one (already lower-cased) review and return it as a plain string.

    Steps, in order:
      1. remove HTML tags, URLs and hashtags while their delimiters still exist,
      2. delete every character that is not a-z or whitespace,
      3. drop stopwords and collapse the remaining whitespace.

    The markup removal has to run first: once '<', '>', ':', '/', '.' and '#'
    have been deleted, the HTML/URL/hashtag patterns can no longer match.

    Parameters
    ----------
    text : str
        Review text. Upper-case letters are outside the a-z whitelist, so the
        caller is expected to lower-case the text beforehand.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The kept words joined by single spaces; an empty string when ``text``
        is not a string (for example NaN coming from a missing review).
    """
    if not isinstance(text, str):
        return ''
    words_to_drop = stop_words if stopword_set is None else stopword_set

    # Markup is replaced by a space so the words on either side do not get glued together.
    text = re.sub(r'<.*?>', ' ', text)                       # remove HTML
    text = re.sub(r'http[s]?://\S+|www\.\S+', ' ', text)     # remove URLs
    text = re.sub(r'#\S+', ' ', text)                        # remove hashtags

    # Keep only a-z and whitespace; this also removes all punctuation and digits.
    text = re.sub(r'[^a-z\s]', '', text)

    # split() without arguments already discards leading, trailing and repeated whitespace.
    return ' '.join(word for word in text.split() if word not in words_to_drop)


In [None]:
#the following adds a new column containing the clean reviews to our dataframe
train_df["clean_review"]=train_df["Review"].apply(preprocess_text)

In [None]:
train_df.columns

In [None]:
train_df.head()

In [None]:
#since some projects also remove very frequent words, look at the 10 most frequent words in train_df
#before deciding whether they should stay or be removed
cnt = Counter(
    token
    for review in train_df["clean_review"].values
    for token in review.split()
)

cnt.most_common(10)

In [None]:
#some words have nothing to do with sentiment analysis and can be removed
custom_stopwords={'book', 'one', 'would', 'get', 'read', 'time'}
stop_words.update(custom_stopwords)
#clean_review was built before these words were added to stop_words, so filter the column again;
#otherwise the custom stopwords never reach the data that is saved and tokenized afterwards
train_df["clean_review"]=train_df["clean_review"].apply(
    lambda review: ' '.join(word for word in review.split() if word not in custom_stopwords)
)

In [None]:
#let us save the file as a csv
train_df.to_csv( "/content/drive/My Drive/Colab Notebooks/clean_review_train.csv", index=False)


In [None]:
!ls "/content/drive/My Drive/Colab Notebooks/"


In [None]:
#to check the clean_review_train, it is better that it gets reloaded into a dataframe

clean_train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/clean_review_train.csv")

print(clean_train.head())


In [None]:
#the next step is tokenizing the text, the function below will do that
def tokenize_text(text):
  """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
  tokens = word_tokenize(text)
  return tokens

In [None]:
#reviews that ended up empty after cleaning are read back from the CSV as NaN;
#fill them with "" first, otherwise astype(str) turns them into the literal word "nan"
clean_train["clean_review"]=clean_train["clean_review"].fillna("").astype(str)

In [None]:
#the size of the data is too large that is why it needs to be tokenized it in chunks
chunk_size = 5000



In [None]:
#tokenizing the whole dataframe at once exhausted the RAM and crashed the session,
#so the dataframe is turned into a dask dataframe that is processed partition by partition

# changing into a dask dataframe
# at least one partition is required; the plain division yields 0 for a frame shorter than chunk_size
ddf = dd.from_pandas(clean_train, npartitions=max(1, len(clean_train) // chunk_size))


In [None]:
print(ddf.head())
print(ddf.npartitions)  # Verify number of partitions which is 719


In [None]:
# tokenization to each partition
ddf["tokenized"] = ddf["clean_review"].map(tokenize_text, meta=('clean_review', 'object'))


In [None]:
print(ddf.head())
print(f"Number of rows: {len(ddf)}")



In [None]:
# Save the result directly to multiple CSV files, one per partition (719 for the full training set)
# The files are written explicitly into the tokenized_data folder that the later cells read from,
# instead of into whatever the current working directory happens to be
tokenized_dir = "/content/drive/MyDrive/Colab Notebooks/tokenized_data/"
os.makedirs(tokenized_dir, exist_ok=True)
ddf.to_csv(os.path.join(tokenized_dir, "processed_data_*.csv"), index=False)


In [None]:

input_path = "/content/drive/MyDrive/Colab Notebooks/tokenized_data/"
output_path = "/content/drive/MyDrive/Colab Notebooks/lemmatized_data/"
os.makedirs(output_path, exist_ok=True)

In [None]:
#the part-of-speech tagging from the NLTK did not work on my system there was an error that could not be debugged
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN


In [None]:
def lemmatize_tokens_with_pos(tokenized_text):
    """Lemmatize a sequence of tokens, using each token's POS tag to pick the lemma.

    Parameters
    ----------
    tokenized_text : list of str or str
        Either the tokens themselves or the string representation of a token
        list (the form a list column takes after a round trip through CSV).

    Returns
    -------
    list of str
        The lemmatized tokens. An empty list is returned when the input is
        missing, cannot be parsed into a sequence of tokens, or when tagging /
        lemmatizing this particular row fails.

    Raises
    ------
    LookupError
        When NLTK cannot find a required resource. This is an environment
        problem that would affect every row, so it is not hidden behind an
        empty result.
    """
    try:
        # If tokenized_text is a stringified list, convert it to a list
        if isinstance(tokenized_text, str):
            tokens = ast.literal_eval(tokenized_text)
        else:
            tokens = tokenized_text

        # Missing values (None / NaN) and scalars are not token sequences; a bare
        # string would otherwise be tagged character by character.
        if tokens is None or isinstance(tokens, (str, bytes, int, float)):
            return []

        # Perform POS tagging
        pos_tags = pos_tag(tokens)

        return [
            lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags
        ]
    except Exception as e:
        # NLTK reports a missing resource with a plain LookupError (not a KeyError or
        # IndexError subclass); let that one propagate instead of emptying every row.
        if type(e) is LookupError:
            raise
        print(f"Error in lemmatization: {e}")
        return []

In [None]:
def process_large_file(input_file, output_file, chunk_size=5000):
    """Lemmatize the ``tokenized`` column of a CSV file chunk by chunk.

    The input is read ``chunk_size`` rows at a time. Each chunk gets a new
    ``lemmatized`` column and is written to ``output_file``: the first chunk
    creates the file with a header, later chunks are appended without one.
    Any exception is reported with a printed message instead of being raised.
    """
    try:
        with pd.read_csv(input_file, chunksize=chunk_size) as chunk_reader:
            for chunk_number, frame in enumerate(chunk_reader, start=1):
                print(f"Processing chunk {chunk_number} from file {input_file}...")

                frame["lemmatized"] = frame["tokenized"].apply(lemmatize_tokens_with_pos)

                # Only the first chunk may overwrite the file and emit the header row.
                is_first_chunk = chunk_number == 1
                frame.to_csv(
                    output_file,
                    mode="w" if is_first_chunk else "a",
                    header=is_first_chunk,
                    index=False,
                )
                print(f"Chunk {chunk_number} saved to {output_file}.")
    except Exception as error:
        print(f"Error processing file {input_file}: {error}")


In [None]:
#from here, the code is not part of the main analysis, rather, the point is to check the result of the files
#in both tokenized and lemmatized files


print(os.getcwd())

/content/drive/MyDrive/Colab Notebooks/tokenized_data


In [None]:
os.chdir("/content/drive/MyDrive/Colab Notebooks/tokenized_data/")

In [None]:
#to see some lines of the tokenized train data
#NOTE(review): token_train is not assigned in any of the visible cells; it presumably holds one of the
#tokenized CSV files loaded with pd.read_csv - confirm and add the loading step so a fresh run does not fail
print(token_train["tokenized"].head(20))

0     ['twenty', 'thousand', 'really', 'disliked', '...
1     ['twenty', 'thousand', 'leagues', 'sea', 'twen...
2     ['fast', 'read', 'unique', 'lifetime', 'advent...
3     ['horrible', 'narration', 'narrator', 'europea...
4     ['buy', 'jerseyxray', 'paid', 'promptlyno', 'i...
5     ['urk', 'okay', 'maybe', 'shouldnt', 'mean', '...
6     ['twenty', 'thousand', 'leagues', 'sea', 'twen...
7     ['know', 'classic', 'literacy', 'wiill', 'neve...
8     ['trial', 'read', 'describe', 'much', 'enjoy',...
9     ['personally', 'enjoyed', 'book', 'tremendousl...
10    ['dull', 'story', 'agree', 'reviews', 'sense',...
11    ['bad', 'printing', 'choices', 'commenting', '...
12    ['like', 'book', 'book', 'leagues', 'sea', 're...
13    ['classics', 'greatobviously', 'read', 'loud',...
14    ['childrens', 'version', 'original', 'story', ...
15    ['free', 'good', 'first', 'kindle', 'ebook', '...
16    ['classic', 'novel', 'novel', 'fanciful', 'tal...
17    ['causes', 'kindle', 'freeze', 'book', 'ca

In [None]:
#now the lemmatized train data: a random file, for example lemmatized_data_003, can be checked
direct_path="/content/drive/MyDrive/Colab Notebooks/lemmatized_data"
file_check_2='/content/drive/MyDrive/Colab Notebooks/lemmatized_data/lemmatized_data_003.csv'

#make sure the folder exists before trying to read from it
os.makedirs(direct_path, exist_ok=True)

#load the file and list its columns; a failure is reported instead of raised
try:
    lemma_train = pd.read_csv(file_check_2)
    print("columns in the file:", lemma_train.columns.tolist(), sep="\n")
except Exception as read_error:
    print(f"Error reading the file {file_check_2}: {read_error}")


columns in the file:
['Label', 'Review', 'clean_review', 'tokenized', 'lemmatized']


In [None]:
lemma_train.head()

Unnamed: 0,Label,Review,clean_review,tokenized,lemmatized
0,0,twenty thousand: i really disliked this book. ...,twenty thousand really disliked book think fac...,"['twenty', 'thousand', 'really', 'disliked', '...","['twenty', 'thousand', 'really', 'dislike', 'b..."
1,1,twenty thousand leagues under the sea: twenty ...,twenty thousand leagues sea twenty thousand le...,"['twenty', 'thousand', 'leagues', 'sea', 'twen...","['twenty', 'thousand', 'league', 'sea', 'twent..."
2,1,fast read of a unique lifetime adventure: if y...,fast read unique lifetime adventure ever wonde...,"['fast', 'read', 'unique', 'lifetime', 'advent...","['fast', 'read', 'unique', 'lifetime', 'advent..."
3,0,horrible narration: the narrator should have a...,horrible narration narrator european french ac...,"['horrible', 'narration', 'narrator', 'europea...","['horrible', 'narration', 'narrator', 'europea..."
4,0,"do not buy from ""jersey.xray""....: i paid prom...",buy jerseyxray paid promptlyno itemi wrote the...,"['buy', 'jerseyxray', 'paid', 'promptlyno', 'i...","['buy', 'jerseyxray', 'pay', 'promptlyno', 'it..."


In [None]:
lemma_train.dtypes

Unnamed: 0,0
Label,int64
Review,object
clean_review,object
tokenized,object
lemmatized,object


In [None]:


folder_path = "/content/drive/MyDrive/Colab Notebooks/lemmatized_data"  # folder with the lemmatized CSV files

output_file = "/content/drive/MyDrive/Colab Notebooks/combined_lemmatized_traindata"

# Read every CSV once and concatenate a single time: growing a DataFrame with pd.concat
# inside the loop copies all previously collected rows again on every iteration.
# The names are sorted because os.listdir returns them in arbitrary order.
frames = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # `data` keeps pointing at the most recently read file, as in the original loop
        data = pd.read_csv(file_path, usecols=['Label', 'lemmatized'])
        frames.append(data)

# pd.concat raises on an empty list, so fall back to an empty frame when no CSV was found
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

combined_data.to_csv(output_file, index=False, header=True)
print(f"Combined file saved to: {output_file}")


Combined file saved to: /content/drive/MyDrive/Colab Notebooks/combined_lemmatized_traindata


In [None]:
data.head()

Unnamed: 0,Label,lemmatized
0,0,"['comfort', 'crappy', 'quality', 'purchase', '..."
1,1,"['intex', 'supreme', 'airflow', 'queen', 'airb..."
2,0,"['extremely', 'disappointed', 'purchase', 'air..."
3,0,"['long', 'term', 'use', 'sleep', 'every', 'nig..."
4,1,"['good', 'value', 'great', 'bed', 'bed', 'comf..."


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5006 entries, 0 to 5005
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Label       5006 non-null   int64 
 1   lemmatized  5006 non-null   object
dtypes: int64(1), object(1)
memory usage: 78.3+ KB


In [None]:
data.isnull().sum()

Unnamed: 0,0
Label,0
lemmatized,0


In [None]:
data['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,2519
0,2487


In [None]:
data.duplicated().sum()

0

In [None]:
data.