Master Thesis

Find the data and load it.

In [6]:
### Import Data

import pandas as pd

# Location of the raw Amazon video-game reviews file (tab-separated values)
# NOTE(review): this is an absolute, machine-specific path — the cell only runs where this file exists
file_path = "C:\\Users\\sujka\\Documents\\Master_Thesis\\Files\\amazon_reviews_us_Video_Games_v1_00.tsv"

# Read the TSV; lines that cannot be parsed are skipped instead of raising an error
df = pd.read_csv(
    file_path,
    sep='\t',
    on_bad_lines='skip',
)

# Peek at the first rows to confirm the load worked
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,12039526,RTIS3L2M1F5SM,B001CXYMFS,737716809,Thrustmaster T-Flight Hotas X Flight Stick,Video Games,5,0,0,N,Y,an amazing joystick. I especially love that yo...,"Used this for Elite Dangerous on my mac, an am...",2015-08-31
1,US,9636577,R1ZV7R40OLHKD,B00M920ND6,569686175,Tonsee 6 buttons Wireless Optical Silent Gamin...,Video Games,5,0,0,N,Y,Definitely a silent mouse... Not a single clic...,"Loved it, I didn't even realise it was a gami...",2015-08-31
2,US,2331478,R3BH071QLH8QMC,B0029CSOD2,98937668,Hidden Mysteries: Titanic Secrets of the Fatef...,Video Games,1,0,1,N,Y,One Star,poor quality work and not as it is advertised.,2015-08-31
3,US,52495923,R127K9NTSXA2YH,B00GOOSV98,23143350,GelTabz Performance Thumb Grips - PlayStation ...,Video Games,3,0,0,N,Y,"good, but could be bettee","nice, but tend to slip away from stick in inte...",2015-08-31
4,US,14533949,R32ZWUXDJPW27Q,B00Y074JOM,821342511,Zero Suit Samus amiibo - Japan Import (Super S...,Video Games,4,0,0,N,Y,Great but flawed.,"Great amiibo, great for collecting. Quality ma...",2015-08-31


In [9]:
# The full dataset is too large for even simple tasks to run quickly, so we find
# the products with the most reviews and keep only those for our purposes.

# How many of the most-reviewed products to keep
TOP_N_PRODUCTS = 30

# Count the reviews per 'product_id'
review_counts = df.groupby('product_id').size().reset_index(name='review_count')

# Order products from most to fewest reviews and keep the top TOP_N_PRODUCTS
sorted_review_counts = (
    review_counts
    .sort_values(by='review_count', ascending=False)
    .iloc[:TOP_N_PRODUCTS]
)

# Keep only the reviews of those products; the inner join also attaches each
# product's 'review_count' to its review rows
merged_df = pd.merge(df, sorted_review_counts, on='product_id', how='inner')

# Number of retained reviews per product title (cell output)
merged_df.groupby('product_title').size().head(TOP_N_PRODUCTS)

                                         review_body  \
0  Used this for Elite Dangerous on my mac, an am...   
1  Loved it,  I didn't even realise it was a gami...   
2     poor quality work and not as it is advertised.   
3  nice, but tend to slip away from stick in inte...   
4  Great amiibo, great for collecting. Quality ma...   

                                      cleaned_review  
0  used this for elite dangerous on my mac an ama...  
1  loved it  i didnt even realise it was a gaming...  
2      poor quality work and not as it is advertised  
3  nice but tend to slip away from stick in inten...  
4  great amiibo great for collecting quality mate...  


In [None]:
# text cleaning

import re

# Function to clean text
def clean_text(text):
    """Normalise a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags (e.g. ``<br />``) with a space so tag names do not
         survive as words;
      3. replace URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace) with a space;
      4. delete apostrophes so contractions stay a single word
         ("didn't" -> "didnt");
      5. replace every remaining non-letter, non-whitespace character with a
         space, so words separated only by punctuation are not fused
         ("perfect.No" -> "perfect no");
      6. collapse runs of whitespace and strip the ends.

    Parameters
    ----------
    text : object
        The raw review text. Any value that is not a ``str`` (e.g. NaN) is
        treated as an empty review.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""  # Non-string values (e.g., NaN) become an empty string

    text = text.lower()
    # Tags are stripped before URLs so a URL inside an attribute cannot leave half a tag behind
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+', ' ', text)  # Remove URLs
    text = re.sub(r"['\u2019]", '', text)  # Drop straight and curly apostrophes
    text = re.sub(r'[^a-z\s]', ' ', text)  # Punctuation and numbers become spaces
    return ' '.join(text.split())  # Collapse whitespace runs, trim the ends

# Store a cleaned copy of every review body next to the raw text
review_bodies = merged_df['review_body']
merged_df['cleaned_review'] = review_bodies.apply(clean_text)

# Print raw vs. cleaned text for the first rows as a quick sanity check
comparison_columns = ['review_body', 'cleaned_review']
print(merged_df[comparison_columns].head())


Unnamed: 0,product_id,review_count
52439,B00BGA9WK2,10318
47229,B007FTE2VW,3971
27497,B00178630A,3715
43186,B0050SYILE,3545
44216,B005CPGHAA,3399


In [21]:
# tokenization

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the tokenizer data and the stopword lists
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)

# English stopwords held in a set for fast membership checks during tokenization
stop_words = set(stopwords.words('english'))

def tokenize_text(text):
    """Split ``text`` into word tokens and drop English stopwords.

    Tokenization is done by NLTK's ``word_tokenize``; any token found in the
    module-level ``stop_words`` set is discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    return [token for token in word_tokenize(text) if token not in stop_words]

# Tokenize every cleaned review and keep the token lists in a new 'tokens' column
cleaned_reviews = merged_df['cleaned_review']
merged_df['tokens'] = cleaned_reviews.apply(tokenize_text)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sujka\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sujka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# identify adjective-noun pairs
import spacy
# Load the 'en_core_web_sm' spaCy pipeline once; extract_adj_noun_pairs uses it
# to obtain the part-of-speech tag of every token
nlp = spacy.load('en_core_web_sm')

def extract_adj_noun_pairs(text):
    """Find adjectives that are immediately followed by a noun.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``. Each pair
    of neighbouring tokens is checked, and a pair is kept when the first token
    is tagged ``ADJ`` and the second ``NOUN``.

    Returns a list of ``(adjective_text, noun_text)`` tuples in document order;
    the list is empty when no such pair exists.
    """
    doc = nlp(text)
    pairs = []
    # Stop one token early: the last token has no right-hand neighbour
    for position in range(len(doc) - 1):
        current, following = doc[position], doc[position + 1]
        if current.pos_ == 'ADJ' and following.pos_ == 'NOUN':
            pairs.append((current.text, following.text))
    return pairs

# Extract the adjective-noun pairs of every cleaned review into a new column
texts_to_parse = merged_df['cleaned_review']
merged_df['adj_noun_pairs'] = texts_to_parse.apply(extract_adj_noun_pairs)

In [26]:
# Display the first 10 rows to inspect the derived columns
# (cleaned_review, tokens, adj_noun_pairs) next to the original data
merged_df.head(10)



Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,cleaned_review,review_count,tokens,adj_noun_pairs
0,US,669612,R2MTHHQM6RSDQK,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,4,0,1,N,N,Four Stars,"I enjoyed th game, But it was easy to beat.",2015-08-31,i enjoyed th game but it was easy to beat,1932,"[enjoyed, th, game, easy, beat]",[]
1,US,44025365,R1R0CI93K1AEEP,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,1,N,Y,Five Stars,Everything was great,2015-08-25,everything was great,1932,"[everything, great]",[]
2,US,32104945,R1JVAGNQK31BXC,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,1,N,Y,Great game!,Game came perfect and quick. No major complaints.,2015-08-22,game came perfect and quick no major complaints,1932,"[game, came, perfect, quick, major, complaints]","[(major, complaints)]"
3,US,2454236,R3J7RKKVU8V73S,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,0,N,Y,Five stars,Five stars. This is the best used copy of any ...,2015-08-13,five stars this is the best used copy of any g...,1932,"[five, stars, best, used, copy, game, ever, re...",[]
4,US,21830745,RK4QTWPLSX37U,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,1,N,Y,Five Stars,great game,2015-08-10,great game,1932,"[great, game]","[(great, game)]"
5,US,27285315,R18ZO9Q9SN54FF,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,0,N,Y,Five Stars,this old grandma loves her call of duty,2015-08-10,this old grandma loves her call of duty,1932,"[old, grandma, loves, call, duty]","[(old, grandma)]"
6,US,45101931,R2L59BM4ZBKWRL,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,1,N,Y,Five Stars,great,2015-08-09,great,1932,[great],[]
7,US,43198408,RVPEMCJKO9J0Z,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,0,N,Y,Five Stars,Worked good,2015-08-06,worked good,1932,"[worked, good]",[]
8,US,49052566,R3AF7MNOTBW3YI,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,5,0,0,N,Y,Five Stars,A+,2015-07-30,a,1932,[],[]
9,US,13038564,R279SSA7XL6ECL,B00503E8S2,895635946,Call of Duty: Modern Warfare 3 - Xbox 360,Video Games,3,0,0,N,Y,"As a secret shopper, this is my review.",Haven't played all the way through the game ye...,2015-07-30,havent played all the way through the game yet...,1932,"[havent, played, way, game, yet, far, good, pr...",[]


In [27]:
# Write the enriched reviews to a CSV file; index=False keeps the row index out of the file
# NOTE(review): 'tokens' and 'adj_noun_pairs' hold Python lists, which a CSV stores
# as their string form — confirm that is acceptable for whoever reads this file back
output_csv = 'cleaned_data_amazon_reviews.csv'
merged_df.to_csv(output_csv, index=False)