In [1]:
import requests
from bs4 import BeautifulSoup

import re
import pandas as pd

import numpy as np
import lucem_illud #pip install -U git+https://github.com/UChicago-Computational-Content-Analysis/lucem_illud.git
import datetime

In [2]:
def get_reviews(appid, params={'json': 1}):
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(url=url + str(appid), params=params, headers={'User-Agent': 'Mozilla/5.0'})
    return response.json()

def get_n_reviews(appid, n=100):
    reviews = []
    cursor = '*'
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        'day_range': 9223372036854775807,
        'review_type': 'all',
        'purchase_type': 'all'
    }

    for _ in range(5):
        params['cursor'] = cursor.encode()
        params['num_per_page'] = min(100, n)
        n -= 100

        response = get_reviews(appid, params)
        cursor = response['cursor']
        reviews += [review['review'] for review in response['reviews']]

        if len(response['reviews']) < 1000:
            break

    return reviews

In [4]:
def get_game_tags(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags_container = soup.find('div', class_='glance_tags popular_tags')
    if tags_container:
        tags = tags_container.text.strip().split('\n')
        tags = [tag.strip() for tag in tags]
        return tags
    else:
        return []

def get_reviews(appid, params={'json': 1}):
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(url=url + str(appid), params=params, headers={'User-Agent': 'Mozilla/5.0'})
    return response.json()

def get_n_reviews_with_tags(appid, n=5):
    reviews = []
    cursor = '*'
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        'day_range': 9223372036854775807,
        'review_type': 'all',
        'purchase_type': 'all'
    }

    while n > 0:
        params['cursor'] = cursor.encode()
        params['num_per_page'] = min(100, n)
        n -= 100

        response = get_reviews(appid, params)
        cursor = response['cursor']
        for review in response['reviews']:
            review['tags'] = get_game_tags(f"https://store.steampowered.com/app/{appid}")
            reviews.append(review)

        if len(response['reviews']) < 100:
            break

    return reviews

# Function to get reviews with tags
def get_game_reviews_df_with_tags(app_id_to_game_name_mapping):
    reviews_data = []

    for appid, game_name in app_id_to_game_name_mapping.items():
        reviews = get_n_reviews_with_tags(appid, n=1000)  # Limit to the first 1000 reviews

        # Extract individual reviews, dates, and tags
        individual_reviews = [review['review'] for review in reviews]
        review_dates = [datetime.datetime.fromtimestamp(review['timestamp_created']).strftime('%Y-%m-%d') for review in reviews]
        review_tags = [review['tags'] for review in reviews]

        # Clean reviews
        cleaned_reviews = [re.sub(r'[\n\r\t]', ' ', review) for review in individual_reviews]

        # Split reviews into lists of words
        # tokenized_reviews = [review.split() for review in cleaned_reviews]

        # Create a DataFrame with exploded reviews, dates, and tags
        exploded_df = pd.DataFrame({
            'GameName': [game_name] * len(cleaned_reviews),
            'Reviews': cleaned_reviews,
            'Tags': review_tags,
            'Date': review_dates
        })
        # exploded_df = pd.DataFrame({
        #     'GameName': [game_name] * len(tokenized_reviews),
        #     'Reviews': tokenized_reviews,
        #     'Tags': review_tags,
        #     'Date': review_dates
        # })

        reviews_data.append(exploded_df)

    df = pd.concat(reviews_data, ignore_index=True)
    return df

# Replace this with the actual list of links
steam_links = [
    'https://store.steampowered.com/app/1086940/Baldurs_Gate_3/',
    'https://store.steampowered.com/app/730/CounterStrike_2/',
    'https://store.steampowered.com/app/1966720/Lethal_Company/',
    'https://store.steampowered.com/app/1938090/Call_of_Duty/',
    'https://store.steampowered.com/app/1599340/Lost_Ark/',
    'https://store.steampowered.com/app/1085660/Destiny_2/',
    'https://store.steampowered.com/app/2140330/Madden_NFL_24/',
    'https://store.steampowered.com/app/582010/Monster_Hunter_World/',
    'https://store.steampowered.com/app/1623730/Palworld/',
    'https://store.steampowered.com/app/236390/War_Thunder/',
    'https://store.steampowered.com/app/289070/Sid_Meiers_Civilization_VI/',
    'https://store.steampowered.com/app/227300/Euro_Truck_Simulator_2/',
    'https://store.steampowered.com/app/105600/Terraria/',
    'https://store.steampowered.com/app/251570/7_Days_to_Die/',
    'https://store.steampowered.com/app/1551360/Forza_Horizon_5/',
    'https://store.steampowered.com/app/1145360/Hades/',
    'https://store.steampowered.com/app/413150/Stardew_Valley/',
    'https://store.steampowered.com/app/1973530/Limbus_Company/',
    'https://store.steampowered.com/app/1778820/TEKKEN_8/',
    'https://store.steampowered.com/app/620/Portal_2/',
    'https://store.steampowered.com/app/374320/DARK_SOULS_III/'
]

# Extract app ids and game names using regular expressions
app_id_to_game_name_mapping = {}
for link in steam_links:
    match = re.search(r'/app/(\d+)/([^/]+)/', link)
    if match:
        app_id = int(match.group(1))
        game_name = match.group(2).replace('_', ' ')
        app_id_to_game_name_mapping[app_id] = game_name

# Create DataFrame with tags
reviews_df = get_game_reviews_df_with_tags(app_id_to_game_name_mapping)

# reviews_df.to_csv('reviews.csv', index=False)

# Display the DataFrame
print(reviews_df)

             GameName                                            Reviews  \
0      Baldurs Gate 3  You know that time when you're playing D&D and...   
1      Baldurs Gate 3  You can convince bosses to kill themselves. 10/10   
2      Baldurs Gate 3  my work is really cutting into my Baldur's Gat...   
3      Baldurs Gate 3  Possessed a guard and had them open a gate con...   
4      Baldurs Gate 3  I have only one point of criticism: This game ...   
...               ...                                                ...   
20995  DARK SOULS III  this game iight not as good as Super Auto Pets...   
20996  DARK SOULS III  Compared to the first Dark Souls game, DS3 is ...   
20997  DARK SOULS III                             Better then elden ring   
20998  DARK SOULS III         Fear the naked invader with a giant sword.   
20999  DARK SOULS III  This is some of the most fun I’ve had playing ...   

                                                    Tags        Date  
0      [RPG, Cho

In [73]:
import chardet

# Detect the encoding of the file
with open('D:/UChicago/MACS 60000/Homework-Notebooks-2024-Winter-1/reviews_cleaned.csv', 'rb') as file:
    result = chardet.detect(file.read())  # Read the entire file; may be inefficient for large files

In [88]:
reviews_df = pd.read_csv('D:/UChicago/MACS 60000/Homework-Notebooks-2024-Winter-1/reviews_cleaned.csv')

# reviews_df = pd.read_csv('D:/UChicago/MACS 60000/Homework-Notebooks-2024-Winter-1/reviews_cleaned.csv', encoding=result['encoding'])

# result

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 179841: invalid start byte

In [6]:
reviews_df

Unnamed: 0,GameName,Reviews,Tags,Date
0,Baldurs Gate 3,You know that time when you're playing D&D and...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-08-20
1,Baldurs Gate 3,You can convince bosses to kill themselves. 10/10,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-12-18
2,Baldurs Gate 3,my work is really cutting into my Baldur's Gat...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-09-12
3,Baldurs Gate 3,Possessed a guard and had them open a gate con...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-08-11
4,Baldurs Gate 3,I have only one point of criticism: This game ...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-09-22
...,...,...,...,...
20995,DARK SOULS III,this game iight not as good as Super Auto Pets...,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2023-02-07
20996,DARK SOULS III,"Compared to the first Dark Souls game, DS3 is ...","[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2017-05-23
20997,DARK SOULS III,Better then elden ring,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2023-01-18
20998,DARK SOULS III,Fear the naked invader with a giant sword.,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2018-11-27


In [7]:
def clean_review(text):
    # Remove patterns like [/h1], [/h2], ..., [/hX] where X is any alphanumeric sequence
    text = re.sub(r'\[/?[a-zA-Z0-9]+\]', '', text)


    text = re.sub(r'[^\w\s]', '', text)
    return text

reviews_df['cleaned_reviews'] = reviews_df['Reviews'].apply(clean_review)

In [8]:
reviews_df['tokenized_reviews'] = reviews_df['cleaned_reviews'].apply(lambda x: [lucem_illud.word_tokenize(s) for s in lucem_illud.sent_tokenize(x)])

reviews_df['normalized_reviews'] = reviews_df['tokenized_reviews'].apply(lambda x: [lucem_illud.normalizeTokens(s, lemma=False) for s in x])



In [9]:
reviews_df['Reviews'][64]   

'Only 8 hours in and I\'m loving it.  BG3 fans, beware though, if you\'re expecting a remastered BG1/2 you will be sorely disappointed. This is Larian\'s version of Baldur\'s Gate and it is therefore very similar to Divinity.   If you like DnD5e you will LOVE this game. If you like any Divinity game you will LOVE this game. If you like BG1/2 you MIGHT love this game.  Edit after 490 hours more:  As much as the game is not entirely like the original Baldur\'s Gates, the amount of easter eggs and similarities (most of which because they\'re all part of the same canon continuity of the Forgotten Realm) make this game thoroughly enjoyable for OG fans of D&D, FR series and Baldur\'s Gate alike. So I would change the "MIGHT" for a "MOST LIKELY".   The replayability is top tier. There is so much content to be explored that on my most recent run I still found at least 3-4 entirely new cutscenes and interactions.   The playability is also top tier. While it might be hard to get into for people 

In [35]:
reviews_df['Reviews'][172]

'[h1][b][i]The game where happiness dies.[/i] 🙂[/b][/h1]'

In [36]:
reviews_df['normalized_reviews'][172]

[['game', 'happiness', 'dies']]

In [10]:
reviews_df["normalized_reviews"][64]

[['hours',
  'm',
  'loving',
  'bg3',
  'fans',
  'beware',
  'expecting',
  'remastered',
  'bg12',
  'sorely',
  'disappointed',
  'larians',
  'version',
  'baldurs',
  'gate',
  'similar',
  'divinity',
  'like',
  'dnd5e',
  'love',
  'game',
  'like',
  'divinity',
  'game',
  'love',
  'game',
  'like',
  'bg12',
  'love',
  'game',
  'edit',
  'hours',
  'game',
  'entirely',
  'like',
  'original',
  'baldurs',
  'gates',
  'easter',
  'eggs',
  'similarities',
  'canon',
  'continuity',
  'forgotten',
  'realm',
  'game',
  'thoroughly',
  'enjoyable',
  'og',
  'fans',
  'dd',
  'fr',
  'series',
  'baldurs',
  'gate'],
 ['alike'],
 ['change', 'likely'],
 ['replayability',
  'tier',
  'content',
  'explored',
  'recent',
  'run',
  'found',
  'entirely',
  'new',
  'cutscenes',
  'interactions',
  'playability',
  'tier',
  'hard',
  'people',
  'dd',
  'turnbased',
  'rpgs',
  'combat',
  'simple',
  'long',
  'willing',
  'read',
  'spells',
  'simply',
  'click',
  'stuf

In [11]:
reviews_df

Unnamed: 0,GameName,Reviews,Tags,Date,cleaned_reviews,tokenized_reviews,normalized_reviews
0,Baldurs Gate 3,You know that time when you're playing D&D and...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-08-20,You know that time when youre playing DD and y...,"[[You, know, that, time, when, you, re, playin...","[[know, time, playing, dd, think, man, cool, g..."
1,Baldurs Gate 3,You can convince bosses to kill themselves. 10/10,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-12-18,You can convince bosses to kill themselves 1010,"[[You, can, convince, bosses, to, kill, themse...","[[convince, bosses, kill]]"
2,Baldurs Gate 3,my work is really cutting into my Baldur's Gat...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-09-12,my work is really cutting into my Baldurs Gate...,"[[my, work, is, really, cutting, into, my, Bal...","[[work, cutting, baldurs, gate, time, quit]]"
3,Baldurs Gate 3,Possessed a guard and had them open a gate con...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-08-11,Possessed a guard and had them open a gate con...,"[[Possessed, a, guard, and, had, them, open, a...","[[possessed, guard, open, gate, containing, gi..."
4,Baldurs Gate 3,I have only one point of criticism: This game ...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-09-22,I have only one point of criticism This game l...,"[[I, have, only, one, point, of, criticism, Th...","[[point, criticism, game, lacks, proper, endin..."
...,...,...,...,...,...,...,...
20995,DARK SOULS III,this game iight not as good as Super Auto Pets...,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2023-02-07,this game iight not as good as Super Auto Pets...,"[[this, game, iight, not, as, good, as, Super,...","[[game, iight, good, super, auto, pets, vampir..."
20996,DARK SOULS III,"Compared to the first Dark Souls game, DS3 is ...","[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2017-05-23,Compared to the first Dark Souls game DS3 is y...,"[[Compared, to, the, first, Dark, Souls, game,...","[[compared, dark, souls, game, ds3, young, goo..."
20997,DARK SOULS III,Better then elden ring,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2023-01-18,Better then elden ring,"[[Better, then, elden, ring]]","[[better, elden, ring]]"
20998,DARK SOULS III,Fear the naked invader with a giant sword.,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2018-11-27,Fear the naked invader with a giant sword,"[[Fear, the, naked, invader, with, a, giant, s...","[[fear, naked, invader, giant, sword]]"


In [91]:
# make tags into lists

# reviews_df['Tags'] = reviews_df['Tags'].apply(lambda x: x.strip('][').split(', ') if x else [])

# aa=reviews_df['Tags'][0]
# aa=[x.strip("'") for x in aa]
# aa=pd.read_csv('reviews_cleaned.csv',converters={'Tags':eval}, encoding=result['encoding'])
# aa['Tags'][0]

reviews_df = pd.read_csv('reviews_cleaned.csv',converters={'Tags':eval}, encoding=result['encoding'])

In [12]:
# Define the list of genres
valid_genres = ['RPG', 'Adventure', 'FPS', 'Action', 'Horror', 'Sports', 'Strategy', 'MMO', 'Open World',
                'Puzzle', 'Rogue-like', 'Hack and Slash', 'Arcade', 'Fighter', 'Visual Novel', 'Sandbox',
                'Dating', 'Simulation', 'Racing', 'Survival', 'Platformer']

# Function to filter the tags based on valid genres
def filter_genres(tags):
    return [tag for tag in tags if tag in valid_genres]

# Apply the function to create the 'genre' column
reviews_df['genre'] = reviews_df['Tags'].apply(filter_genres)

In [13]:
reviews_df['genre']

0                  [RPG, Adventure, Strategy]
1                  [RPG, Adventure, Strategy]
2                  [RPG, Adventure, Strategy]
3                  [RPG, Adventure, Strategy]
4                  [RPG, Adventure, Strategy]
                         ...                 
20995    [RPG, Adventure, Action, Open World]
20996    [RPG, Adventure, Action, Open World]
20997    [RPG, Adventure, Action, Open World]
20998    [RPG, Adventure, Action, Open World]
20999    [RPG, Adventure, Action, Open World]
Name: genre, Length: 21000, dtype: object

In [14]:
reviews_df

Unnamed: 0,GameName,Reviews,Tags,Date,cleaned_reviews,tokenized_reviews,normalized_reviews,genre
0,Baldurs Gate 3,You know that time when you're playing D&D and...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-08-20,You know that time when youre playing DD and y...,"[[You, know, that, time, when, you, re, playin...","[[know, time, playing, dd, think, man, cool, g...","[RPG, Adventure, Strategy]"
1,Baldurs Gate 3,You can convince bosses to kill themselves. 10/10,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-12-18,You can convince bosses to kill themselves 1010,"[[You, can, convince, bosses, to, kill, themse...","[[convince, bosses, kill]]","[RPG, Adventure, Strategy]"
2,Baldurs Gate 3,my work is really cutting into my Baldur's Gat...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-09-12,my work is really cutting into my Baldurs Gate...,"[[my, work, is, really, cutting, into, my, Bal...","[[work, cutting, baldurs, gate, time, quit]]","[RPG, Adventure, Strategy]"
3,Baldurs Gate 3,Possessed a guard and had them open a gate con...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-08-11,Possessed a guard and had them open a gate con...,"[[Possessed, a, guard, and, had, them, open, a...","[[possessed, guard, open, gate, containing, gi...","[RPG, Adventure, Strategy]"
4,Baldurs Gate 3,I have only one point of criticism: This game ...,"[RPG, Choices Matter, Story Rich, Character Cu...",2023-09-22,I have only one point of criticism This game l...,"[[I, have, only, one, point, of, criticism, Th...","[[point, criticism, game, lacks, proper, endin...","[RPG, Adventure, Strategy]"
...,...,...,...,...,...,...,...,...
20995,DARK SOULS III,this game iight not as good as Super Auto Pets...,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2023-02-07,this game iight not as good as Super Auto Pets...,"[[this, game, iight, not, as, good, as, Super,...","[[game, iight, good, super, auto, pets, vampir...","[RPG, Adventure, Action, Open World]"
20996,DARK SOULS III,"Compared to the first Dark Souls game, DS3 is ...","[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2017-05-23,Compared to the first Dark Souls game DS3 is y...,"[[Compared, to, the, first, Dark, Souls, game,...","[[compared, dark, souls, game, ds3, young, goo...","[RPG, Adventure, Action, Open World]"
20997,DARK SOULS III,Better then elden ring,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2023-01-18,Better then elden ring,"[[Better, then, elden, ring]]","[[better, elden, ring]]","[RPG, Adventure, Action, Open World]"
20998,DARK SOULS III,Fear the naked invader with a giant sword.,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",2018-11-27,Fear the naked invader with a giant sword,"[[Fear, the, naked, invader, with, a, giant, s...","[[fear, naked, invader, giant, sword]]","[RPG, Adventure, Action, Open World]"


In [15]:
print(reviews_df["Tags"][0])

tags_example = reviews_df["Tags"][0]

filtered_genres = filter_genres(tags_example)
print(filtered_genres)

['RPG', 'Choices Matter', 'Story Rich', 'Character Customization', 'Turn-Based Combat', 'Dungeons & Dragons', 'Adventure', 'CRPG', 'Fantasy', 'Online Co-Op', 'Multiplayer', 'Romance', 'Strategy', 'Singleplayer', 'Co-op Campaign', 'Class-Based', 'Sexual Content', 'Dark Fantasy', 'Combat', 'Controller\t\t\t\t\t\t\t\t\t\t\t\t+']
['RPG', 'Adventure', 'Strategy']


In [95]:
# drop all the unnamed columns

reviews_df = reviews_df.loc[:, ~reviews_df.columns.str.contains('^Unnamed')]

reviews_df

Unnamed: 0,GameName,Reviews,Tags,Date,cleaned_reviews,tokenized_reviews,normalized_reviews,genre
0,Baldurs Gate 3,You know that time when you're playing D&D and...,"[RPG, Choices Matter, Story Rich, Character Cu...",1692577623,You know that time when youre playing DD and y...,"[['You', 'know', 'that', 'time', 'when', 'you'...","[['know', 'time', 'playing', 'dd', 'think', 'm...","[RPG, Adventure, Strategy]"
1,Baldurs Gate 3,You can convince bosses to kill themselves. 10/10,"[RPG, Choices Matter, Story Rich, Character Cu...",1702909919,You can convince bosses to kill themselves 1010,"[['You', 'can', 'convince', 'bosses', 'to', 'k...","[['convince', 'bosses', 'kill']]","[RPG, Adventure, Strategy]"
2,Baldurs Gate 3,my work is really cutting into my Baldur's Gat...,"[RPG, Choices Matter, Story Rich, Character Cu...",1694565640,my work is really cutting into my Baldurs Gate...,"[['my', 'work', 'is', 'really', 'cutting', 'in...","[['work', 'cutting', 'baldurs', 'gate', 'time'...","[RPG, Adventure, Strategy]"
3,Baldurs Gate 3,Possessed a guard and had them open a gate con...,"[RPG, Choices Matter, Story Rich, Character Cu...",1691783669,Possessed a guard and had them open a gate con...,"[['Possessed', 'a', 'guard', 'and', 'had', 'th...","[['possessed', 'guard', 'open', 'gate', 'conta...","[RPG, Adventure, Strategy]"
4,Baldurs Gate 3,I have only one point of criticism: This game ...,"[RPG, Choices Matter, Story Rich, Character Cu...",1695386806,I have only one point of criticism This game l...,"[['I', 'have', 'only', 'one', 'point', 'of', '...","[['point', 'criticism', 'game', 'lacks', 'prop...","[RPG, Adventure, Strategy]"
...,...,...,...,...,...,...,...,...
2095,DARK SOULS III,"This game is: 10% luck, 20% skill, 15% concent...","[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",1568655209,This game is 10 luck 20 skill 15 concentrated ...,"[['This', 'game', 'is', '10', 'luck', '20', 's...","[['game', 'luck', 'skill', 'concentrated', 'po...","[RPG, Adventure, Action, Open World]"
2096,DARK SOULS III,fire keeper rule 34 go hard,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",1687998360,fire keeper rule 34 go hard,"[['fire', 'keeper', 'rule', '34', 'go', 'hard']]","[['fire', 'keeper', 'rule', 'hard']]","[RPG, Adventure, Action, Open World]"
2097,DARK SOULS III,I have lost at least 6 SL200 characters to sys...,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",1637940250,I have lost at least 6 SL200 characters to sys...,"[['I', 'have', 'lost', 'at', 'least', '6', 'SL...","[['lost', 'sl200', 'characters', 'system', 'fa...","[RPG, Adventure, Action, Open World]"
2098,DARK SOULS III,whoever gave me 73 ember while invading me as ...,"[Souls-like, Dark Fantasy, Difficult, RPG, Atm...",1603167767,whoever gave me 73 ember while invading me as ...,"[['whoever', 'gave', 'me', '73', 'ember', 'whi...","[['gave', 'ember', 'invading', 'dark', 'spirit...","[RPG, Adventure, Action, Open World]"


In [28]:
reviews_df.to_csv('reviews_extended.csv', index=False)

In [None]:
# Define a function to convert the string to a list based on brackets
def string_to_list(string):
    result = []
    temp = ''

    for char in string:
        if char == '[':
            if temp:
                result.append(temp)
                temp = ''
        elif char == ']':
            if temp:
                result.append(temp)
                temp = ''
        elif char == ',':
            if temp:
                result.append(temp)
                temp = ''
        else:
            temp += char

    # Append the remaining characters if any
    if temp:
        result.append(temp)

    return result

# Apply the function to each row of the 'normalized_reviews' column
reviews_df['new_normalized_reviews'] = reviews_df['normalized_reviews'].apply(string_to_list)

# Remove apostrophes from the new_normalized_reviews column
reviews_df['new_normalized_reviews'] = reviews_df['new_normalized_reviews'].apply(lambda x: [word.replace("'", "") for word in x])

In [None]:
# Extract words from normalized_reviews
all_words = []

# Iterate through each row of the 'normalized_reviews' column
for row in reviews_df['normalized_reviews']:
    # Remove brackets and split the string into words
    words = row.replace('[', '').replace(']', '').replace("'", "").split(', ')
    # Extend the list of all words with the words from the current row
    all_words.extend(words)

# Create a new DataFrame with each word in a separate row
exploded_df = pd.DataFrame({'Word': all_words})

In [9]:
import pandas as pd

# Reload the CSV file and re-apply the conversion logic
file_path = 'reviews_cleaned.csv'
reviews_df = pd.read_csv(file_path)

# Convert each entry in the "Date" column into a human-readable date, skipping non-numeric entries
reviews_df['Human_Readable_Date'] = pd.to_numeric(reviews_df['Date'], errors='coerce').apply(
    lambda x: pd.to_datetime(x, unit='s', errors='coerce').strftime('%Y-%m-%d') if pd.notnull(x) else None)

# Display the first few rows to confirm the conversion
# reviews_df[['GameName', 'Date', 'Human_Readable_Date_Safe']].head()
reviews_df.drop(columns=['Date'], inplace=True)

reviews_df.to_csv('reviews&date_cleaned.csv', index=False)

In [8]:
file_path = 'reviews&date_cleaned.csv'

reviews_df = pd.read_csv(file_path)

reviews_df

Unnamed: 0,GameName,Reviews,Tags,cleaned_reviews,tokenized_reviews,normalized_reviews,genre,Human_Readable_Date
0,Baldurs Gate 3,You know that time when you're playing D&D and...,"['RPG', 'Choices Matter', 'Story Rich', 'Chara...",You know that time when youre playing DD and y...,"[['You', 'know', 'that', 'time', 'when', 'you'...","[['know', 'time', 'playing', 'dd', 'think', 'm...","['RPG', 'Adventure', 'Strategy']",2023-08-21
1,Baldurs Gate 3,You can convince bosses to kill themselves. 10/10,"['RPG', 'Choices Matter', 'Story Rich', 'Chara...",You can convince bosses to kill themselves 1010,"[['You', 'can', 'convince', 'bosses', 'to', 'k...","[['convince', 'bosses', 'kill']]","['RPG', 'Adventure', 'Strategy']",2023-12-18
2,Baldurs Gate 3,my work is really cutting into my Baldur's Gat...,"['RPG', 'Choices Matter', 'Story Rich', 'Chara...",my work is really cutting into my Baldurs Gate...,"[['my', 'work', 'is', 'really', 'cutting', 'in...","[['work', 'cutting', 'baldurs', 'gate', 'time'...","['RPG', 'Adventure', 'Strategy']",2023-09-13
3,Baldurs Gate 3,Possessed a guard and had them open a gate con...,"['RPG', 'Choices Matter', 'Story Rich', 'Chara...",Possessed a guard and had them open a gate con...,"[['Possessed', 'a', 'guard', 'and', 'had', 'th...","[['possessed', 'guard', 'open', 'gate', 'conta...","['RPG', 'Adventure', 'Strategy']",2023-08-11
4,Baldurs Gate 3,I have only one point of criticism: This game ...,"['RPG', 'Choices Matter', 'Story Rich', 'Chara...",I have only one point of criticism This game l...,"[['I', 'have', 'only', 'one', 'point', 'of', '...","[['point', 'criticism', 'game', 'lacks', 'prop...","['RPG', 'Adventure', 'Strategy']",2023-09-22
...,...,...,...,...,...,...,...,...
2095,DARK SOULS III,"This game is: 10% luck, 20% skill, 15% concent...","['Souls-like', 'Dark Fantasy', 'Difficult', 'R...",This game is 10 luck 20 skill 15 concentrated ...,"[['This', 'game', 'is', '10', 'luck', '20', 's...","[['game', 'luck', 'skill', 'concentrated', 'po...","['RPG', 'Adventure', 'Action', 'Open World']",2019-09-16
2096,DARK SOULS III,fire keeper rule 34 go hard,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R...",fire keeper rule 34 go hard,"[['fire', 'keeper', 'rule', '34', 'go', 'hard']]","[['fire', 'keeper', 'rule', 'hard']]","['RPG', 'Adventure', 'Action', 'Open World']",2023-06-29
2097,DARK SOULS III,I have lost at least 6 SL200 characters to sys...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R...",I have lost at least 6 SL200 characters to sys...,"[['I', 'have', 'lost', 'at', 'least', '6', 'SL...","[['lost', 'sl200', 'characters', 'system', 'fa...","['RPG', 'Adventure', 'Action', 'Open World']",2021-11-26
2098,DARK SOULS III,whoever gave me 73 ember while invading me as ...,"['Souls-like', 'Dark Fantasy', 'Difficult', 'R...",whoever gave me 73 ember while invading me as ...,"[['whoever', 'gave', 'me', '73', 'ember', 'whi...","[['gave', 'ember', 'invading', 'dark', 'spirit...","['RPG', 'Adventure', 'Action', 'Open World']",2020-10-20


In [None]:
import pandas as pd
import ast

# Path to the CSV file in your Google Drive
file_path = '/content/drive/My Drive/Computational Content Analysis Project/reviews&date_cleaned.csv'

# Define a function to safely convert strings to lists
def convert_to_list(string):
    try:
        return ast.literal_eval(string)
    except Exception as e:
        print(f"Error occurred while converting {string} to list: {e}")
        return []

# Read the CSV file into a DataFrame with the converter function
reviews_df = pd.read_csv(file_path, converters={'normalized_reviews': convert_to_list, 'genre': convert_to_list})

# Extract the first list from each row in 'normalized_reviews' column
organized_reviews = []
for row in reviews_df['normalized_reviews']:
    # Check if the row contains a non-empty list
    if isinstance(row, list) and row:
        organized_reviews.append(row[0])
    else:
        # If the row doesn't contain a non-empty list, append None
        organized_reviews.append(None)

# Add 'organized_reviews' column to the DataFrame
reviews_df['organized_reviews'] = organized_reviews

# Extract the first list from each row in 'genre' column
organized_genre = []
for row in reviews_df['genre']:
    # Check if the row contains a non-empty list
    if isinstance(row, list) and row:
        organized_genre.append(row[0])
    else:
        # If the row doesn't contain a non-empty list, append None
        organized_genre.append(None)

# Add 'organized_genre' column to the DataFrame
reviews_df['organized_genre'] = organized_genre

import gensim
import string

# Ensure each item in 'normalized_tokens' is a list of words
normalized_sentences = reviews_df['organized_reviews']

# Filter out None values from normalized_sentences and remove symbols
filtered_sentences = []
for sentence in normalized_sentences:
    if sentence is not None:
        filtered_sentence = [word for word in sentence if word not in string.punctuation and '?' not in word]
        filtered_sentences.append(filtered_sentence)

# Train Word2Vec model using your dataset
reviewsW2V = gensim.models.word2vec.Word2Vec(filtered_sentences, sg=0)

import gensim
import string

# Ensure each item in 'normalized_tokens' is a list of words
normalized_sentences = reviews_df['organized_reviews']

# Filter out None values from normalized_sentences and remove symbols
filtered_sentences = []
for sentence in normalized_sentences:
    if sentence is not None:
        filtered_sentence = [word for word in sentence if word not in string.punctuation and '?' not in word]
        filtered_sentences.append(filtered_sentence)

# Create a dictionary to store Word2Vec models for each GameName
word2vec_models = {}

# Iterate over unique GameName values
unique_game_names = reviews_df['GameName'].unique()
for game_name in unique_game_names:
    # Filter sentences for the current GameName
    game_name_sentences = [sentence for sentence, name in zip(filtered_sentences, reviews_df['GameName']) if name == game_name]

    # Train Word2Vec model for the current GameName
    word2vec_model = gensim.models.word2vec.Word2Vec(game_name_sentences, sg=0)

    # Store the Word2Vec model for the current GameName in the dictionary
    word2vec_models[game_name] = word2vec_model

import numpy as np
import sklearn.decomposition
import sklearn.manifold

# Filter out words with apostrophes
numWords = 40
targetWords = [word for word in reviewsW2V.wv.index_to_key[:numWords] if "'" not in word]

wordsSubMatrix = []
for word in targetWords:
    wordsSubMatrix.append(reviewsW2V.wv[word])
wordsSubMatrix = np.array(wordsSubMatrix)

pcaWords = sklearn.decomposition.PCA(n_components=40).fit(wordsSubMatrix)
reducedPCA_data = pcaWords.transform(wordsSubMatrix)

# T-SNE is theoretically better, but you should experiment
tsneWords = sklearn.manifold.TSNE(n_components=2).fit_transform(reducedPCA_data)

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(5, 4))
ax = fig.add_subplot(111)
ax.set_frame_on(False)
plt.scatter(tsneWords[:, 0], tsneWords[:, 1], alpha=0)  # Making the points invisible
for i, word in enumerate(targetWords):
    ax.annotate(word, (tsneWords[:, 0][i], tsneWords[:, 1][i]), size=20 * (numWords - i) / numWords)
plt.xticks(())
plt.yticks(())
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize_word_embeddings(word2vec_model):
    # Extract vocabulary and vectors from the Word2Vec model
    vocabulary = word2vec_model.wv.index_to_key
    word_vectors = word2vec_model.wv.vectors

    # Perform dimensionality reduction using t-SNE
    tsne = TSNE(n_components=2, perplexity=5, random_state=42)  # Adjust perplexity here
    embeddings = tsne.fit_transform(word_vectors)

    # Visualize embeddings
    plt.figure(figsize=(10, 8))
    for word, (x, y) in zip(vocabulary, embeddings):
        plt.scatter(x, y)
        plt.annotate(word, (x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()

# Visualize embeddings for each Word2Vec model
for game_name, word2vec_model in word2vec_models.items():
    print(f"Visualizing embeddings for {game_name}...")
    visualize_word_embeddings(word2vec_model)
    