# Loading the data 

In [1]:
!pip install -U hopsworks --quiet

In [2]:
!pip install kaggle --quiet

In [3]:
import numpy as np
import pandas as pd

In [7]:
# Path to the TXT file
file_path = "/Users/lassehylleberg/Desktop/Exam/ratebeer/ratebeer.txt"

# Define dictionary to store data, initializing all lists
columns = ['beer/name', 'beer/beerId', 'beer/brewerId', 'beer/ABV', 'beer/style',
           'review/appearance', 'review/aroma', 'review/palate', 'review/taste',
           'review/overall', 'review/time', 'review/profileName', 'review/text']
data = {col: [] for col in columns}

# Read the file line by line and extract data
with open(file_path, "r", encoding="ISO-8859-1") as file:
    current_review = {}
    for line in file:
        line = line.strip()
        if line:
            try:
                key, value = line.split(": ", 1)
                current_review[key] = value
            except ValueError:
                # Skip lines that don't contain the separator
                continue
        else:
            if current_review:
                # Append the data for each key or NaN if the key wasn't found
                for col in columns:
                    data[col].append(current_review.get(col, np.nan))
                current_review = {}

# After the loop, check if there's an incomplete entry remaining to be added
if current_review:
    for col in columns:
        data[col].append(current_review.get(col, np.nan))

# Create DataFrame from the data
df = pd.DataFrame(data)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924163 entries, 0 to 2924162
Data columns (total 13 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   beer/name           object
 1   beer/beerId         object
 2   beer/brewerId       object
 3   beer/ABV            object
 4   beer/style          object
 5   review/appearance   object
 6   review/aroma        object
 7   review/palate       object
 8   review/taste        object
 9   review/overall      object
 10  review/time         object
 11  review/profileName  object
 12  review/text         object
dtypes: object(13)
memory usage: 290.0+ MB


In [11]:
# Convert numeric columns to float
numeric_cols = ['beer/ABV', 'review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Convert ID columns to integers, coerce errors which will convert non-convertible values to NaN
id_cols = ['beer/beerId', 'beer/brewerId']
df[id_cols] = df[id_cols].apply(pd.to_numeric, errors='coerce', downcast='integer')

# Convert review time to datetime
df['review/time'] = pd.to_datetime(df['review/time'], errors='coerce')

# Check the changes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924163 entries, 0 to 2924162
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   beer/name           object        
 1   beer/beerId         float64       
 2   beer/brewerId       int16         
 3   beer/ABV            float64       
 4   beer/style          object        
 5   review/appearance   float64       
 6   review/aroma        float64       
 7   review/palate       float64       
 8   review/taste        float64       
 9   review/overall      float64       
 10  review/time         datetime64[ns]
 11  review/profileName  object        
 12  review/text         object        
dtypes: datetime64[ns](1), float64(7), int16(1), object(4)
memory usage: 273.3+ MB
