In [None]:
##First, I'll import the libraries I will need
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt, log

In [None]:
##Second, I'll define the RMSLE function
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Every prediction below 1e-6 (which includes all negative predictions) is
    raised to 1e-6 before the logarithm is taken; ``y_true`` is used as given.
    Both sides are compared on the ``log(1 + value)`` scale.
    """
    floored_pred = np.clip(y_pred, 1e-6, None)
    log_gap = np.log1p(y_true) - np.log1p(floored_pred)
    return np.sqrt(np.mean(np.square(log_gap)))

In [None]:
##Third, I'll load in my dataset
# Load 'steam.csv' into a DataFrame; the relative path is resolved against the
# current working directory.
data = pd.read_csv('steam.csv')

In [None]:
##Fourth, I'll select the relevant columns for my model
# Name of the column the model should predict.
target = 'price'
# Columns the model is allowed to see.
features = ['avg_players', 'tags', 'genre', 'win', 'mac', 'linux']

# Feature matrix and target vector, taken straight from the loaded frame.
X, y = data[features], data[target]

In [None]:
##Fifth, I'll process the categorical data
numeric_features = ['avg_players', 'win', 'mac', 'linux']
categorical_features = ['tags', 'genre']

# Numeric columns are forwarded untouched. Each text column is one-hot encoded,
# and handle_unknown='ignore' encodes a category not seen at fit time as all
# zeros instead of raising.
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Learn the categories and build the model matrix in one step.
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
##Sixth, I'll split my dataset into training and testing sets
# Hold out 20% of the rows for testing; random_state=42 makes the split reproducible.
# NOTE(review): X_preprocessed comes from fitting the preprocessor on every row,
# so the encoder has already seen the test rows' categories before this split.
# Consider splitting first and fitting the preprocessor on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [None]:
##Seventh, I'll create and train the linear regression model
# Create a linear regression model with default settings and fit it on the
# training split only; the test split is not touched here.
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

In [None]:
##Eighth, I'll make predictions for the evaluations
# Score both splits with the fitted model so that train and test errors can be
# compared side by side.
y_pred_train, y_pred_test = (
    linear_regression.predict(split) for split in (X_train, X_test)
)

In [None]:
##Ninth, I'll create the evaluation metrics

def _regression_metrics(y_true, y_pred):
    """Return ``(MAE, MSE, RMSE, RMSLE, R squared)`` for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        sqrt(mse),  # RMSE is the square root of the MSE computed above
        rmsle(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Train set evaluation metrics
mae_train, mse_train, rmse_train, rmsle_train, r2_train = _regression_metrics(y_train, y_pred_train)

# Test set evaluation metrics
mae_test, mse_test, rmse_test, rmsle_test, r2_test = _regression_metrics(y_test, y_pred_test)

In [None]:
##Tenth, I'll print the evaluation metrics for the train and test sets
def _print_metrics(heading, mae, mse, rmse, rmsle_value, r2):
    """Print ``heading`` followed by the five metrics, each with two decimals."""
    print(heading)
    rows = (
        ("Mean Absolute Error (MAE)", mae),
        ("Mean Squared Error (MSE)", mse),
        ("Root Mean Squared Error (RMSE)", rmse),
        ("Root Mean Squared Logarithmic Error (RMSLE)", rmsle_value),
        ("R squared", r2),
    )
    for label, value in rows:
        print(f"{label}: {value:.2f}")


_print_metrics("Train set evaluation metrics:",
               mae_train, mse_train, rmse_train, rmsle_train, r2_train)

# The leading newline leaves a blank line between the two reports.
_print_metrics("\nTest set evaluation metrics:",
               mae_test, mse_test, rmse_test, rmsle_test, r2_test)

Train set evaluation metrics:
Mean Absolute Error (MAE): 1.36
Mean Squared Error (MSE): 3.18
Root Mean Squared Error (RMSE): 1.78
Root Mean Squared Logarithmic Error (RMSLE): 0.24
R squared: 0.99

Test set evaluation metrics:
Mean Absolute Error (MAE): 1.38
Mean Squared Error (MSE): 3.31
Root Mean Squared Error (RMSE): 1.82
Root Mean Squared Logarithmic Error (RMSLE): 0.25
R squared: 0.99


1. **Import libraries:** This chunk imports the necessary Python libraries for data manipulation, preprocessing, model creation, and evaluation.

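2. **Define RMSLE function:** This chunk defines a custom function to calculate the Root Mean Squared Logarithmic Error (RMSLE), an evaluation metric that measures the difference between actual and predicted values on a logarithmic scale in a regression problem. The function first raises any prediction below 1e-6 (which includes all negative predictions) to 1e-6, so the logarithm is always defined, and then computes $\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(\log(1 + y_i) - \log(1 + \hat{y}_i)\bigr)^2}$, where $y_i$ are the actual values and $\hat{y}_i$ are the clipped predictions.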

3. **Load dataset:** This chunk loads the 'steam.csv' dataset into a pandas DataFrame called data. Pandas is a library used for data manipulation and analysis.

4. **Select relevant columns:** This chunk selects the relevant feature columns ('avg_players', 'tags', 'genre', 'win', 'mac', 'linux') and the target column ('price') for the regression problem. It then creates a feature matrix X and target vector y.

5. **Process categorical data:** This chunk preprocesses the categorical features 'tags' and 'genre' using one-hot encoding. One-hot encoding is a technique that converts categorical variables into a binary vector representation, making them suitable for use in machine learning algorithms. The ColumnTransformer is used to apply one-hot encoding to the categorical columns while keeping the numeric columns unchanged.

6. **Split dataset:** This chunk splits the preprocessed dataset into training and testing sets using the train_test_split function. The training set is used to train the model, while the testing set is used to evaluate its performance.

7. **Create and train the linear regression model:** This chunk creates a Linear Regression model and trains it using the training set.

8. **Make predictions:** This chunk uses the trained model to make predictions on both the training and testing sets. These predictions are used to compute the evaluation metrics.

9. **Create evaluation metrics:** This chunk calculates various evaluation metrics for both the training and testing sets using the actual and predicted values. The metrics include Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Root Mean Squared Logarithmic Error (RMSLE), and R-squared.

10. **Print evaluation metrics:** This chunk prints the calculated evaluation metrics for both the training and testing sets, allowing you to assess the performance of the linear regression model.

In [None]:
## First, import necessary libraries
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

## Second, define a custom MultiLabelBinarizerWrapper class
class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    """Adapter that lets ``MultiLabelBinarizer`` be used as a column transformer.

    ``MultiLabelBinarizer.fit`` and ``transform`` take a single sequence of
    label collections. This wrapper accepts the ``(X, y=None)`` signature,
    flattens the cells of ``X`` into that sequence and forwards it.

    Parameters
    ----------
    classes : array-like or None, default=None
        Forwarded to ``MultiLabelBinarizer(classes=...)``.
    sparse_output : bool, default=False
        Forwarded to ``MultiLabelBinarizer(sparse_output=...)``.

    Attributes
    ----------
    encoder_ : MultiLabelBinarizer
        The fitted binarizer. It is created by ``fit`` and is also reachable
        through the read-only ``encoder`` alias.
    """

    def __init__(self, classes=None, sparse_output=False):
        # Only record the parameters here. The inner binarizer is built in
        # fit() so that values changed later through set_params() are honoured
        # instead of being frozen into an encoder created at construction time.
        self.classes = classes
        self.sparse_output = sparse_output

    @property
    def encoder(self):
        """Fitted binarizer, kept under its original attribute name."""
        return self.encoder_

    @staticmethod
    def _label_collections(X):
        """Flatten the cells of ``X`` into one list of label collections.

        ``X`` must expose ``.values`` (a DataFrame or Series). A missing cell
        (``None`` or a float NaN) is replaced by an empty tuple, i.e. a row
        without labels, because the binarizer cannot iterate over it.
        """
        return [
            () if cell is None or (isinstance(cell, float) and cell != cell) else cell
            for cell in X.values.ravel()
        ]

    def fit(self, X, y=None):
        """Build the binarizer from the current parameters and fit it on ``X``.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        self.encoder_ = MultiLabelBinarizer(
            classes=self.classes, sparse_output=self.sparse_output
        )
        self.encoder_.fit(self._label_collections(X))
        return self

    def transform(self, X, y=None):
        """Return the binary indicator matrix for the label collections in ``X``.

        Raises ``NotFittedError`` when called before ``fit``.
        """
        # Local import: scikit-learn is already a dependency of this file, and
        # this keeps the block independent of the top-of-file import list.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "encoder_")
        return self.encoder_.transform(self._label_collections(X))

## Third, load the data from CSV and turn the label columns into lists
data = pd.read_csv('steam.csv')
# 'tags' and 'genre' hold several labels per row in one string, separated by
# ', '; split them so each cell becomes a list of individual labels.
data = data.assign(
    tags=data['tags'].str.split(', '),
    genre=data['genre'].str.split(', '),
)

## Fourth, extract features and target variable
target = 'price'
features = ['avg_players', 'tags', 'genre', 'win', 'mac', 'linux']
X, y = data[features], data[target]

## Fifth, process the categorical data using pipelines
# Each multi-label column gets its own binarizer, so tags and genres are
# encoded into separate groups of indicator columns.
tags_pipeline = Pipeline(steps=[('encoder', MultiLabelBinarizerWrapper())])
genre_pipeline = Pipeline(steps=[('encoder', MultiLabelBinarizerWrapper())])

# remainder='passthrough' forwards every column not listed here unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('tags', tags_pipeline, ['tags']),
        ('genre', genre_pipeline, ['genre']),
    ],
    remainder='passthrough',
)

# NOTE(review): the preprocessor is fitted on all rows before the split below,
# so the label vocabulary also covers rows that end up in the test set.
X_preprocessed = preprocessor.fit_transform(X)

## Sixth, split the data into training and test sets
# 80/20 split; the fixed random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, random_state=42, test_size=0.2
)

## Seventh, train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

## Eighth, get user input and preprocess it
def _parse_labels(raw_text):
    """Split a comma-separated answer into a list of trimmed, non-empty labels.

    Splitting on ',' and trimming each piece accepts both "Action, Adventure"
    and "Action,Adventure". Dropping empty pieces means a blank answer gives
    no labels rather than the single label ''.
    """
    return [label.strip() for label in raw_text.split(',') if label.strip()]


def _parse_flag(raw_text):
    """Return 1 for '1', 'yes', 'y' or 'true' (any case, padded or not), else 0."""
    return 1 if raw_text.strip().lower() in {'1', 'yes', 'y', 'true'} else 0


tags_input = input("Enter the tag(s) (e.g., Action, Adventure): ")
tags_list = _parse_labels(tags_input)

genre_input = input("Enter the genre(s) (e.g., Action, Adventure): ")
genre_list = _parse_labels(genre_input)

# float() raises ValueError on a non-numeric answer; that is left to surface.
avg_players = float(input("Enter the average number of players: "))

windows_input = input("Can it be played on Windows? (0 for no, 1 for yes): ")
windows = _parse_flag(windows_input)

mac_input = input("Can it be played on Mac? (0 for no, 1 for yes): ")
mac = _parse_flag(mac_input)

linux_input = input("Can it be played on Linux? (0 for no, 1 for yes): ")
linux = _parse_flag(linux_input)

# Build the one-row frame with the same column order as the training frame, so
# the passthrough columns line up however the transformer resolves them.
new_data = pd.DataFrame(
    {
        'tags': [tags_list],
        'genre': [genre_list],
        'avg_players': [avg_players],
        'win': [windows],
        'mac': [mac],
        'linux': [linux],
    },
    columns=features,
)
# Reuse the already-fitted preprocessor; labels it did not see during fitting
# have no indicator column and therefore do not influence the prediction.
user_input_preprocessed = preprocessor.transform(new_data)

## Ninth, make a price prediction based on the user input
price_pred = model.predict(user_input_preprocessed)

## Tenth, display the predicted price
print("The predicted price is: ${:.2f}".format(price_pred[0]))


Enter the tag(s) (e.g., Action, Adventure): Co-op, Multiplayer, Survival, Submarine, SurvivalHorror, 2D, Underwater, Simulation, Sci-fi, Horror, Management, Strategy, Action, Difficult, Moddable, Gore, Violent, Singleplayer, Early Access, Naval
Enter the genre(s) (e.g., Action, Adventure): Action, Indie, Simulation, Strategy
Enter the average number of players: 7500
Can it be played on Windows? (0 for no, 1 for yes): 1
Can it be played on Mac? (0 for no, 1 for yes): 0
Can it be played on Linux? (0 for no, 1 for yes): 0
The predicted price is: $29.50




1. **Import libraries:** This chunk imports the necessary Python libraries for data manipulation, preprocessing, model creation, and evaluation. These libraries include pandas for data handling, scikit-learn for machine learning algorithms and model building, and numpy for numerical operations.

2. **Define custom class for encoding multiple labels:** This chunk defines a custom class called MultiLabelBinarizerWrapper that inherits from BaseEstimator and TransformerMixin. This class is a wrapper for the MultiLabelBinarizer from scikit-learn, which is used to convert multiple categorical labels into a binary matrix. This custom class will be used later in the pipeline to process the 'tags' and 'genre' features.

3. **Load and preprocess data:** This chunk loads the 'steam.csv' dataset into a pandas DataFrame called data. It then splits the comma-separated 'tags' and 'genre' columns into lists of individual tags and genres, making it easier to process these categorical features later.

4. **Extract features and target variable:** This chunk selects the relevant feature columns ('avg_players', 'tags', 'genre', 'win', 'mac', 'linux') and the target column ('price') for the regression problem. It then creates a feature matrix X and target vector y.

5. **Process categorical data using pipelines:** This chunk creates two pipelines, one for the 'tags' column and another for the 'genre' column. Both pipelines use the custom MultiLabelBinarizerWrapper class to convert the categorical data into a binary matrix. A ColumnTransformer is used to apply these pipelines to the appropriate columns while keeping the numeric columns unchanged.

6. **Split data into training and test sets:** This chunk uses the train_test_split function to split the preprocessed dataset into training (80%) and testing (20%) sets. The training set is used to train the model; the testing set is held out for evaluating its performance, although this script does not go on to compute any metrics on it.

7. **Train linear regression model:** This chunk creates a LinearRegression model and trains it using the training set. The trained model can then be used to make predictions on new data.

8. **Get user input and preprocess it:** This chunk prompts the user to enter information about a game, such as its tags, genre, average number of players, and platform compatibility. It then creates a new DataFrame with the user input and preprocesses it using the same pipelines and ColumnTransformer used for the original dataset.

9. **Make a price prediction based on user input:** This chunk uses the trained linear regression model to make a price prediction for the game with the user-provided information. This prediction is an estimate of the game's price based on the model's understanding of the relationships between the features and the target variable.

10. **Display the predicted price to the user:** This chunk formats and prints the predicted price for the game, giving the user an estimate of what the game's price might be based on the input data.