
# Regression of Global Video Game Sales on Genre, Platform, Publisher, Year and Regional Sales


In [1]:
import pandas as pd
# Load the raw sales table. The path is relative, so the CSV has to sit in the
# notebook's working directory.
vg_df = pd.read_csv("vgsales.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
vg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12819 entries, 0 to 12818
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          12819 non-null  int64  
 1   Name          12819 non-null  object 
 2   Platform      12819 non-null  object 
 3   Year          12616 non-null  float64
 4   Genre         12819 non-null  object 
 5   Publisher     12783 non-null  object 
 6   NA_Sales      12818 non-null  float64
 7   EU_Sales      12818 non-null  float64
 8   JP_Sales      12818 non-null  float64
 9   Other_Sales   12818 non-null  float64
 10  Global_Sales  12818 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.1+ MB
None


In [2]:
# Drop every row that lacks a value in any column the models consume.
# In the loaded file each sales column (not only Other_Sales) has a missing
# entry, so all predictors and the Global_Sales target are listed explicitly
# instead of relying on those gaps happening to fall on the same row.
model_columns = [
    "Platform",
    "Genre",
    "Publisher",
    "Year",
    "NA_Sales",
    "EU_Sales",
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
]
vg_df = vg_df.dropna(subset=model_columns)

## Selecting Features:

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Column groups: the categorical columns are one-hot encoded, the numeric
# columns are handed to the estimator unchanged.
category_feats = ["Platform", "Genre", "Publisher"]
num_feats = ["Year", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]

# NOTE(review): the four regional sales columns look like the components of
# Global_Sales (the linear model in this notebook reports R² ≈ 1.0), which
# would make them target leakage — confirm whether they belong in the
# predictors.
features = vg_df[category_feats + num_feats]
target = vg_df["Global_Sales"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# handle_unknown="ignore" lets the encoder transform categories it did not see
# while fitting (e.g. a publisher present only in the test split) instead of
# raising an error.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), category_feats),
        ("num", "passthrough", num_feats),
    ]
)


## Linear Regression:

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

from sklearn.base import clone

# Pipeline stores the step objects it is given rather than copying them, so
# passing `preprocessor` directly makes every pipeline built from it share (and
# re-fit) one ColumnTransformer. clone() gives this model its own unfitted copy
# with identical settings.
linRegModel = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", LinearRegression()),
    ]
)

linRegModel.fit(X_trn, y_trn)

# For a regressor pipeline, score() reports R² — here on the held-out split.
print("Linear Regression R²:", linRegModel.score(X_tst, y_tst))

Linear Regression R²: 0.9999944099762287


## Tree-Based Models:

In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.base import clone

# Pipeline stores the step objects it is given rather than copying them, so
# each model below receives its own clone of `preprocessor`. Without this, both
# pipelines would hold the very same ColumnTransformer, and fitting one would
# silently re-fit the preprocessing step of the other.

# Random Forest
model_rf = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", RandomForestRegressor(n_estimators=200, random_state=42)),
    ]
)

model_rf.fit(X_trn, y_trn)
# score() reports R² on the held-out split.
print("Random Forest R²:", model_rf.score(X_tst, y_tst))

# Gradient Boosting
model_gb = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", GradientBoostingRegressor(n_estimators=200, random_state=42)),
    ]
)

model_gb.fit(X_trn, y_trn)
print("Gradient Boosting R²:", model_gb.score(X_tst, y_tst))

Random Forest R²: 0.8181689903246177
Gradient Boosting R²: 0.8389639114237593


## Evaluation (Random Forest):

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error metrics for the random forest pipeline on the held-out split.
y_pred = model_rf.predict(X_tst)

# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6
# (where passing it raises TypeError), so the root is taken explicitly; this
# form works on every scikit-learn version.
print("RMSE:", mean_squared_error(y_tst, y_pred) ** 0.5)
print("MAE:", mean_absolute_error(y_tst, y_pred))

RMSE: 1.0002975103009797
MAE: 0.056007143991830204


