In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Loading Files

In [None]:
def _load_and_preview(csv_path, shape_label):
    """Read one CSV into a DataFrame and echo a short preview of it.

    Prints ``shape_label`` followed by the frame's shape, then the rows
    returned by ``head()``, then an empty separator line.

    Parameters
    ----------
    csv_path : str
        Path handed to ``pd.read_csv``.
    shape_label : str
        Text printed in front of the shape tuple.

    Returns
    -------
    pandas.DataFrame
        The loaded frame.
    """
    frame = pd.read_csv(csv_path)
    print(shape_label, frame.shape)
    print(frame.head())
    print()
    return frame


# Load the three input tables one after another, previewing each as it arrives.
data_game = _load_and_preview("./games.csv", "games.csv shape is ")
data_user = _load_and_preview("./users.csv", "users.csv shape is ")
data_rec = _load_and_preview("./recommendations.csv", "recommendations.csv shape is ")

# Draw Plots

In [None]:
# Convert the date columns to datetime in place; errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
for frame, date_column in ((data_game, 'date_release'), (data_rec, 'date')):
    frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce')

# One bar chart per column: for every rating label the column is summed
# (estimator=sum) and drawn without error bars (ci=None).
# NOTE(review): recent seaborn releases deprecate `ci` in favour of
# `errorbar=None`; confirm the installed version before switching.
rating_panels = (
    # (column to sum, panel title, y-axis label)
    ('positive_ratio', 'Total Positive Ratio by Rating', 'Total Positive Ratio'),
    ('user_reviews', 'Total User Reviews by Rating', 'Total User Reviews'),
    ('price_final', 'Total Price Final by Rating', 'Total Price Final'),
)

plt.figure(figsize=(15, 5))
for panel_number, (value_column, panel_title, value_label) in enumerate(rating_panels, start=1):
    # Panels sit side by side in a single row of three.
    plt.subplot(1, 3, panel_number)
    sns.barplot(x='rating', y=value_column, data=data_game, estimator=sum, ci=None)
    plt.title(panel_title)
    plt.xlabel('Rating')
    plt.ylabel(value_label)
    plt.xticks(rotation=45, fontsize=8)

plt.tight_layout()
plt.show()

# Release year: count games per (year, rating) pair and draw them as stacked bars.
data_game['release_year'] = data_game['date_release'].dt.year
counts_per_year_and_rating = data_game.groupby(['release_year', 'rating']).size()
# unstack() spreads the rating labels into columns; combinations that never
# occur come out as NaN and are replaced by 0.
yearly_rating_counts = counts_per_year_and_rating.unstack().fillna(0)

fig, ax = plt.subplots(figsize=(15, 5))
yearly_rating_counts.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Rating Counts by Release Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
# Anchor the legend outside the plotting area, to the right of the axes.
ax.legend(title='Rating', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=8)
fig.tight_layout()
plt.show()

# Clean Data Set


In [None]:
# Keep only fully populated rows: dropna() with its default arguments removes
# every row that holds at least one missing value.
data_game, data_user, data_rec = (
    table.dropna() for table in (data_game, data_user, data_rec)
)

# Save first 5000 rows of recommendations

In [None]:
# Cap the recommendations at their first 5000 rows.
rec_5000 = data_rec.iloc[:5000]

# Persist the tables without the index column.
# NOTE(review): the 5000-row sample is written under two different file names;
# confirm both files are still needed.
csv_outputs = (
    (rec_5000, 'recommendations_5000.csv'),
    (data_game, 'games_cleaned.csv'),
    (data_user, 'users_cleaned.csv'),
    (rec_5000, 'recommendations_cleaned_5000.csv'),
)
for table, csv_name in csv_outputs:
    table.to_csv(csv_name, index=False)

# Adding Features

# Split Data Set

In [None]:
# Read the cleaned tables back from disk.
games_cleaned = pd.read_csv('games_cleaned.csv')
users_cleaned = pd.read_csv('users_cleaned.csv')
recommendations_cleaned = pd.read_csv('recommendations_cleaned_5000.csv')

# Each table is split on its own with the same settings: 30% of the rows go to
# the test part, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.3, random_state=42)
games_train, games_test = train_test_split(games_cleaned, **split_options)
users_train, users_test = train_test_split(users_cleaned, **split_options)
recommendations_train, recommendations_test = train_test_split(
    recommendations_cleaned, **split_options
)

# Write every part to its own CSV, without the index column.
split_outputs = (
    (games_train, 'games_train.csv'),
    (games_test, 'games_test.csv'),
    (users_train, 'users_train.csv'),
    (users_test, 'users_test.csv'),
    (recommendations_train, 'recommendations_train.csv'),
    (recommendations_test, 'recommendations_test.csv'),
)
for split_frame, split_path in split_outputs:
    split_frame.to_csv(split_path, index=False)

print("Data splitting and file saving completed.")

# Feature Selection

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [None]:
# Model inputs, in column order: three columns that reach the regressor
# unchanged, followed by the three columns that are one-hot encoded first.
passthrough_columns = ['positive_ratio', 'user_reviews', 'price_final']
one_hot_columns = ['win', 'mac', 'linux']
features = passthrough_columns + one_hot_columns

# Column the model is trained to predict.
target = 'rating'

In [None]:
# Reload the game splits from the CSV files written by the split step.
games_train, games_test = (
    pd.read_csv(split_path) for split_path in ('games_train.csv', 'games_test.csv')
)
# The recommendation splits are left disabled; no cell visible here reads them.
# recommendations_train = pd.read_csv('recommendations_train.csv')
# recommendations_test = pd.read_csv('recommendations_test.csv')

# Ordinal code for each textual rating label (a higher code is a better rating).
rating_mapping = {
    'Very Positive': 5,
    'Positive': 4,
    'Mixed': 3,
    'Negative': 2,
    'Very Negative': 1
}


def _encode_rating(frame, split_name):
    """Return ``frame['rating']`` translated through ``rating_mapping``.

    ``Series.map`` with a dict turns every label that is not one of the dict's
    keys into NaN without any message, and a later ``dropna()`` then removes
    those rows. To keep that loss visible, this helper prints how many rows
    carry an unmapped label (with per-label counts) before encoding. The
    encoded values themselves are exactly what ``.map(rating_mapping)`` yields.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a ``'rating'`` column of textual labels.
    split_name : str
        Name used in the warning message (e.g. ``'train'``).

    Returns
    -------
    pandas.Series
        The encoded ratings; unmapped or missing labels are NaN.
    """
    labels = frame['rating']
    # Labels that are present but have no code; already-missing values are not counted.
    unmapped = labels[labels.notna() & ~labels.isin(list(rating_mapping))]
    if not unmapped.empty:
        # NOTE(review): the source data may use more labels than the five listed
        # above (for example 'Mostly Positive'); if any are reported here, decide
        # whether rating_mapping should be extended.
        print(
            f"Warning: {len(unmapped)} of {len(labels)} {split_name} rows have a rating "
            f"label missing from rating_mapping and will be encoded as NaN: "
            f"{unmapped.value_counts().to_dict()}"
        )
    return labels.map(rating_mapping)


games_train['rating'] = _encode_rating(games_train, 'train')
games_test['rating'] = _encode_rating(games_test, 'test')

In [None]:
# Remove every row with a missing value in any column. This includes rows whose
# rating label had no entry in rating_mapping and was therefore encoded as NaN.
games_train = games_train.dropna(axis=0, how='any')
games_test = games_test.dropna(axis=0, how='any')

In [None]:
# Separate the model inputs (feature columns) from the value to predict,
# once for the training split and once for the test split.
X_train, y_train = games_train[features], games_train[target]
X_test, y_test = games_test[features], games_test[target]

# Model Creation

In [None]:
# One-hot encode the three platform columns; every other feature column is
# handed to the regressor unchanged (remainder='passthrough').
# NOTE(review): if win/mac/linux are two-valued flags, each one expands into two
# complementary columns; consider OneHotEncoder(drop='if_binary') - confirm first.
platform_encoding = ('win_mac_linux', OneHotEncoder(), ['win', 'mac', 'linux'])
preprocessor = ColumnTransformer(
    transformers=[platform_encoding],
    remainder='passthrough',
)

# Preprocessing and regression are chained so the same transformation is
# applied when fitting and when predicting.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# fit() returns the fitted pipeline, so prediction can be chained directly.
y_pred = model.fit(X_train, y_train).predict(X_test)
print("model predict finish")

# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Mean squared error between the encoded test ratings and the predictions;
# the value is expressed in squared units of the rating code.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

# Store Submission file