# Data Preprocessing

STEP1 : Importing libraries


In [18]:
import pandas as pd
import numpy as np

STEP2 : Read Dataset

In [19]:
steam_data=pd.read_csv('steam2.csv')
steam_data.shape 
#steam_data = steam_data.sample(2000).reset_index(drop=True)


(2000, 18)

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#head
steam_data.head()

In [None]:
#tail
steam_data.tail()

STEP3 : sanity check of the data


In [17]:
#shape
steam_data.shape

(2000, 19)

In [None]:
#info()
steam_data.info()

In [None]:
#finding missing value
steam_data.isnull().sum()

In [None]:
#finding percentage of the missing value
steam_data.isnull().sum()/steam_data.shape[0]*100

In [None]:
#finding duplicates
steam_data.duplicated(subset = ["appid"] ).sum()

In [None]:
#Summary statistics (mean, median, etc.)
print("\nSummary statistics:")
print(steam_data.describe())

In [None]:
#identifing garbage value
for i in steam_data.select_dtypes(include='object').columns:
  print(steam_data[i].value_counts())
  print("***"*10)
  # print(i)
  # print(steam_data[i].unique())

STEP4: Exploratory Data Analysis
(EDA)

In [None]:
#descriptive statistics
steam_data.describe()

In [None]:
steam_data.describe(include='object').T

In [None]:
#histogram to understand the distribution
# steam_data.hist(figsize=(15,10))
# plt.show()
for i in steam_data.select_dtypes(include='number').columns:
  sns.histplot(steam_data[i])
  plt.show()

In [None]:
#Boxplot to identify outliers
for i in steam_data.select_dtypes(include='number').columns:
  sns.boxplot(data=steam_data,x=i)
  plt.show()

In [None]:
#scatter plot to understand the relationship
for i in ['appid', 'english', 'required_age', 'achievements',
       'negative_ratings', 'average_playtime', 'median_playtime', 'price']:
  sns.scatterplot(data=steam_data,x=i,y='positive_ratings')
  plt.show()

In [None]:
steam_data.select_dtypes(include='number').columns

In [None]:
#corelation with heatmap to interpret the relation and multicolliniarity
s=steam_data.select_dtypes(include='number').corr()
plt.figure(figsize=(32,18))
sns.heatmap(s,annot=True)

In [None]:
#finding duplicates
steam_data.duplicated().sum()

# Model Building



### Content-Based Filtering Implementation


Vectorizing genres, tags, developer

In [20]:
#Check for NaN Values
print(steam_data[['genres', 'developer', 'steamspy_tags', 'categories','publisher']].isnull().sum())

genres           0
developer        0
steamspy_tags    0
categories       0
publisher        0
dtype: int64


In [4]:
#Fill NaN Values with an empty string
steam_data['developer'] = steam_data['developer'].fillna('')
steam_data['publisher'] = steam_data['publisher'].fillna('')

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine relevant columns into a single column for vectorization
steam_data['combined_features'] = steam_data['genres'] + ' ' + steam_data['developer'] + ' ' + steam_data['steamspy_tags'] + ' ' + steam_data['categories'] + steam_data['publisher']

# Use TF-IDF Vectorizer to transform the combined features
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(steam_data['combined_features'])

# Check the shape of the tfidf_matrix
print(tfidf_matrix.shape)


(2000, 3354)


Calculate cosine Similarity Between Games


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between all games
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display the cosine similarity matrix shape
print(cosine_sim.shape)

(2000, 2000)


Game Recommendation Function

In [23]:
# Recommendation Function
def get_recommendations(game_title, cosine_sim=cosine_sim):
    # Check if the game title exists in the DataFrame
    if game_title not in steam_data['name'].values:
        print(f"Game title '{game_title}' not found in the dataset.")
        return None  # Return None or an empty list

    # Get the index of the game that matches the title
    idx = steam_data[steam_data['name'] == game_title].index[0]

    # Get the pairwise similarity scores of all games with that game
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the games based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N most similar games
    sim_scores = sim_scores[1:11]  # Exclude the first game (itself)

    # Get the game indices
    game_indices = [i[0] for i in sim_scores]

    # Return the top N most similar games
    return steam_data['name'].iloc[game_indices]

In [24]:
# Print out a sample of game titles to verify
print(steam_data['name'].sample(10))  # Prints 10 random game titles

1118                            Rise to Ruins
1097                            NEO Scavenger
1103           Friday the 13th: Killer Puzzle
954          Crash Bandicoot™ N. Sane Trilogy
635                                    UnEpic
1116                       Narcissu 1st & 2nd
1538                             Renegade Ops
1408    ATOM RPG: Post-apocalyptic indie game
750                                 Her Story
109                                FOR HONOR™
Name: name, dtype: object


In [25]:
# Example: Get top 10 recommendations for a specific game
game_title = 'Unreal Tournament 3 Black'  # Replace with an actual game name
recommendations = get_recommendations(game_title)
if recommendations is not None:
    print(recommendations)


1353        Unreal Tournament: Game of the Year Edition
1949                                        Unreal Gold
1315    Unreal Tournament 2004: Editor's Choice Edition
832                                       Murder Miners
1614                             Alien Rage - Unlimited
1198                                               20XX
702                                 RUNNING WITH RIFLES
940        Boring Man - Online Tactical Stickman Combat
1313                                        Enemy Front
657                                          Battleborn
Name: name, dtype: object


In [26]:
import pickle

In [27]:
pickle.dump(steam_data.to_dict(),open('game_dict.pkl','wb'))
pickle.dump(cosine_sim,open('similarity.pkl','wb'))
#