<a href="https://colab.research.google.com/github/psk2004/Video-Games-Popularity-Analysis/blob/main/game_popularity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data

In [None]:
# Location of the raw games CSV (the path suggests a mounted Google Drive in Colab).
GAMES_CSV_PATH = "/content/drive/MyDrive/games.csv"

# Read the dataset and preview its first rows.
df_games = pd.read_csv(GAMES_CSV_PATH)
df_games.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df_games.shape

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df_games.info()

# Data Cleaning

In [None]:
# Count the missing values in every column of the dataset.
df_games.isnull().sum()

In [None]:
# Collect the rows that exactly repeat an earlier row.
is_repeated_row = df_games.duplicated()
duplicate = df_games[is_repeated_row]
duplicate

In [None]:
# Per column: True when the column contains at least one missing (NaN) value.
df_games.isna().any()

In [None]:
# Show the rows that have a missing Team, Rating or Summary.
team_null = df_games.loc[df_games['Team'].isna()]
rating_null = df_games.loc[df_games['Rating'].isna()]
summary_null = df_games.loc[df_games['Summary'].isna()]
print(team_null, rating_null, summary_null)

In [None]:
# Rows whose release date has not been disclosed yet.
is_tbd_release = df_games['Release Date'].eq('releases on TBD')
tbd_df = df_games.loc[is_tbd_release]
tbd_df

In [None]:
# Count columns that are converted from text to integers, so that numeric
# analysis on them becomes possible.
columns_to_convert = [
    "Number of Reviews",
    "Plays",
    "Playing",
    "Backlogs",
    "Wishlist",
    "Times Listed",
]
def convert_k_to_int(x):
    """Convert a count such as ``"3.9K"`` or ``"17"`` to an integer.

    Parameters
    ----------
    x : str or number
        A count written as text, optionally ending in ``K`` (thousands) or
        ``M`` (millions), in either case. Values that are already numeric
        are accepted too, so applying the function twice is harmless.

    Returns
    -------
    int
        The count as an integer. A missing value (NaN / None) is returned
        unchanged so that it stays recognisable as missing.

    Raises
    ------
    ValueError
        If the text is not a number with an optional K/M suffix.
    """
    # Already-converted or missing cells: nothing to parse.
    if not isinstance(x, str):
        return x if pd.isna(x) else int(x)

    text = x.strip()
    multipliers = {"K": 1_000, "M": 1_000_000}
    suffix = text[-1:].upper()
    if suffix in multipliers:
        # round() before int(): binary floats make e.g. 32.3 * 1000 come out
        # as 32299.999999999996, which a bare int() would truncate to 32299.
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)

# Apply the conversion to every cell of the selected columns.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; the element-wise result is the same.
df_games[columns_to_convert] = df_games[columns_to_convert].apply(
    lambda column: column.map(convert_k_to_int)
)
df_games[columns_to_convert]

In [None]:
# Drop the games whose release date is not disclosed. They are selected by
# the 'releases on TBD' marker itself rather than by a hard-coded row label,
# so the cell does not depend on the row order of one particular CSV export.
has_tbd_release = df_games['Release Date'] == 'releases on TBD'
df_games = df_games.loc[~has_tbd_release].reset_index(drop=True)

# Parse the remaining release dates into datetime values.
df_games['Release Date'] = pd.to_datetime(df_games['Release Date'])

# Split each date into numeric year / month / day columns for later grouping.
df_games['Year'] = df_games['Release Date'].dt.year
df_games['Month'] = df_games['Release Date'].dt.month
df_games['Day'] = df_games['Release Date'].dt.day



In [None]:
# Give the "Unnamed: 0" column the shorter label "#", then display the frame.
df_games = df_games.rename(columns={"Unnamed: 0":"#"})
df_games

## Inserting a new column for analysis

In [None]:
# New column 'Popularity' = Plays + Wishlist, i.e. the people who own the game
# plus the people who want it. The number of reviews, the people currently
# playing and the people who have shelved the game only influence popularity,
# so they are deliberately not part of the sum.
df_games["Popularity"] = df_games["Plays"] + df_games["Wishlist"]

In [None]:
# Display the dataset after the cleaning steps and the new column.
df_games

# Analysing our Data

In [None]:
# Number of games per rating value, most frequent rating first.
df_games['Rating'].value_counts()

In [None]:
# Number of games per distinct value of the Genres column, most frequent first.
df_games['Genres'].value_counts()

In [None]:
# Number of games per release month, most frequent month first.
df_games['Month'].value_counts()

In [None]:
# Box plot of how many times the games have been listed.
plt.figure(figsize=(10, 6))
# The column is passed by keyword: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases, so a bare
# positional Series does not give the same orientation everywhere.
sns.boxplot(y=df_games["Times Listed"])
plt.title('Distribution of the times, games have been listed')
plt.show()

In [None]:
# Scatter plot relating each game's rating to its popularity.
sns.scatterplot(data=df_games, x="Rating", y="Popularity")
plt.xlabel('Rating')
plt.ylabel('Popularity')
plt.xticks(rotation=45)
plt.title('Popularity vs Rating')
plt.show()

In [None]:

# Top 10 games by the number of people playing them at present.
playing_by_title = df_games.groupby("Title")["Playing"].max().reset_index()
sorted_df = playing_by_title.sort_values(by="Playing", ascending=False).head(10)
sorted_df

In [None]:
# Top 10 shelved games, i.e. the games most people have put in their backlog.
backlogs_by_title = df_games.groupby("Title")["Backlogs"].max().reset_index()
sorted_df = backlogs_by_title.sort_values(by="Backlogs", ascending=False).head(10)
sorted_df

In [None]:
# Top 10 games by number of reviews.
reviews_by_title = df_games.groupby("Title")["Number of Reviews"].max().reset_index()
sorted_df = reviews_by_title.sort_values(by="Number of Reviews", ascending=False).head(10)
sorted_df

In [None]:
# The 25 oldest games, ordered by release date (earliest first).
# A title that occurs more than once is represented by its latest release date.
release_by_title = df_games.groupby("Title")["Release Date"].max().reset_index()
sorted_df = release_by_title.sort_values(by="Release Date", ascending=True).head(25)
sorted_df

In [None]:

# The 25 highest rated games together with their popularity and number of
# reviews, to check whether highly rated games are also the most popular ones.
reviews_by_rating_title = (
    df_games.groupby(["Rating", "Title", "Popularity"])["Number of Reviews"]
    .max()
    .reset_index()
)
sorted_df = reviews_by_rating_title.sort_values(by="Rating", ascending=False).head(25)
sorted_df

In [None]:
# Summary statistics for each numerical column, transposed so that every
# column of the dataset becomes one row of the table.
df_games.describe().transpose()

In [None]:
# Look up the title of the game stored at a user-chosen row position.
# The valid range is derived from the frame instead of being hard-coded, so
# the prompt stays correct when rows are added or dropped.
n_games = len(df_games)
x = int(input(f"Enter the index of the game you want to search for (0-{n_games - 1}): "))
# Reject out-of-range values explicitly; a negative position would otherwise
# silently count from the end of the frame.
if not 0 <= x < n_games:
    raise IndexError(f"index {x} is outside the valid range 0-{n_games - 1}")
df_games[["Title"]].iloc[x]

In [None]:
# Correlation between popularity, ratings and the number of reviews for games.
# The three columns are selected, not dropped: dropping them correlated every
# other column instead, and DataFrame.corr on the remaining text columns
# raises an error on recent pandas releases.
corr_df = df_games[["Popularity", "Rating", "Number of Reviews"]].corr(method="pearson")
plt.figure(figsize=(14,6))
heatmap = sns.heatmap(corr_df,annot = True,fmt = '.1g', vmin = -1, vmax = 1, center = 0, cmap = "inferno", linewidth = 1, linecolor ="Black")
heatmap.set_title("Correlation HeatMap Between Variable")
heatmap.set_xticklabels(heatmap.get_xticklabels(),rotation = 90)
# Render the figure; this also keeps the list of tick labels out of the cell output.
plt.show()

In [None]:
# Scatter plot with a fitted regression line: number of reviews against popularity.
plt.figure(figsize=(10, 6))
reviews_ax = sns.regplot(data=df_games, x="Number of Reviews", y="Popularity", color="c")
reviews_ax.set(title=" Correlation b/w number of Reviews v/s popularity")

In [None]:
# The 10 most frequent rating values with their counts; the index of the
# result holds the rating values themselves, which is what gets displayed.
rating_count = df_games['Rating'].value_counts().head(10)
rating_count.index

Float64Index([4.1, 4.0, 3.7, 4.2, 3.9, 3.6, 3.5, 3.8, 4.3, 3.4], dtype='float64')

In [None]:
# Pie chart of how the games are distributed over the most frequent ratings.
# Wedge sizes are the counts (rating_count.values); the rating values in the
# index are only the labels. Sizing the wedges by the index would draw the
# rating numbers themselves instead of how many games have each rating.
plt.pie(rating_count.values,labels=rating_count.index,autopct="%1.1f%%",startangle=90)
plt.title("Distribution of Ratings")
plt.show()

In [None]:
# Total number of reviews of the games released in each year.
# Author's interpretation: as access to technology grew year after year,
# reviewers could reach more games, which led to more reviews.
df_games.groupby("Year")["Number of Reviews"].sum()

In [None]:
# Total popularity of the games released in each year, to see how the
# popularity of video games developed over the years.
df_games.groupby("Year")["Popularity"].sum()

In [None]:
# Histogram of release years (20 bins): how many games were released over time.
release_years = df_games["Year"]
plt.hist(release_years, bins=20, color="skyblue", edgecolor="black")
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.title("Releases of games in a year")
plt.show()

In [None]:
# Mean popularity per distinct value of the Genres column, drawn as a line
# with one marker per genre value.
genre_trend_by_popularity = df_games.groupby("Genres")["Popularity"].mean()

genre_trend_by_popularity.plot.line(marker="o", linestyle="-", color="orange")
plt.xlabel("Genres")
plt.ylabel("Popularity")
plt.title("Popularity trend over genres")
plt.show()

In [None]:
# Top 25 most popular video game developers, ranked by the popularity of
# their most popular game. nlargest() does the ranking; a plain head(25) on
# the grouped result would return the first 25 teams in alphabetical order.
team_popularity_bar_graph = df_games.groupby("Team")["Popularity"].max().nlargest(25)
plt.figure(figsize=(10,10))
team_popularity_bar_graph.plot(kind='bar')
plt.title("Popularity trend over video game makers")
plt.xlabel("Teams")
plt.ylabel("Popularity")
plt.show()

In [None]:
# Box plot showing the distribution of the number of reviews.
# The column is passed by keyword: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases.
sns.boxplot(y=df_games["Number of Reviews"])
plt.title("Box plt: distribution of number of reviews")
plt.show()

In [None]:
# Per release year: the number of games and the total number of reviews.
# - Games are counted through the "#" id column; summing that column would
#   add up row numbers, which is not a number of games.
# - The aggregation is given as a dict because selecting several GroupBy
#   columns with a bare tuple is rejected by recent pandas releases.
total_games_reviews_by_year = df_games.groupby("Year").agg(
    {"#": "count", "Number of Reviews": "sum"}
)
# Years with the most games come first.
total_games_reviews_by_year.sort_values(ascending = False,by = "#").plot(kind = "bar",stacked=True,colormap="viridis")
plt.title("Total games and number of reviews by Year")
plt.xlabel("Year")
plt.ylabel("Total Value")
plt.show()

In [None]:
# Games with a popularity above 32000, most popular first; show the top five.
popularity_above_threshold = df_games.query('Popularity>32000')
most_popular = popularity_above_threshold.sort_values('Popularity', ascending=False)
most_popular.head(5)

In [None]:
# Games released in February in chronological order; show the earliest five.
february_rows = df_games.query('Month==2')
most_popular = february_rows.sort_values('Release Date', ascending=True)
most_popular.head(5)

In [None]:
# Popularity of the first 10 games in alphabetical order of their title.
team_rating_bar_graph = ((df_games.groupby("Title")["Popularity"]).max()).head(10)
plt.figure(figsize=(10,10))
team_rating_bar_graph.plot(kind='bar')
# The bars show Popularity, so the title and the y axis are labelled with it
# (they previously said "ratings").
plt.title("Games v/s popularity")
plt.xlabel("Games")
plt.ylabel("Popularity")
plt.show()

In [None]:
# The 25 highest rated games.
rating_by_title = df_games.groupby("Title")["Rating"].max().reset_index()
sorted_df = rating_by_title.sort_values(by="Rating", ascending=False).head(25)
sorted_df

In [None]:
# Top 10 wishlisted games.
wishlist_by_title = df_games.groupby("Title")["Wishlist"].max().reset_index()
sorted_df = wishlist_by_title.sort_values(by="Wishlist", ascending=False).head(10)
sorted_df

In [None]:
# The 10 most shelved (backlogged) games shown together with their rating,
# to see whether highly rated games end up shelved.
backlogs_by_title_rating = df_games.groupby(["Title", "Rating"])["Backlogs"].max().reset_index()
sorted_df = backlogs_by_title_rating.sort_values(by="Backlogs", ascending=False).head(10)
sorted_df

In [None]:
# The 10 games most people have been playing recently, with their rating.
playing_by_title_rating = df_games.groupby(["Title", "Rating"])["Playing"].max().reset_index()
sorted_df = playing_by_title_rating.sort_values(by="Playing", ascending=False).head(10)
sorted_df

In [None]:
# Scatter plot of Rating against Playing for the rows currently held in sorted_df.
# NOTE(review): sorted_df is reassigned by many cells, so this plot depends on
# which of them ran last; in file order that is the top-10 "Playing" table.
sorted_df.plot(kind='scatter', x='Rating', y='Playing', s=32, alpha=.8)
# Hide the top and right borders of the axes.
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Correlation between those who have games, who play them and who have shelved them.
# The three columns are selected, not dropped: dropping them correlated every
# other column instead, and DataFrame.corr on the remaining text columns
# raises an error on recent pandas releases.
corr_df = df_games[["Plays", "Playing", "Backlogs"]].corr(method="pearson")
plt.figure(figsize=(14,6))
heatmap = sns.heatmap(corr_df,annot = True,fmt = '.1g', vmin = -1, vmax = 1, center = 0, cmap = "inferno", linewidth = 1, linecolor ="Black")
heatmap.set_title("Correlation b/w those who have games, who play them and who have shelfed them")
heatmap.set_xticklabels(heatmap.get_xticklabels(),rotation = 90)
# Render the figure; this also keeps the list of tick labels out of the cell output.
plt.show()