In [None]:
# Data analysis 
import numpy as np
import pandas as pd

#Data visualization 
import matplotlib.pyplot as plt
import seaborn as sns
from _plotly_future_ import v4_subplots
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go 
import plotly.graph_objects as go

# Imputing missing values
from scipy.stats import chi2_contingency

#Feature 
#Rank - Ranking of overall sales
#Name - The game's name
#Platform - Platform of the games release (i.e. PC,PS4, etc.)
#Year - Year of the game's release
#Genre - Genre of the game
#Publisher - Publisher of the game
#NA_Sales - Sales in North America (in millions)
#EU_Sales - Sales in Europe (in millions)
#JP_Sales - Sales in Japan (in millions)
#Other_Sales - Sales in the rest of the world (in millions)
#Global_Sales - Total worldwide sales.

## Data Exploration 

In [None]:
vg = pd.read_csv("../input/videogamesales/vgsales.csv")   # load the video game sales CSV into the DataFrame `vg`

In [None]:
vg.head(10)  # preview the first 10 rows of the data

In [None]:
vg.info()  # per-column dtype and non-null count

In [None]:
vg.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
vg.columns  # list the column names

In [None]:
vg.shape # (number of rows, number of columns) of the loaded data

In [None]:
# Check for duplicated records: count the rows that are exact copies of an
# earlier row. The previous drop_duplicates(...) call discarded its result, so
# it neither reported nor removed anything and only dumped the whole frame.
# Its `subset` named all eleven columns from the feature description, so
# comparing every column (the default) performs the same check.
vg.duplicated().sum()

In [None]:
# Rank is only the ranking by overall sales, so it is dropped from the data.
# errors='ignore' keeps this cell idempotent: running it a second time, when
# Rank is already gone, no longer raises a KeyError.
vg.drop(columns=['Rank'], inplace=True, errors='ignore')

In [None]:
vg.describe()  # summary statistics again, to see what changed after dropping Rank

In [None]:
# (rows, columns) after the Rank column was dropped
vg.shape

In [None]:
vg.isnull().sum() # number of missing values in each column

In [None]:
# Visualise where the missing values sit: one heatmap cell per DataFrame cell
missing_mask = vg.isnull()
sns.heatmap(missing_mask, cbar=False)

In [None]:
# Discard, in place, every row that has at least one missing value
vg.dropna(axis=0, how='any', inplace=True)


In [None]:
vg.isnull().sum()  # missing values per column after the dropna step

In [None]:
# (rows, columns) after the rows with missing values were removed
vg.shape

## Data visualization 

In [None]:
# Donut chart: how global sales split across the ten best-selling release years
import plotly
import plotly.express as px

# Yearly totals, largest first, trimmed to the top ten
grouped = (
    vg.groupby(vg.Year)[["Global_Sales"]]
    .sum()
    .sort_values(by="Global_Sales", ascending=False)
    .head(10)
)

fig = px.pie(
    grouped,
    names=grouped.index,
    values="Global_Sales",
    hole=0.4,
    template="seaborn",
    color_discrete_sequence=px.colors.sequential.Inferno,
)

# Caption passed as a layout annotation
center_caption = dict(text="Global Sales", font_size=20, showarrow=False, opacity=0.7)
fig.update_layout(
    title="Top 10 years for gaming market",
    paper_bgcolor="rgb(230,230,230)",
    plot_bgcolor="rgb(243,243,243)",
    annotations=[center_caption],
)
fig.update_traces(rotation=90, pull=0.01, textinfo="percent+label")
fig.show()

In [None]:
# Total revenue by region (NA_Sales through Other_Sales).
# The regional columns are selected by name rather than by position
# (previously iloc[:, -5:-1]) so the result cannot silently change if the
# column order of `vg` ever does.
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
region_sales = vg[region_columns].sum(axis=0).reset_index()
region_sales.columns = ['Region', 'Sales']
region_sales

In [None]:
# Bar chart of the total revenue of each region
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=region_sales, x='Region', y='Sales', palette='viridis', ax=ax)
ax.set_title("Revenue by Region", fontsize=15)
ax.set_ylabel("Sales (in millions)", fontsize=12)
plt.show()

In [None]:
# Line chart of the total global sales per release year.
# aggfunc="sum" gives the same totals as np.sum without the deprecation
# warning newer pandas versions emit for NumPy callables.
total_sales_year = pd.pivot_table(vg, index="Year", values="Global_Sales", aggfunc="sum")
# print(total_sales_year)  # uncomment to list the total for each year
plt.plot(total_sales_year.index, total_sales_year["Global_Sales"], color='red', marker="*")
plt.title("Total Sale's Year Wise")
plt.xlabel("Year's")
# The pivot sums the sales, so the axis is labelled as a total; the previous
# label ("avg global_sales") described an average that is never computed.
plt.ylabel("Total global sales")
plt.show()

In [None]:
# Horizontal bar chart of the total global sales per release year.
year_of_game_sales = pd.pivot_table(vg, index="Year",
                                    values="Global_Sales",
                                    aggfunc="sum")

# Size this figure only, instead of overwriting plt.rcParams for every later
# plot in the notebook.
fig, ax = plt.subplots(figsize=(20, 15))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, so the old call raises a TypeError there.
sns.barplot(x=year_of_game_sales["Global_Sales"],
            y=year_of_game_sales.index,
            orient="h",
            ax=ax)
ax.set_title("Year of global game sales :")
plt.show()

In [None]:
# Scatter plot: global sales of every game against its release year
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=vg, x='Year', y='Global_Sales', ax=ax)

# Name@Global sales

In [None]:
# Total sales per game title (all rows sharing the same Name are added up),
# ordered by global sales, largest first.
# Only the sales columns are aggregated: a bare groupby(...).sum() also adds
# up Year, which is meaningless, and on pandas >= 2.0 it concatenates the
# text columns as well.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
Name_of_game = (
    vg.groupby('Name')[sales_columns]
    .sum()
    .sort_values('Global_Sales', ascending=False)
)
Name_of_game.head(10)  # the ten best-selling titles

In [None]:
# Pie chart: share of global sales among the ten best-selling games
import plotly
import plotly.express as px

# Mirror the index into a 'Name' column so the chart can refer to it by name
Name_of_game['Name'] = Name_of_game.index

top_ten_games = Name_of_game.head(10)
pie_chart = px.pie(top_ten_games, names='Name', values='Global_Sales')
pie_chart

# Publisher&global Sales 

In [None]:
# Total sales per publisher, ordered by global sales, largest first.
# Only the sales columns are aggregated: a bare groupby(...).sum() also adds
# up Year, which is meaningless, and on pandas >= 2.0 it concatenates the
# text columns as well.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_by_publisher = (
    vg.groupby('Publisher')[sales_columns]
    .sum()
    .sort_values('Global_Sales', ascending=False)
)
sales_by_publisher.head(10)  # the ten biggest publishers

In [None]:
# Pie chart: share of global sales among the ten biggest publishers

# Mirror the index into a 'Publisher' column so the chart can refer to it by name
sales_by_publisher['Publisher'] = sales_by_publisher.index

top_ten_publishers = sales_by_publisher.head(10)
pie_chart = px.pie(top_ten_publishers, names='Publisher', values='Global_Sales')
pie_chart

In [None]:
# Horizontal bar chart of the three publishers with the highest global sales.
top_publishers = (
    pd.pivot_table(vg, index="Publisher", values="Global_Sales", aggfunc="sum")
    .sort_values(by="Global_Sales", ascending=False)
    .head(3)
)

# Size this figure only, instead of overwriting plt.rcParams for every later
# plot in the notebook.
fig, ax = plt.subplots(figsize=(8, 6))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, so the old call raises a TypeError there.
sns.barplot(x=top_publishers["Global_Sales"], y=top_publishers.index, orient="h", ax=ax)
ax.set_title("Top3 seller  wise global game sales :")
plt.show()

# Platform&global_sales


In [None]:
# Total sales per platform, ordered by global sales, largest first.
# NOTE: the frame keeps the name `sales_by_publisher` only because the next
# cell reads it under that name; from this point on it holds per-platform
# totals, not per-publisher ones.
# Only the sales columns are aggregated: a bare groupby(...).sum() also adds
# up Year, which is meaningless, and on pandas >= 2.0 it concatenates the
# text columns as well.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_by_publisher = (
    vg.groupby('Platform')[sales_columns]
    .sum()
    .sort_values('Global_Sales', ascending=False)
)
sales_by_publisher.head(10)  # the ten best-selling platforms

In [None]:
# Pie chart: share of global sales among the ten best-selling platforms

# Mirror the index into a 'Platform' column so the chart can refer to it by name
sales_by_publisher['Platform'] = sales_by_publisher.index

top_ten_platforms = sales_by_publisher.head(10)
pie_chart = px.pie(top_ten_platforms, names='Platform', values='Global_Sales')
pie_chart

In [None]:
# Horizontal bar chart of the ten platforms with the highest global sales.
platform_wise_game_sales = (
    pd.pivot_table(vg, index="Platform", values="Global_Sales", aggfunc="sum")
    .sort_values(by="Global_Sales", ascending=False)
    .head(10)
)

# Size this figure only, instead of overwriting plt.rcParams for every later
# plot in the notebook.
fig, ax = plt.subplots(figsize=(8, 6))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, so the old call raises a TypeError there.
sns.barplot(x=platform_wise_game_sales["Global_Sales"],
            y=platform_wise_game_sales.index,
            orient="h",
            ax=ax)
ax.set_title("Top 10 Platform wise global game sales :")
plt.show()

In [None]:
# Sales of each platform across the regions, ordered by global sales.
# The sales columns are picked by name. The old `.iloc[:, 2:]` slice relied on
# whatever columns groupby(...).sum() happened to return: with Rank already
# dropped it skips NA_Sales when only numeric columns are summed, and keeps
# text columns on pandas >= 2.0.
# NOTE(review): presumably the intent was all five sales columns - confirm.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
platform_sales = (
    vg.groupby('Platform')[sales_columns]
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
)
platform_sales.head()

In [None]:
# Scatter plot: global sales of every game, grouped along the x axis by platform
fig, ax = plt.subplots(figsize=(30, 8))
sns.scatterplot(data=vg, x='Platform', y='Global_Sales', ax=ax)

# Genre of game

In [None]:
# Horizontal bar chart of the (up to) ten genres with the highest global sales.
type_of_the_game = (
    pd.pivot_table(vg, index="Genre", values="Global_Sales", aggfunc="sum")
    .sort_values(by="Global_Sales", ascending=False)
    .head(10)
)

# Size this figure only, instead of overwriting plt.rcParams for every later
# plot in the notebook.
fig, ax = plt.subplots(figsize=(8, 6))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, so the old call raises a TypeError there.
sns.barplot(x=type_of_the_game["Global_Sales"], y=type_of_the_game.index, orient="h", ax=ax)
ax.set_title("Top 10 Genre wise global game sales :")
plt.show()

In [None]:
# Horizontal bar chart of the three genres with the highest global sales.
type_of_the_game = (
    pd.pivot_table(vg, index="Genre", values="Global_Sales", aggfunc="sum")
    .sort_values(by="Global_Sales", ascending=False)
    .head(3)
)

# Size this figure only, instead of overwriting plt.rcParams for every later
# plot in the notebook.
fig, ax = plt.subplots(figsize=(8, 6))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, so the old call raises a TypeError there.
sns.barplot(x=type_of_the_game["Global_Sales"], y=type_of_the_game.index, orient="h", ax=ax)
# The data is grouped by Genre, so the title names genres (it used to say
# "games", which mislabelled the chart).
ax.set_title("Top 3 genres by global game sales :")
plt.show()

In [None]:
# Number of games per genre, reported in the fixed order listed below.
# One value_counts() pass replaces eleven separate comparisons. The old code
# compared against ' Misc' with a leading space (presumably a typo), which
# cannot match a genre stored as 'Misc'.
# NOTE(review): 'Adventure' is not in this list - confirm whether that genre
# was left out on purpose.
genres = ['Action', 'Sports', 'Shooter', 'Role-Playing', 'Misc', 'Strategy',
          'Puzzle', 'Simulation', 'Racing', 'Fighting', 'Platform']
genre_counts = vg['Genre'].value_counts()
# .get(..., 0) reports 0 for a genre that does not occur; int() keeps the
# printed list free of NumPy scalar wrappers.
a = [int(genre_counts.get(genre, 0)) for genre in genres]
print(a)

In [None]:
# Stacked bar chart: global sales of each year, split by genre
vgs = vg.groupby(['Year', 'Genre']).agg({'Global_Sales': np.sum})

# One row per year, one column per genre
sales_by_year_and_genre = vgs['Global_Sales'].unstack()
ax = sales_by_year_and_genre.plot.bar(stacked=True, figsize=(15, 10))
# Legend anchored outside the axes area, to the upper right of the plot
ax.legend(title='Game Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_title('Global Sales')
ax.set_ylabel('Sales (Millions)')
ax.set_xlabel('Years')

In [None]:
# Scatter plot: global sales of every game, grouped along the x axis by genre
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(data=vg, x='Genre', y='Global_Sales', ax=ax)

# Sales categories

In [None]:
# Yearly sales of each region, drawn as one line per region.
# A stray copy of the Platform-vs-Global_Sales scatter plot used to open this
# cell; it has nothing to do with the regional trend and is already drawn in
# its own cell, so it was removed here.
region_labels = {
    'NA_Sales': 'North America',
    'JP_Sales': 'Japan',
    'EU_Sales': 'Europe',
    'Other_Sales': 'Others',
}
# One groupby for all regions instead of four separate ones
yearly_region_sales = vg.groupby('Year')[list(region_labels)].sum()

plt.figure(figsize=(15, 10))
plt.grid()
for column, label in region_labels.items():
    plt.plot(yearly_region_sales.index, yearly_region_sales[column].values, label=label)
# One tick per year present in the data
plt.xticks(yearly_region_sales.index, rotation='vertical', size=8)
plt.legend()
plt.ylabel('Sales [in millions]')
plt.xlabel('Year')
plt.show()

In [None]:
# Pie chart of how one release year's sales split across the regions.
year = 2006  # release year to inspect; change it to look at another year

# Sum only the regional columns. A bare groupby('Year').sum() also returns the
# text columns on pandas >= 2.0, and plt.pie cannot draw those.
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
regional_sales_by_year = vg.groupby('Year')[region_columns].sum()
# Raises KeyError if `year` does not occur in the data
year_sales = regional_sales_by_year.loc[year]

plt.figure(figsize=(9, 7))
plt.title("Distribution of Game Sales in " + str(year))
plt.pie(year_sales.values, autopct='%.2f %% ', labels=year_sales.index)
plt.show()

In [None]:
# Animated 3D scatter of North America vs. Europe vs. Japan sales.
# Only rows with Year > 2000 are plotted, sorted by Year, which is also the
# animation frame; marker size follows Other_Sales, colour follows Genre.
games_after_2000 = vg[vg["Year"] > 2000].sort_values(by="Year", ascending=True)
axis_labels = {
    "NA_Sales": "NA Sales",
    "EU_Sales": "EU Sales",
    "JP_Sales": "JP Sales",
}

fig = px.scatter_3d(
    games_after_2000,
    x="NA_Sales",
    y="EU_Sales",
    z="JP_Sales",
    color="Genre",
    size="Other_Sales",
    size_max=50,
    opacity=0.8,
    animation_frame="Year",
    labels=axis_labels,
    width=800,
    height=600,
)

fig.update_layout(
    title="3D scatter plot between North America , Europe and Japan Sales",
    showlegend=True,
    paper_bgcolor='rgb(230,230,230)',
    plot_bgcolor="rgb(243,243,243)",
)
fig.show()

# figure of relation

In [None]:
# Heatmap of the number of games per release year (rows) and genre (columns).
table_count = pd.pivot_table(vg, values=['Global_Sales'], index=['Year'],
                             columns=['Genre'], aggfunc='count', margins=False)

plt.figure(figsize=(19, 16))
# The cells hold counts, so they are annotated without decimals; the previous
# '0.3f' format printed them with three meaningless decimal places.
sns.heatmap(table_count['Global_Sales'], linewidths=.5, annot=True, fmt='.0f', vmin=0)
plt.title('Count of games')

In [None]:
# Correlation heatmap of the numeric columns.
# The frame is restricted to numeric dtypes first: on pandas >= 2.0
# DataFrame.corr() no longer skips text columns (Name, Platform, Genre, ...)
# and raises an error instead.
cor_mat = vg.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(15, 10))
# Draw on the axes created above rather than relying on the implicit current axes
sns.heatmap(cor_mat, annot=True, linewidths=0.5, fmt=".3f", ax=ax)

# train_test_split

In [None]:
# Columns available for choosing the features and the target
vg.columns

In [None]:
# Features: the four regional sales columns. Target: the global sales figure.
# NOTE(review): Global_Sales is described as the total worldwide sales, so it
# may be (nearly) the sum of these four features - confirm that the
# near-perfect scores this implies are intended.
feature_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
target_column = 'Global_Sales'
X = vg[feature_columns]
y = vg[target_column]


In [None]:
# How often each distinct Global_Sales value occurs
vg['Global_Sales'].value_counts()

In [None]:
# (rows, columns) of the feature matrix
X.shape

In [None]:
# Shape of the target vector
y.shape

In [None]:
# Split the dataset into a training set and a test set, holding out 25% of
# the rows for testing.
# test_size has to be the float 0.25 for that: scikit-learn reads the previous
# integer 25 as an absolute count, i.e. a test set of just 25 rows.
# NOTE(review): a 25% hold-out is presumably what was intended - confirm.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score


#Decision **tree**

In [None]:
# Fit a depth-limited decision tree on the training split, then predict the
# held-out rows. fit() returns the estimator itself, so the calls are chained.
dtr = DecisionTreeRegressor(
    random_state=42,
    max_depth=4,
    max_features=4,
).fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)

In [None]:
# R^2 of the decision tree on the TRAINING split, printed as a percentage
dtr_train_score = dtr.score(X_train, y_train) * 100
print(f'Score of Decision Tree Regressor Model: {dtr_train_score}%')


In [None]:
# R^2 of the decision tree on the TEST split, printed as a percentage
dtr_test_score = dtr.score(X_test, y_test) * 100
print(f'Score of Decision Tree Regressor Model: {dtr_test_score}%')


In [None]:
# Mean absolute error of the decision tree on the test split.
# mean_absolute_error is documented as (y_true, y_pred); the value is the
# same in either order, but the call now follows the documented one. The
# f-prefix was dropped from a literal that had no placeholders.
dtr_mae = mean_absolute_error(y_test, dtr_pred)
print('Mean Absolute Error: ' + str(dtr_mae) + ' - Decision Tree Regressor Model')

#Random forest

In [None]:
# Fit a depth-limited random forest on the training split, then predict the
# held-out rows. fit() returns the estimator itself, so the calls are chained.
rfg = RandomForestRegressor(
    random_state=0,
    max_depth=5,
    max_features=4,
).fit(X_train, y_train)
rfg_pred = rfg.predict(X_test)

In [None]:
# R^2 of the random forest on the TRAINING split, printed as a percentage
rfg_train_score = rfg.score(X_train, y_train) * 100
print(f'Score of Random Forest Regressor Model: {rfg_train_score}%')

In [None]:
# R^2 of the random forest on the TEST split, printed as a percentage
rfg_test_score = rfg.score(X_test, y_test) * 100
print(f'Score of Random Forest Regressor Model: {rfg_test_score}%')


In [None]:
# Mean absolute error of the random forest on the test split.
# mean_absolute_error is documented as (y_true, y_pred); the value is the
# same in either order, but the call now follows the documented one. The
# f-prefix was dropped from a literal that had no placeholders.
rfg_mae = mean_absolute_error(y_test, rfg_pred)
print('Mean Absolute Error: ' + str(rfg_mae) + ' - Random Forest Regressor')