**This dataset contains a list of video games with sales greater than 100,000 copies. It was generated by a scrape of [vgchartz.com](https://www.vgchartz.com).**

**Fields include**

**Rank - Ranking of overall sales**

**Name - The game's name**

**Platform - Platform of the game's release (e.g. PC, PS4, etc.)**

**Year - Year of the game's release**

**Genre - Genre of the game**

**Publisher - Publisher of the game**

**NA_Sales - Sales in North America (in millions)**

**EU_Sales - Sales in Europe (in millions)**

**JP_Sales - Sales in Japan (in millions)**

**Other_Sales - Sales in the rest of the world (in millions)**

**Global_Sales - Total worldwide sales (in millions)**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme and set the default figure size (in inches) for the plots below.
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Loading DataSet

In [None]:
# Load the raw sales table from CSV.
# NOTE(review): the relative path looks like the Kaggle input layout — adjust it when running elsewhere.
raw_data=pd.read_csv('../input/videogamesales/vgsales.csv')

In [None]:
# Preview the first 20 rows of the raw table.
raw_data.head(20)

In [None]:
# Preview the last 30 rows of the raw table.
raw_data.tail(30)

In [None]:
# Work on a copy so that raw_data is not modified by the cleaning steps applied to `data`.
data=raw_data.copy()

In [None]:
# Bar chart of Global_Sales for the first 20 rows of the table.
# NOTE(review): the file appears to be ordered by Rank, which would make these
# the 20 best-selling entries — confirm.
ax = data['Global_Sales'].head(20).plot(kind='bar')
# A figure should be readable on its own, so label both axes and add a title.
ax.set_xlabel('Row index')
ax.set_ylabel('Global sales (in millions)')
ax.set_title('Global sales of the first 20 rows');

## ***Checking year vs global sales :-***

In [None]:

# Two stacked panels built from the first 1000 rows of the table only.
top_rows = data.iloc[:1000]
fig, (ax1, ax2) = plt.subplots(2, 1)

# Keep ax1/ax2 bound to the Axes objects: assigning the return value of
# scatter()/bar() back to them would replace the Axes with the drawn artists
# and make any later labelling impossible.
ax1.scatter(top_rows['Year'], top_rows['Global_Sales'])
ax1.set_xlabel('Year')
ax1.set_ylabel('Global sales (in millions)')
ax1.set_title('Global sales by release year (first 1000 rows)')

# Total the sales per genre before plotting. Passing the raw rows to bar()
# draws one bar per row at the same x position, so bars of the same genre
# hide each other and the panel does not show any meaningful total.
genre_totals = top_rows.groupby('Genre')['Global_Sales'].sum()
ax2.bar(genre_totals.index, genre_totals.values)
ax2.set_xlabel('Genre')
ax2.set_ylabel('Global sales (in millions)')
ax2.set_title('Total global sales by genre (first 1000 rows)')

# Prevent the titles and axis labels of the two panels from overlapping.
fig.tight_layout();




In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Median of the Year column (missing values are skipped by median()).
data['Year'].median()

## ***FILLING THE YEAR COLUMN BY MEDIAN:-***

In [None]:
# Impute missing release years with the column median.
year_median = data['Year'].median()
data['Year'] = data['Year'].fillna(year_median)

In [None]:
# Number of rows per publisher, most frequent first.
data['Publisher'].value_counts()

In [None]:
# Fill missing Publisher values with 'Electronic Arts'.
# NOTE(review): presumably chosen because it is the most frequent publisher in
# the value_counts() output — confirm before relying on this imputation.
data['Publisher']=data['Publisher'].fillna('Electronic Arts')

In [None]:
# Re-count missing values per column to verify the Year and Publisher fills.
data.isna().sum()

In [None]:
# Show the first five Publisher values.
data['Publisher'].head()

## **---Visualising the Global Sales v/s Name---**

In [None]:
# Top 30 titles by total global sales; rows sharing the same Name are summed.
x1 = data.groupby("Name").Global_Sales.sum().sort_values(ascending=False).head(30)
plt.figure(figsize=(7, 10))
sns.set_style("whitegrid")
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts x/y
# positionally (0.11 only emitted a FutureWarning), so the positional call fails
# on current releases.
ax = sns.barplot(x=x1.values, y=x1.index)
ax.set_xlabel("global sales(in million)");

## **Visualising the Relationship between Global Sales V/S Genre---**

In [None]:
# Total global sales per genre, largest first (at most 20 genres are kept).
x2 = data.groupby("Genre").Global_Sales.sum().sort_values(ascending=False).head(20)
plt.figure(figsize=(5, 10))
sns.set_style("whitegrid")
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts x/y
# positionally (0.11 only emitted a FutureWarning), so the positional call fails
# on current releases.
ax = sns.barplot(x=x2.values, y=x2.index)
ax.set_xlabel("global sales(in million)")
plt.xticks(rotation=90);

In [None]:
#Checking Correlation:
# Restrict the frame to numeric columns first: with text columns present
# (Name, Platform, Genre, Publisher) DataFrame.corr() raises on pandas >= 2.0,
# where non-numeric columns are no longer dropped silently.
co_rel = data.select_dtypes(include='number').corr()
top_feature = co_rel.index
plt.figure(figsize=(10, 10))
# co_rel already is the correlation matrix of exactly the columns listed in
# top_feature, so plot it directly instead of computing the correlation twice.
sns.heatmap(co_rel, annot=True, linewidths=.5)

plt.show()

In [None]:
# Predictors: the four regional sales columns. Target: worldwide sales.
# NOTE(review): Global_Sales is described as the total worldwide sales, so it is
# presumably (close to) the sum of these four columns; a regression on them will
# then look near-perfect by construction — confirm this is intended.
lables_to_include = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
x = data.loc[:, lables_to_include]
y = data.loc[:, 'Global_Sales']

In [None]:
# Shape of the target (Global_Sales).
y.shape

In [None]:
# Shape of the feature table (rows, regional sales columns).
x.shape

In [None]:
#Now Performing Regression:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Shapes of the training features and the training target.
X_train.shape,y_train.shape

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted on the training split.
# The fit call stays the last expression so the fitted estimator is displayed.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out rows and display the predictions.
y_pred=reg.predict(X_test)
y_pred

In [None]:
#Analyzing the Errors and Accuracy
from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination of the linear model on the held-out rows.
r2_score_pred = r2_score(y_true=y_test, y_pred=y_pred)
# NOTE: despite its name, rms_value holds the mean squared error (not its
# square root), which is what the printed label says.
rms_value = mean_squared_error(y_true=y_test, y_pred=y_pred)

print('R2 score is:', r2_score_pred)
print('mean_squared_error:', rms_value)


## **Now, let's try other regression models:**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
#Creating Model Dictionary
# Maps a display name to an unfitted estimator; all of them are fitted and
# scored on the same train/test split.
# RandomForestRegressor is seeded (same seed as the train/test split) so that
# its score is reproducible across runs; unseeded, it changes on every run.
model_list={"LinearRegression":LinearRegression(),
             "SVM":SVR(),
           "RandomForestRegressor":RandomForestRegressor(random_state=42)
           }

In [None]:
# Fit every candidate on the training split and record its test-set score
# (for regressors, .score() is the R^2 coefficient of determination).
result = dict()

for name, model in model_list.items():
    fitted = model.fit(X_train, y_train)
    result[name] = fitted.score(X_test, y_test)

result
    

## **Visualizing the Models Performance:-**

In [None]:
# One row per model. The values are the R^2 scores returned by each regressor's
# .score(), not classification accuracy; the column keeps the name 'Accuracy'
# so that existing references to result_df['Accuracy'] continue to work.
# The dict views are converted to lists so the constructor receives plain sequences.
result_df = pd.DataFrame({'Accuracy': list(result.values())},
                         index=list(result.keys()))

# Label the chart with the metric actually plotted and hide the legend, which
# would otherwise show the misleading column name.
ax = result_df.plot.bar(legend=False)
ax.set_xlabel('Model')
ax.set_ylabel('R^2 score on the test set')
ax.set_title('Model comparison');