##  <span style="color:#4d7799"> Video Game Sales </span>
<span style="color:#7fa4c4">This dataset contains a list of video games with sales greater than 100,000 copies. It was generated by a scrape of vgchartz.com.</span>
- <span style="color:#b5515b">Fields include</span>
- <span style="color:#b5515b">Rank - Ranking of overall sales</span>
- <span style="color:#b5515b">Name - The game's name </span>
- <span style="color:#b5515b">Platform - Platform of the game's release (e.g. PC, PS4, etc.)</span>
- <span style="color:#b5515b">Year - Year of the game's release</span>
- <span style="color:#b5515b">Genre - Genre of the game</span>
- <span style="color:#b5515b">Publisher - Publisher of the game</span>
- <span style="color:#b5515b">NA_Sales - Sales in North America (in millions)</span>
- <span style="color:#b5515b">EU_Sales - Sales in Europe (in millions)</span>
- <span style="color:#b5515b">JP_Sales - Sales in Japan (in millions)</span>
- <span style="color:#b5515b">Other_Sales - Sales in the rest of the world (in millions)</span>
- <span style="color:#b5515b">Global_Sales - Total worldwide sales.</span>

There are 16,598 records. 2 records were dropped due to incomplete information.


##### [ReadData](#1)
##### [Clean Data](#2)
##### [DATA ANALYSIS](#3)
##### [Machine Learning](#4)

##  <span style="color:#4d7799"> Importing libraries and Set Options </span>

In [None]:
# imports
! pip install bar_chart_race
! pip install chart_studio
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib.colors import ListedColormap
from matplotlib import cm
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
from wordcloud import WordCloud,STOPWORDS
from PIL import Image
import bar_chart_race as bcr
init_notebook_mode(connected=True)
%matplotlib inline

# Plotly libraries
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.model_selection import train_test_split
from  sklearn import linear_model , metrics
from  sklearn.metrics import accuracy_score, confusion_matrix , recall_score , precision_score , f1_score ,classification_report,plot_confusion_matrix



#options
sns.set(style='darkgrid')
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)
# Notebook option
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# chart_studio options
username ="hanyelshafey"
api_key = ""
tls.set_credentials_file(username,api_key)


##  <span style="color:#4d7799"> Read Data</span>

<a id='1'></a>

In [None]:
df_vg=pd.read_csv('/kaggle/input/videogamesales/vgsales.csv')

## <span style="color:#4d7799"> Assess Data</span>

##  <span style="color:#4d7799"> Visual assess</span>

In [None]:
df_vg.head(30)

In [None]:
df_vg.shape

In [None]:
df_vg.sample(25)

In [None]:
df_vg['id']=df_vg.index

##  <span style="color:#4d7799"> Programmatically assess</span>

In [None]:
df_vg.info()

In [None]:
df_vg.shape

In [None]:
df_vg.describe()

In [None]:
df_vg['Global_Sales'].describe()

In [None]:
df_vg.loc[df_vg['Year']==2020.0]

In [None]:
df_vg.loc[df_vg['Rank']==14779]

In [None]:
# Compare the sum of the four regional columns with the reported Global_Sales
# and print every row (index, computed sum, reported value) where they disagree.
# The sales figures are floats, so an exact `!=` would also flag rows that
# differ only by binary rounding noise; a small absolute tolerance keeps just
# the genuine discrepancies. Rows with a NaN on either side are still reported.
regional_total = df_vg.NA_Sales + df_vg.EU_Sales + df_vg.JP_Sales + df_vg.Other_Sales
mismatch = ~np.isclose(regional_total, df_vg.Global_Sales, rtol=0, atol=1e-6)
for ind in df_vg.index[mismatch]:
    print(ind, regional_total[ind], df_vg.Global_Sales[ind])

In [None]:
df_vg.loc[df_vg['id']==37]

In [None]:
df_vg.isna().sum()

In [None]:
df_vg.loc[df_vg['Year'].isna() ]

In [None]:
df_vg.loc[df_vg['Publisher'].isna() ]

In [None]:
df_vg.Name.value_counts()

In [None]:
df_vg.Platform.value_counts()

In [None]:
df_vg.Genre.value_counts()

In [None]:
df_vg.Publisher.value_counts()

In [None]:
df_vg.Year.min() ,df_vg.Year.max()

In [None]:
df_vg.duplicated().sum()

## <span style="color:#4d7799"> Issues</span>
### <span style="color:#d48e95">Quality</span>

###### 1- 271 values in the Year column are NaN and 58 values in the Publisher column are NaN
###### 2- The Year dtype should be a date type
###### 3-global sales column is not equal to the summation of (NA_Sales	EU_Sales	JP_Sales	Other_Sales)

### <span style="color:#d48e95">Tidiness</span>
##### Columns NA_Sales, EU_Sales, JP_Sales and Other_Sales are named in a confusing way; names should be clear.

# <span style="color:#4d7799"> Cleaning </span>

<a id='2'></a>

### Define
####  1- Drop NaN values
### Code

In [None]:
df_vg=df_vg.dropna(axis=0)

## Test

In [None]:
df_vg.isna().sum()

### Define
##### 2-Change year column to Date type
### Code

In [None]:
df_vg.Year.dtype

In [None]:
# Year holds float values such as 2020.0, which do not match the '%Y%m%d'
# format the previous pd.to_datetime(..., errors='ignore') call asked for, so
# that call in practice handed the column back unchanged (and errors='ignore'
# is deprecated in recent pandas).
# Store the year as a whole number instead: it stays numeric for the group-bys
# and plots further down and survives the CSV round trip. Rows with a missing
# Year were already removed by the dropna step above, so the cast is safe.
df_vg['Year'] = df_vg['Year'].astype(int)

## Test

In [None]:
df_vg.Year

## Define
### 3- Change the global sales column to the summation of (NA_Sales, EU_Sales, JP_Sales, Other_Sales)
### Code

In [None]:
df_vg['Global_Sales']=df_vg.NA_Sales+ df_vg.EU_Sales + df_vg.JP_Sales + df_vg.Other_Sales

## Test

In [None]:
# Re-run the consistency check after the fix: list every row whose regional
# sales do not add up exactly to Global_Sales (index, computed sum, stored value).
# Nothing is printed when all rows agree.
recomputed_total = df_vg.NA_Sales + df_vg.EU_Sales + df_vg.JP_Sales + df_vg.Other_Sales
disagrees = recomputed_total != df_vg.Global_Sales
for ind in df_vg.index:
    if disagrees[ind]:
        print(ind, recomputed_total[ind], df_vg.Global_Sales[ind])

In [None]:
df_vg.loc[df_vg['id']==37 ]

## Define
### Tidiness
### Change the NA_Sales, EU_Sales and JP_Sales column names
### Code

In [None]:
df_vg.columns

In [None]:
df_vg.rename(columns = {'NA_Sales':'NorthAmerica_Sales','EU_Sales':'Europe_Sales','JP_Sales':'Japan_Sales'}, inplace = True)

## Test

In [None]:
# Render the first ten cleaned rows as a Plotly table with a smaller font,
# show it inline, then publish it through chart_studio.
preview_rows = df_vg.head(10)
table = ff.create_table(preview_rows)
for annotation in table.layout.annotations:
    annotation.font.size = 9
iplot(table)
py.plot(table, filename='Data Frame after cleaning', auto_open=True)

In [None]:
df_vg.shape

# <span style="color:#4d7799"> STORING DATA </span>

In [None]:
df_vg.to_csv('Videogame_sales.csv',encoding='utf-8',index=False)

In [None]:
df_Video_Games=pd.read_csv('Videogame_sales.csv')

In [None]:
Video_Games=df_Video_Games.copy()
Video_Games.columns

In [None]:
Video_Games.head()

#  <span style="color:#4d7799"> DATA ANALYSIS  </span>

<a id='3'></a>

### <span style="color:#b5515b"> The world  interest level in video games over time</span>

In [None]:
# Line chart of worldwide interest in video games over time.
interest_V_G = pd.read_csv('/kaggle/input/video-games/multiTimeline.csv')
fig = go.Figure(
    go.Scatter(
        x=interest_V_G['Year'],
        y=interest_V_G['Interest'],
        name='The world interest level in video games over time',
    )
)
#py.plot(fig,filename='The world interest level in video games over time',auto_open=True)
# Export next to the notebook. The previous target was an absolute "G:\..."
# path from one Windows machine, which does not exist where this notebook
# reads its input (/kaggle/input).
fig.write_html("world interest.html")
fig

In [None]:
# Total sales per region (North America, Europe, Japan) as a two-column frame.
summed_regions = ['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales']
data = {
    'Region': summed_regions,
    'Total_sales': [
        Video_Games[region_name].sum(axis=0, skipna=True)
        for region_name in summed_regions
    ],
}
Region_sales_sum = pd.DataFrame(data)
Region_sales_sum

In [None]:
plt.figure(figsize=(10,5))
sns.barplot( x="Region",y="Total_sales",data=Region_sales_sum, palette='viridis')


In [None]:
# Count how often each game title occurs. The call parentheses were missing,
# so the cell displayed the bound method instead of the counts.
Video_Games.Name.value_counts()

In [None]:
rank_500=Video_Games.head(500)

In [None]:
rank_500.columns

In [None]:
sns.catplot(x='Platform',y='NorthAmerica_Sales',kind='boxen',data=rank_500.sort_values('Global_Sales'))

## <span style="color:#b5515b"> Regions sales as part of Global Sales by Year </span>

In [None]:
# Yearly totals of every sales column. The Year index is also copied into a
# regular column so it can be referenced by name when plotting.
yearly_sales_columns = ['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales',
                        'Other_Sales', 'Global_Sales']
G_S_Year = Video_Games.groupby(['Year'])[yearly_sales_columns].sum()
G_S_Year['Year'] = G_S_Year.index
G_S_Year.head()

In [None]:
# One funnel chart per region: that region's yearly sales against the year,
# coloured by the year's global sales.
region = ['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales', 'Other_Sales']
for sales_col in region:
    chart_title = f'{sales_col} as part of Global Sales by Year'
    fig = px.funnel(G_S_Year, y=sales_col, x='Year', color='Global_Sales', title=chart_title)
    # Upload to chart_studio is intentionally disabled here.
    fig.show()


In [None]:
# Per-genre totals of every sales column, with Genre also kept as a regular
# column for plotting.
# Uses the group-by's own .sum() (as the yearly aggregation does) instead of
# .apply(sum): passing Python's builtin `sum` only works because pandas swaps
# it for its own implementation, and recent pandas versions warn that this
# substitution is going away.
G_S_Genre = Video_Games.groupby(['Genre'])[['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales',
                                            'Other_Sales', 'Global_Sales']].sum()
G_S_Genre['Genre'] = G_S_Genre.index
G_S_Genre

## <span style="color:#b5515b"> Regions sales as part of Global Sales by Genre</span>

In [None]:
# One funnel chart per region: that region's sales for each genre, coloured by
# the genre's global sales.
region = ['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales', 'Other_Sales']
for sales_col in region:
    chart_title = f'{sales_col} as part of Global Sales by Genre'
    fig = px.funnel(G_S_Genre, x=sales_col, y='Genre', color='Global_Sales', title=chart_title)
    # Upload to chart_studio is intentionally disabled here.
    fig.show()

In [None]:
Video_Games

In [None]:
Video_Games.head()

In [None]:
# 3D scatter of global sales by platform and release year, coloured by year.
# (A leftover `df = px.data.iris()` sample-data load was removed: it was never used.)
fig = px.scatter_3d(Video_Games, x='Platform', y='Year', z='Global_Sales',
                    color='Year')

fig.show()

In [None]:
# Sum every sales column per genre.
df_genre=Video_Games.groupby(by=['Genre'])[['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales', 'Other_Sales',
       'Global_Sales']].sum()
# Turn the Genre index back into a regular column.
df_genre=df_genre.reset_index()
# One copy of the table per sales column, each sorted by that column in
# descending order. Note that the row order differs between these frames.
df_genre_sales=df_genre.sort_values(by=['Global_Sales'],ascending=False)
genre_NA=df_genre.sort_values(by=['NorthAmerica_Sales'],ascending=False)
genre_EU=df_genre.sort_values(by=['Europe_Sales'],ascending=False)
genre_JP=df_genre.sort_values(by=['Japan_Sales'],ascending=False)
genre_Other=df_genre.sort_values(by=['Other_Sales'],ascending=False)
# Display the unsorted per-genre totals.
df_genre

In [None]:
# Bar chart of genre sales per region, with buttons to switch between traces.
fig = go.Figure()

# (sorted frame, sales column, legend name) for each trace.
# Each trace takes BOTH its genre labels and its sales figures from the same
# sorted frame. Previously every trace used the genre order of the
# global-sales ranking for x while y came from a differently sorted frame, so
# sales were drawn against the wrong genre.
genre_traces = [
    (genre_NA, 'NorthAmerica_Sales', 'The most common Genre in NorthAmerica'),
    (genre_EU, 'Europe_Sales', 'The most common Genre in Europe'),
    (genre_JP, 'Japan_Sales', 'The most common Genre in japan'),
    (genre_Other, 'Other_Sales', 'The most common Genre in resst of world'),
    (df_genre_sales, 'Global_Sales', 'The most common Genre globally'),
]
for frame, sales_column, trace_name in genre_traces:
    fig.add_trace(go.Bar(x=frame['Genre'], y=frame[sales_column], name=trace_name))

# Visibility masks carry exactly one flag per trace (they used to carry eight
# flags for five traces). The regional buttons show only their own trace; the
# last button keeps its previous behaviour of showing every trace.
trace_count = len(genre_traces)
regional_labels = ['North America sales', 'Europe sales', 'Japan  sales', 'Other sales']
buttons = [
    dict(label=button_label, method='update',
         args=[{'visible': [position == shown for position in range(trace_count)]}])
    for shown, button_label in enumerate(regional_labels)
]
buttons.append(dict(label='Global Sales', method='update',
                    args=[{'visible': [True] * trace_count}]))

fig.update_layout(updatemenus=[dict(type='buttons', direction='right', active=0, x=1, y=1.2,
                                    buttons=buttons)])
fig.update_layout(title_text = 'Most common Genre by Sales22',barmode = 'stack')


## <span style="color:#b5515b">Which genre game has sold the most in a single year?</span>

In [None]:
# Global sales summed per genre, as a two-column frame ordered from the
# best-selling genre downwards.
df_genre = (
    Video_Games.groupby(by=['Genre'])['Global_Sales']
    .sum()
    .reset_index()
    .sort_values(by=['Global_Sales'], ascending=False)
)

In [None]:
# Bar chart of total global sales per genre, drawn on an explicit 15x8 axes.
fig1,axes1=plt.subplots(figsize=(15,8))
# df_genre already holds one summed row per genre, so estimator=np.sum is
# applied to a single value per bar.
sns.barplot(x='Genre',y='Global_Sales',data=df_genre,ax=axes1,estimator=np.sum,errcolor='r',capsize=0.2,errwidth=0.75,palette='Spectral')
axes1.set_xlabel('Genre')
axes1.set_ylabel('Global_Sales')
axes1.set_title('Global_Sales VS Genre')
plt.show()


> ### The results 
>- #### <span style="color:red"> ***Action***, Sports and Shooter are the best-selling genres globally. </span>
>

### The Platforms percentage

In [None]:
# Number of games per platform: keep the 12 most frequent platforms and fold
# all remaining ones into a single 'Others' entry.
Platforms_value = Video_Games.Platform.value_counts()
other = pd.Series({'Others': Platforms_value.iloc[12:].sum()})
# pd.concat replaces Series.append, which was removed in pandas 2.0.
Platforms_value1 = pd.concat([Platforms_value.iloc[:12], other])

In [None]:
Platforms_value1.index

In [None]:
# Pie chart of each platform's share of the catalogue.
# Pull only the first slice out of the pie. The offsets are derived from the
# data so the chart still works when there are not exactly 13 slices.
slice_offsets = [0.1 if position == 0 else 0 for position in range(len(Platforms_value1))]
plt.pie(Platforms_value1, shadow=True,
        labels=Platforms_value1.index,
        explode=slice_offsets,
        autopct='%.2f%%', startangle=0)
# A single title: the second title call ('share of companies') used to
# overwrite this one with an unrelated label.
plt.title('Percentage of Platforms')
# NOTE(review): switching the style here, after the pie has been drawn,
# presumably only affects figures created later - confirm that is intended.
plt.style.use('ggplot')
plt.axis('equal')
plt.legend(loc=3,bbox_to_anchor=(1,0.4))



In [None]:
Video_Games.head()

# 1.Numerical Data Ploting

In [None]:
Video_Games.head()

## <span style="color:#b5515b">What year did the games start and when did they stop, depending on the genre?</span>

In [None]:
sns.set_style('whitegrid')
sns.relplot(y='Genre',x='Year',data=Video_Games,aspect=2.5)

> ### The results 
>- <span style="color:red">Sports games ran from 1980 to 2016 and then stopped. </span>
>- <span style="color:red">Simulation games stopped at the same time as Sports games and appear again in 2020. </span>
>- <span style="color:red">Strategy games ran from 1991 to 2016. </span>


## <span style="color:#b5515b">What is the most popular game ?</span>

In [None]:
plt.figure(figsize=(15, 12))
plt.yticks(rotation=45)
sns.countplot(y="Genre", data=Video_Games , order = Video_Games['Genre'].value_counts().index, palette="rocket")

> ### The results 
>- <span style="color:red">Action and Sports games are more popular than the other genres.</span>



In [None]:
Video_Games['Platform'].value_counts()

##  <span style="color:#b5515b">Which platform has the best-selling individual game globally?</span>

In [None]:
"""strip", "swarm", "box", "violin",
    "boxen", "point", "bar", or "count"""

In [None]:
# Total global sales per platform as a bar chart.
# (The loop that preceded this call was removed: range(1000, 290, 100) is
# empty, so its body - a print and an unused assignment - never ran.)
sns.catplot(x='Platform', y='Global_Sales',kind='bar',data=Video_Games,estimator=np.sum,palette='Spectral',aspect =3.5,height=20)


In [None]:
sns.relplot(x='Platform',y='Global_Sales',data = Video_Games ,aspect=3.5)

> ### The results 
>- <span style="color:red">Wii Sports is the best-selling individual game globally.</span>

In [None]:
sns.relplot(x='Platform',y='Global_Sales',data= Video_Games,kind='line',hue="Genre",estimator=None,aspect=3.5,height=8)

> ### The results 
>- <span style="color:red">Wii Sports, in the Sports genre, is the best-selling individual game globally.</span>

In [None]:
Video_Games.columns

In [None]:
# Sum every sales column per release year and keep Year as a regular column.
G_S_Year=Video_Games.groupby(by=['Year'])[['NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales', 'Other_Sales','Global_Sales']].sum()
G_S_Year=G_S_Year.reset_index()
# One copy of the yearly table per sales column, each sorted by that column in
# descending order (best year first).
S_Gl_Year=G_S_Year.sort_values(by=['Global_Sales'],ascending=False)
S_NA_Y=G_S_Year.sort_values(by=['NorthAmerica_Sales'],ascending=False)
S_EU_Y=G_S_Year.sort_values(by=['Europe_Sales'],ascending=False)
S_JP_Y=G_S_Year.sort_values(by=['Japan_Sales'],ascending=False)
S_other_Y=G_S_Year.sort_values(by=['Other_Sales'],ascending=False)
# Display the unsorted yearly totals.
G_S_Year

In [None]:
f = go.FigureWidget()
f.add_scatter(x=G_S_Year['Year'],y=G_S_Year['NorthAmerica_Sales'],name='NorthAmerica_Sales')
f.add_scatter(x=G_S_Year['Year'],y=G_S_Year['Europe_Sales'],name='Europe_Sales')
f.add_scatter(x=G_S_Year['Year'],y=G_S_Year['Japan_Sales'],name='Japan_Sales')
f.add_scatter(x=G_S_Year['Year'],y=G_S_Year['Other_Sales'],name='Other_Sales')
f.layout.title = 'Regions sales as part of Global Sales by Year'
f

In [None]:
f.add_scatter(x=G_S_Year['Year'],y=G_S_Year['Global_Sales'],name='Global_Sales')
f

In [None]:
Video_Games.head()

##  <span style="color:#b5515b">Which region has the highest sales in each genre?</span>

In [None]:
# Regional sales summed per genre: one row per genre, one column per region.
# This table feeds the heatmap.
genre_region_columns = ['Genre', 'NorthAmerica_Sales', 'Europe_Sales', 'Japan_Sales', 'Other_Sales']
Genre_Sales = Video_Games[genre_region_columns]
hex_map = Genre_Sales.groupby(by=['Genre']).sum()
hex_map

In [None]:
# Heatmap of regional sales per genre.
# (A bare sns.color_palette("Set2") call was dropped: its return value was
# discarded and the heatmap selects its colours through `cmap`.)
fig1, axes1 = plt.subplots(figsize=(20, 8))
# Draw on the axes created above explicitly rather than relying on it being
# the current axes.
sns.heatmap(data=hex_map, vmin=100, vmax=630, annot=True, linewidth=0.3, cbar=True,
            fmt='.1f', cmap="bone_r", ax=axes1)

> ### The results 
>- <span style="color:red">North America is the most popular region for video games without a competitor  .</span>.

##  <span style="color:#b5515b">Years,Global Sales and Publishers of Top 1000 Games</span>

In [None]:
Video_Games.columns

In [None]:
# First 1000 rows of the table. An explicit copy is taken because columns are
# added to / replaced on df1000 below, which must not write through to (or
# warn about) Video_Games.
df1000 = Video_Games.head(1000).copy()

In [None]:
df1000.head()

In [None]:
# Min-max scale Global_Sales into the 0..1 range and store it as "normsales".
lowest_sales = np.min(df1000["Global_Sales"])
highest_sales = np.max(df1000["Global_Sales"])
df1000["normsales"] = (df1000["Global_Sales"] - lowest_sales) / (highest_sales - lowest_sales)

In [None]:
# 3D scatter of the df1000 games: release year (x), publisher (y) and
# normalised global sales (z). Marker size follows North America sales and
# colour follows the normalised global sales.

# Build the hover text from string views of Rank and Global_Sales. df1000
# itself keeps its numeric columns, so the normalisation cell can be re-run
# after this one.
hover_text = ("Name:" + df1000.Name + "," + " Rank:" + df1000.Rank.astype("str")
              + " Global Sales: " + df1000["Global_Sales"].astype("str") + " millions")

trace1 = go.Scatter3d(
    y=df1000["Publisher"],
    x=df1000["Year"],
    z=df1000["normsales"],
    text=hover_text,
    mode='markers',
    marker=dict(
        size=df1000['NorthAmerica_Sales'],
        color = df1000['normsales'],
        colorscale = "Rainbow",
        colorbar = dict(title = 'Global Sales'),
        line=dict(color='rgb(140, 140, 170)'),
    )
)

data=[trace1]

# Titles use the nested title=dict(text=..., font=...) form; the flat
# `titlefont` attribute used before is deprecated in Plotly.
title_colour = 'rgb(20, 24, 54)'
layout = go.Layout(
    height=800, width=800,
    title=dict(text='Top 1000 Video Games, Release Years, Publishers and Sales',
               font=dict(color=title_colour)),
    scene=dict(
        xaxis=dict(title=dict(text='Year', font=dict(color=title_colour))),
        yaxis=dict(title=dict(text='Publisher', font=dict(color=title_colour))),
        zaxis=dict(title=dict(text='Global Sales', font=dict(color=title_colour))),
        bgcolor='whitesmoke',
    ),
)

fig=go.Figure(data=data, layout=layout)
py.plot(fig,filename='Years,Global Sales and Publishers of Top 1000 Games',auto_open=True)
iplot(fig)

In [None]:
# Word cloud of all game titles, drawn inside the shape of a mask image.
# Load the mask picture as an array for WordCloud's `mask` argument.
wave_mask= np.array(Image.open("/kaggle/input/controller/controller1.png"))
# Start from the default stop words and additionally ignore the sequel
# numerals "II" and "III".
stopwords = set(STOPWORDS)
stopwords.update(["II", "III"])
plt.subplots(figsize=(15,15))
# All titles are joined into one space-separated string before generation.
wordcloud = WordCloud(mask=wave_mask,background_color="lavenderblush",colormap="hsv" ,contour_width=2, contour_color="black",
                      width=950,stopwords=stopwords,
                          height=950
                         ).generate(" ".join(Video_Games.Name))

plt.imshow(wordcloud ,interpolation='bilinear')
plt.axis('off')
# Save the figure to graph.png before it is shown.
plt.savefig('graph.png')

plt.show()

In [None]:
Video_Games.drop(['id'], axis=1)

In [None]:
# Correlation heatmap of the numeric columns (the helper 'id' column excluded).
plt.figure(figsize=(13,10))
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops text
# columns such as Name or Publisher silently in .corr() and raises instead.
numeric_columns = Video_Games.drop(['id'], axis=1).select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), cmap = "bone_r", annot=True, linewidth=3)

> ### The results 
>- <span style="color:red">North America sales have the strongest influence on global sales, with no close competitor.</span>

##  <span style="color:#b5515b">Machine Learning Models</span>

<a id='4'></a>

In [None]:
Video_Games.columns

In [None]:
# Using labelEncoder convert categorical data into numerical data
columns=[ 'Platform',  'Genre', 'Publisher']
number=LabelEncoder()
for i in columns:
    Video_Games[f'{i}']=number.fit_transform(Video_Games[f'{i}'].astype('str'))

In [None]:
Video_Games1=Video_Games.drop(['Rank',"Name", 'Year','id'],axis=1)
Video_Games3=Video_Games.drop(['NorthAmerica_Sales','Europe_Sales', 'Japan_Sales', 'Other_Sales','id'],axis=1)

In [None]:
Video_Games1

In [None]:
columns=['Platform', 'Genre', 'Publisher', 'NorthAmerica_Sales','Europe_Sales']

In [None]:
# Target vector: the Global_Sales column as a NumPy array.
labels=Video_Games3['Global_Sales'].values
# Feature matrix: the columns listed in `columns`, taken from Video_Games1.
# NOTE(review): `columns` includes NorthAmerica_Sales and Europe_Sales, which
# are two of the addends Global_Sales was rebuilt from during cleaning, so the
# target is partly contained in the features - confirm this is intended.
features=Video_Games1[list(columns)].values
# Conventional short aliases used by the split and the models below.
x=features
y=labels

In [None]:
y

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported below, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [None]:
print(x_train.size,x_test.size,y_train.size,y_test.size)

##  <span style="color:#b5515b">Scaling</span>

In [None]:
scaler=StandardScaler()

In [None]:
# Fit only on training data 
scaler.fit(x_train)
x_train=scaler.transform(x_train)

In [None]:
# apply same transformation to test data
x_test = scaler.transform(x_test)

In [None]:
x_test.size,

In [None]:
y_test.size

##  <span style="color:#b5515b">Linear Regression Model</span>

In [None]:
#training model
regr = linear_model.LinearRegression()
regr.fit(x_train,y_train)
accuracy=regr.score(x_train,y_train)
y_pred=regr.predict(x_test)
print('Linear Regression Accuracy in the training data :' , accuracy*100 , "%")
Test_accuracy=regr.score(x_test,y_test)
print('Linear Regression Accuracy in the test data :' , Test_accuracy*100 , "%")

In [None]:
# Turn the regression into a two-class problem: values above the cutoff become
# class 1, everything else class 0 (stored with y_pred's dtype), then score
# the agreement between predicted and true classes.
cutoff = 0.7
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)
accuracy_score(y_test_classes, y_pred_classes)

In [None]:
confusion_matrix(y_test_classes,y_pred_classes)

In [None]:
# Classification metrics for the thresholded predictions.
# The individual scores are printed: only the last expression of a cell is
# displayed, so the bare calls used to be computed and thrown away.
print('Recall :', recall_score(y_test_classes, y_pred_classes))
print('Precision per class :', precision_score(y_test_classes, y_pred_classes, average=None))
print('F1 score per class :', f1_score(y_test_classes, y_pred_classes, average=None))
print(classification_report(y_test_classes,y_pred_classes))

##  <span style="color:#b5515b">GradientBoostingRegressor Model</span>

In [None]:
# Fit a gradient-boosting regressor with default settings and report its score
# on the training and the test split.
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor()
GBR.fit(x_train, y_train)
GBR_accuracy = GBR.score(x_train, y_train)
y_pred = GBR.predict(x_test)
# The training line now ends in a plain "%" like every other model's output;
# it used to print a stray "%%!!".
print('GradientBoostingRegressor Accuracy in the training data :' , GBR_accuracy*100 , "%")
GBR_Test_accuracy = GBR.score(x_test, y_test)
print('GradientBoostingRegressor Accuracy in the test data :' , GBR_Test_accuracy*100 , "%")

##  <span style="color:#b5515b">DecisionTreeRegressor Model</span>

In [None]:
# Fit a decision-tree regressor with default settings and report its score on
# the training and the test split.
from sklearn.tree import DecisionTreeRegressor
DT=DecisionTreeRegressor()
DT.fit(x_train,y_train)
DT_accuracy=DT.score(x_train,y_train)
# Test-set predictions; the thresholding cell that follows reads y_pred.
y_pred=DT.predict(x_test)
print('DecisionTree Accuracy in the training data :' , DT_accuracy*100 , "%")
DT_Test_accuracy=DT.score(x_test,y_test)
print('DecisionTree Accuracy in the test data :' , DT_Test_accuracy*100 , "%")

In [None]:
# Same two-class thresholding as for the linear model, applied to the
# decision-tree predictions: above the cutoff is class 1, otherwise class 0.
cutoff = 0.7
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)
accuracy_score(y_test_classes, y_pred_classes)

In [None]:
confusion_matrix(y_test_classes,y_pred_classes)

In [None]:
# Classification metrics for the thresholded decision-tree predictions.
# The individual scores are printed: only the last expression of a cell is
# displayed, so the bare calls used to be computed and thrown away.
print('Recall :', recall_score(y_test_classes, y_pred_classes))
print('Precision per class :', precision_score(y_test_classes, y_pred_classes, average=None))
print('F1 score per class :', f1_score(y_test_classes, y_pred_classes, average=None))
print(classification_report(y_test_classes,y_pred_classes))

##  <span style="color:#b5515b">RandomForestRegressor Model</span>

In [None]:
# Fit a random-forest regressor with default settings and report its score on
# the training and the test split. No test-set predictions are stored here.
from sklearn.ensemble import RandomForestRegressor
RF=RandomForestRegressor()
RF.fit(x_train,y_train)
RF_accuracy=RF.score(x_train,y_train)
print('RandomForest Accuracy in the training data :' , RF_accuracy*100 , "%")
RF_Test_accuracy=RF.score(x_test,y_test)
print('RandomForest Accuracy in the test data :' , RF_Test_accuracy*100 , "%")

##  <span style="color:#b5515b">SVR Model</span>

In [None]:
# Fit a support-vector regressor with default settings and report its score on
# the training and the test split.
from sklearn.svm import SVR
# The fitted estimator gets its own name; `SVR = SVR()` used to replace the
# imported class with an instance.
svr_model = SVR()
svr_model.fit(x_train, y_train)
y_pred = svr_model.predict(x_test)
SVR_accuracy = svr_model.score(x_train, y_train)
print('SVR Accuracy in the training data :' , SVR_accuracy*100 , "%")
SVR_Test_accuracy = svr_model.score(x_test, y_test)
print('SVR Accuracy in the test data :' , SVR_Test_accuracy*100 , "%")

##  <span style="color:#b5515b">Machine learning methods visualization</span>

In [None]:
data = {'Model Name':['Linear Regression','GradientBoosting', 'DecisionTree', 'RandomForest', 'SVR'], 
        'training data Accuracy':[accuracy,GBR_accuracy, DT_accuracy, RF_accuracy, SVR_accuracy], 
        'testing data Accuracy':[Test_accuracy,GBR_Test_accuracy, DT_Test_accuracy, RF_Test_accuracy, SVR_Test_accuracy]}
Ms=pd.DataFrame(data)
Ms

In [None]:
# Render the model-score summary as a Plotly table with a smaller font, show
# it inline, then publish it through chart_studio.
table = ff.create_table(Ms)
for annotation in table.layout.annotations:
    annotation.font.size = 9
iplot(table)
py.plot(table, filename='Models Accuracy', auto_open=True)

In [None]:
fig = px.bar(Ms, x='Model Name', y='training data Accuracy',color='testing data Accuracy',color_continuous_scale='blues')

fig.show()
py.plot(fig,filename='Models accuracy',auto_open=True)

# <span style="color:#AE5FE8">Thank You</span>

In [None]:
sns.palplot(sns.color_palette('hls',8))