<h3 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Libraries and Utilities</h3>



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from wordcloud import WordCloud,STOPWORDS
import spacy as sp
import string
import nltk
import re
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
nltk.download('vader_lexicon')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.manifold import Isomap,TSNE
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nlps = sp.load('en')
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from keras.layers import Dense,LSTM,Input,Dropout,SimpleRNN
from keras import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from tqdm.notebook import tqdm
from tqdm.keras import TqdmCallback

%matplotlib inline


# Notebook-wide plotting defaults: every matplotlib figure is created at
# 18x11 inches, and seaborn uses the 'paper' context with fonts scaled 2x.
plt.rc('figure',figsize=(18,11))
sns.set_context('paper',font_scale=2)


<h3 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Initial Data Assessment</h3>



In [None]:
# Load the GBA games table from the Kaggle input directory and preview
# its first three rows.
GBA_CSV_PATH = '/kaggle/input/1500-gameboy-advanced-games-information/GBA_Games.csv'
gba_df = pd.read_csv(GBA_CSV_PATH)
gba_df.head(3)

In [None]:
# Number of missing values per column, largest first, as a one-column frame
# so it can be rendered as an annotated heatmap.
missing = (
    gba_df.isna()
    .sum()
    .sort_values(ascending=False)
    .to_frame(name='count')
)
ax = sns.heatmap(missing, annot=True, fmt='d', cmap='vlag')
ax.set_title('Missing Value Count by Column')
plt.show()

<h3 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Exploratory Data Analysis</h3>



In [None]:
# Count releases per region.
# 'Region Released' holds a comma-separated list of regions per game: split it,
# give every region its own row, trim surrounding whitespace and count.
# Games without region information stay NaN through split/explode and are
# removed by dropna() before counting.
Region_Related = (
    gba_df['Region Released']
    .str.split(',')
    .explode()
    .str.strip()
    .dropna()
    .value_counts()
)
pal = sns.color_palette("coolwarm", len(Region_Related))[::-1]
ax = sns.barplot(x=Region_Related.index,y=Region_Related.values,palette=pal)
ax.set_title('Distribution of Releases by Region')
# Explicit axis labels so the figure can be read on its own.
ax.set_xlabel('Region')
ax.set_ylabel('Number of releases')
plt.show()

**Observation**: The PAL region is a television broadcasting territory that covers most of Asia, Africa, Europe, South America, and Oceania.
It seems that the PAL region has slightly more games than the next "giant", which is JAP - the Japanese region.

In [None]:
# Bar chart of the 30 developers credited with the most games.
Developers = gba_df['Developers'].value_counts().head(30)
pal = sns.color_palette("coolwarm", len(Developers))
# The palette is reversed before being mapped onto the bars (left to right).
ax = sns.barplot(
    x=Developers.index,
    y=Developers.values,
    palette=list(reversed(pal)),
)
ax.set_title('Distribution of Game Releases by Top 30 Developer')
# Developer names are long; rotate them so they do not overlap.
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90)
plt.show()

**Observation**: As seen in the plot above, Konami is by far the most prolific game developer, holding first place with a staggering 48 games released!

In [None]:
# Bar chart of the 30 publishers with the most games.
# NOTE: this cell reuses the name `Region_Related`; from here on it holds
# publisher counts, not region counts.
publisher_names = []
for entry in gba_df['Publishers'].str.split('/').to_list():
    # A float entry is a missing (NaN) publisher field - nothing to split.
    if type(entry) == float:
        continue
    # A game can list several publishers separated by '/'.
    publisher_names.extend(name.strip() for name in entry)

Region_Related = pd.Series(publisher_names).value_counts()[:30]
pal = sns.color_palette("coolwarm", len(Region_Related))
ax = sns.barplot(
    x=Region_Related.index,
    y=Region_Related.values,
    palette=list(reversed(pal)),
)
ax.set_title('Games Releases of Top 30 Publishers')
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90)
plt.show()

**Observation**:  Konami not only leads the game development category but also rules the market as the number one game publisher! We can see that 1 in 10 Game Boy Advance games produced between 2002 and 2008 was published by Konami!

In [None]:
def _latest_release_year(raw):
    """Return the latest 4-digit year starting with '2' found in `raw`.

    Returns the string 'nan' when the cell is missing or contains no such
    year, so that those rows can be dropped after counting.
    """
    if pd.isna(raw):
        return 'nan'
    # `default` keeps max() from raising ValueError when nothing matches.
    return max(re.findall(r'2[0-9]{3}', str(raw)), default='nan')


# Number of games per (latest) release year; rows without a usable year are
# counted under 'nan' and then removed. errors='ignore' keeps the drop from
# raising KeyError when every row has a year.
release_yeras = (
    gba_df['Year Released']
    .apply(_latest_release_year)
    .value_counts()
    .drop(index='nan', errors='ignore')
)
pal = sns.color_palette("coolwarm", len(release_yeras))
plt.title('Yearly Amount of Games Released')
ax = sns.barplot(x=release_yeras.index,y=release_yeras.values,palette=pal[::-1])
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
plt.show()


**Observation**:  The Game Boy Advance was first released on June 11, 2001, and, not so surprisingly, we see that the peak of game releases was in 2002, followed by a steady decline in game production until 2008, when the last documented game was released.

In [None]:
def _multiplayer_flag(raw):
    """Reduce a raw 'Multiplayer' cell to 'Yes', 'No' or 'nan'.

    The first occurrence of 'No' or 'Yes' in the text wins. Missing cells and
    cells that mention neither word map to 'nan' instead of raising.
    """
    if pd.isna(raw):
        return 'nan'
    match = re.search(r'No|Yes', str(raw))
    return match.group(0) if match else 'nan'


multiplayer_status = gba_df['Multiplayer'].apply(_multiplayer_flag)
# Count once and reuse the result for both the slice sizes and their labels.
multiplayer_counts = multiplayer_status.value_counts()
ex.pie(values=multiplayer_counts.values,names=multiplayer_counts.index,title='Proportion of Multiplayer Statuses')

<h3 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Data Preprocessing</h3>



In [None]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Region tags that leaked into the text during scraping. They are listed in
# lower case because the text is lower-cased before they are removed; the
# previous upper-case str.replace('PAL'/'JP') could never match.
# Trade-off: a genuine standalone word "pal" is dropped as well.
SCRAPE_NOISE_WORDS = {'pal', 'jp'}

# Preprocessing Titles
# Lower-case each title and keep only alphabetic words longer than 2 letters.
# Missing titles become empty strings so `titles` stays aligned row-for-row
# with gba_df (later cells rely on that alignment).
titles         = gba_df.Titles.str.lower().fillna('').apply(lambda x: ' '.join([i for i in re.findall(r'[a-zA-Z]+',x) if len(i) > 2]))
# All title words in order, without the leaked region tags. split() without an
# argument avoids the empty-string tokens that split(' ') produces for titles
# that were reduced to an empty string.
title_words    = [w for w in ' '.join(titles).split() if w not in SCRAPE_NOISE_WORDS]
# Single text block built from the cleaned words
titles_text    = ' '.join(title_words)
# Frequency table of stemmed + lemmatized words, stop words excluded
title_tokens   = nltk.FreqDist([lemmatizer.lemmatize(stemmer.stem(i)) for i in title_words if i not in STOPWORDS])

In [None]:
# Preprocessing Gameplay Descriptions
# Region tags that leaked into the text during scraping. They are listed in
# lower case because the text is lower-cased before they are removed; the
# previous upper-case str.replace('PAL'/'JP') could never match.
# Trade-off: a genuine standalone word "pal" is dropped as well.
SCRAPE_NOISE_WORDS = {'pal', 'jp'}

# Lower-case each available description and keep only alphabetic words longer
# than 2 letters. Rows without a gameplay description are excluded.
gameplay        = gba_df.Gameplay[gba_df.Gameplay.notna()].str.lower().apply(lambda x: ' '.join([i for i in re.findall(r'[a-zA-Z]+',x) if len(i) > 2]))
# All description words in order, without the leaked region tags. split()
# without an argument avoids empty-string tokens.
gameplay_words  = [w for w in ' '.join(gameplay).split() if w not in SCRAPE_NOISE_WORDS]
# Single text block built from the cleaned words
gameplay_text   = ' '.join(gameplay_words)
# Frequency table of stemmed + lemmatized words, stop words excluded
gameplay_tokens = nltk.FreqDist([lemmatizer.lemmatize(stemmer.stem(i)) for i in gameplay_words if i not in STOPWORDS])

In [None]:
# Preprocessing Plot Descriptions
# Region tags that leaked into the text during scraping. They are listed in
# lower case because the text is lower-cased before they are removed; the
# previous upper-case str.replace('PAL'/'JP') could never match.
# Trade-off: a genuine standalone word "pal" is dropped as well.
SCRAPE_NOISE_WORDS = {'pal', 'jp'}

# Lower-case each available plot and keep only alphabetic words longer than
# 2 letters. Rows without a plot description are excluded.
plot        = gba_df.Plot[gba_df.Plot.notna()].str.lower().apply(lambda x: ' '.join([i for i in re.findall(r'[a-zA-Z]+',x) if len(i) > 2]))
# All plot words in order, without the leaked region tags.
# The text is built from `plot`; it was previously derived from
# `gameplay_text` by a copy-paste slip, so the plot statistics were really
# gameplay statistics.
plot_words  = [w for w in ' '.join(plot).split() if w not in SCRAPE_NOISE_WORDS]
# Single text block built from the cleaned words (a string, like
# titles_text / gameplay_text)
plot_text   = ' '.join(plot_words)
# Frequency table of stemmed + lemmatized words, stop words excluded; named
# *_tokens for consistency with title_tokens / gameplay_tokens.
plot_tokens = nltk.FreqDist([lemmatizer.lemmatize(stemmer.stem(i)) for i in plot_words if i not in STOPWORDS])

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Game Titles</h3>



In [None]:
NUMBER_OF_COMPONENTS = 450

# Normalise the titles in two passes: stem every token, then re-tokenize the
# stemmed text and lemmatize whatever is not a stop word.
stemmed_titles = titles.apply(
    lambda doc: ' '.join(stemmer.stem(tok) for tok in word_tokenize(doc))
)
text_data = stemmed_titles.apply(
    lambda doc: ' '.join(
        lemmatizer.lemmatize(tok) for tok in word_tokenize(doc) if tok not in STOPWORDS
    )
)

# Unigram bag-of-words matrix (documents x vocabulary).
CV = CountVectorizer(stop_words='english', ngram_range=(1, 1))
cv = CV.fit_transform(text_data)
C_vector = cv

# Project the count matrix onto NUMBER_OF_COMPONENTS SVD components.
SVD = TruncatedSVD(NUMBER_OF_COMPONENTS)
pc_matrix = SVD.fit_transform(C_vector)

# Per-component and cumulative share of explained variance.
evr = SVD.explained_variance_ratio_
cumsum_evr = evr.cumsum()
total_var = 100 * evr.sum()

# Bars: individual ratios; line: running total.
trace1 = dict(name="individual explained variance", type="bar", y=evr)
trace2 = dict(name="cumulative explained variance", type="scatter", y=cumsum_evr)
data = [trace1, trace2]
layout = dict(
    xaxis=dict(title="Principal components"),
    yaxis=dict(title="Explained variance ratio"),
)
fig = go.Figure(data=data, layout=layout)
fig.update_layout(
    title='{:.2f}% of the Variance in the Titles Can Be Explained Using {} Words'.format(
        total_var, NUMBER_OF_COMPONENTS
    )
)
fig.show()

**Observation**:  As for the inner variability of the Game Boy Advance game titles, we see that we need at least 450 words (dimensions) to describe roughly 80 percent of the variance in the original dimension.
This test tells us that the inner variance of the game titles is very high and that many different names are "far" away from each other, meaning there is no resemblance between any of the names.

In [None]:
# Vocabulary of the fitted CountVectorizer, fetched once.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
_vocabulary_getter = getattr(CV, 'get_feature_names_out', None) or CV.get_feature_names
feature_names = np.asarray(_vocabulary_getter())

# Rank the vocabulary by its loading on the first SVD component (highest first).
first_component = SVD.components_[0]
ranking = first_component.argsort()[::-1]
best_fearures = [[feature_names[i], first_component[i]] for i in ranking]

worddf = pd.DataFrame({'Word': feature_names[ranking[:NUMBER_OF_COMPONENTS]]})
# NOTE(review): `evr` holds one ratio per SVD component, not per word; the
# k-th ratio is simply displayed next to the k-th ranked word - confirm this
# pairing is intended.
worddf['Explained Variance'] =  np.round(evr*100,2)
worddf['Explained Variance'] =worddf['Explained Variance'].apply(lambda x:str(x)+'%')
# Number of titles containing the word (substring match). The counter used to
# start at 1, which overstated every count by one.
worddf['Appeared_On_X_Tweets'] = [
    sum(word in document for document in text_data) for word in worddf.Word
]

fig = go.Figure()
fig.add_trace(
    go.Table(
        header=dict(
            # Closing tags fixed (</b>); the last label names what is counted.
            values=['<b>Word</b>',"<b>Accountable For X% of Variance</b>",'<b>Appeared In X Titles</b>'],
            font=dict(size=19,family="Lato"),
            align="center"
        ),
        cells=dict(
            values=[worddf[k].tolist() for k in ['Word',"Explained Variance",'Appeared_On_X_Tweets']],
            align = "center")
    ),
)

fig.show()

In [None]:
NUMBER_OF_COMPONENTS = 2
# Despite its name, `isomap` is a t-SNE model (name kept for compatibility).
# init='random' is passed explicitly: C_vector is a sparse matrix, and the
# 'pca' initialisation (the default in recent scikit-learn) rejects sparse
# input. random_state makes the embedding reproducible across runs.
isomap = TSNE(NUMBER_OF_COMPONENTS, init='random', random_state=42)
pc_matrix = isomap.fit_transform(C_vector)

# Latest release year per game, used for colouring. -1 marks a missing value
# or a value without a recognisable year (`default` prevents max() from
# raising on an empty match list).
release_year = gba_df['Year Released'].apply(
    lambda x: max(re.findall(r'2[0-9]{3}', x), default=-1) if isinstance(x, str) else -1
)

# assign() returns a new frame, so gba_df itself is left untouched.
dec_df = gba_df.assign(dim_1=pc_matrix[:, 0], dim_2=pc_matrix[:, 1], RY=release_year)
# The source dimension in the title is read from the matrix instead of being
# hard-coded, so it stays correct when the vocabulary changes.
ex.scatter(dec_df,x='dim_1',y='dim_2',
           title='Gameboy Title Projected From R^{} --> R^2'.format(C_vector.shape[1]),
           color='RY',hover_data=['Titles'])

**Observation**:  We can see the high variance we concluded from the previous plot come into play when we inspect the data from a lower-dimensional perspective.
We see that mini clusters are formed, containing different installments of the same game series, but overall the graph does not reveal any interesting relationship between the games.

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Game Gameplay Description</h3>



In [None]:
NUMBER_OF_COMPONENTS = 100

# Normalise the gameplay descriptions in two passes: stem every token, then
# re-tokenize the stemmed text and lemmatize whatever is not a stop word.
stemmed_gameplay = gameplay.apply(
    lambda doc: ' '.join(stemmer.stem(tok) for tok in word_tokenize(doc))
)
text_data = stemmed_gameplay.apply(
    lambda doc: ' '.join(
        lemmatizer.lemmatize(tok) for tok in word_tokenize(doc) if tok not in STOPWORDS
    )
)

# Unigram bag-of-words matrix (documents x vocabulary).
CV = CountVectorizer(stop_words='english', ngram_range=(1, 1))
cv = CV.fit_transform(text_data)
C_vector = cv

# Project the count matrix onto NUMBER_OF_COMPONENTS SVD components.
SVD = TruncatedSVD(NUMBER_OF_COMPONENTS)
pc_matrix = SVD.fit_transform(C_vector)

# Per-component and cumulative share of explained variance.
evr = SVD.explained_variance_ratio_
cumsum_evr = evr.cumsum()
total_var = 100 * evr.sum()

# Bars: individual ratios; line: running total.
trace1 = dict(name="individual explained variance", type="bar", y=evr)
trace2 = dict(name="cumulative explained variance", type="scatter", y=cumsum_evr)
data = [trace1, trace2]
layout = dict(
    xaxis=dict(title="Principal components"),
    yaxis=dict(title="Explained variance ratio"),
)
fig = go.Figure(data=data, layout=layout)
fig.update_layout(
    title='{:.2f}% of the Variance in the Gameplay Description Can Be Explained Using {} Words'.format(
        total_var, NUMBER_OF_COMPONENTS
    )
)
fig.show()

**Observation**:  The plot above reveals a very interesting fact: apparently, we can describe more than 80 percent of the original variance in the Game Boy Advance gameplay descriptions using just 100 words (dimensions), indicating that there are not that many different and unique gameplays when it comes down to Game Boy Advance games.

In [None]:
# Vocabulary of the fitted CountVectorizer, fetched once.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
_vocabulary_getter = getattr(CV, 'get_feature_names_out', None) or CV.get_feature_names
feature_names = np.asarray(_vocabulary_getter())

# Rank the vocabulary by its loading on the first SVD component (highest first).
first_component = SVD.components_[0]
ranking = first_component.argsort()[::-1]
best_fearures = [[feature_names[i], first_component[i]] for i in ranking]

worddf = pd.DataFrame({'Word': feature_names[ranking[:NUMBER_OF_COMPONENTS]]})
# NOTE(review): `evr` holds one ratio per SVD component, not per word; the
# k-th ratio is simply displayed next to the k-th ranked word - confirm this
# pairing is intended.
worddf['Explained Variance'] =  np.round(evr*100,2)
worddf['Explained Variance'] =worddf['Explained Variance'].apply(lambda x:str(x)+'%')
# Number of gameplay descriptions containing the word (substring match). The
# counter used to start at 1, which overstated every count by one.
worddf['Appeared_On_X_Tweets'] = [
    sum(word in document for document in text_data) for word in worddf.Word
]

fig = go.Figure()
fig.add_trace(
    go.Table(
        header=dict(
            # Closing tags fixed (</b>); the last label names what is counted.
            values=['<b>Word</b>',"<b>Accountable For X% of Variance</b>",'<b>Appeared In X Gameplay Descriptions</b>'],
            font=dict(size=19,family="Lato"),
            align="center"
        ),
        cells=dict(
            values=[worddf[k].tolist() for k in ['Word',"Explained Variance",'Appeared_On_X_Tweets']],
            align = "center")
    ),
)

fig.show()

In [None]:
NUMBER_OF_COMPONENTS = 2
# Despite its name, `isomap` is a t-SNE model (name kept for compatibility).
# init='random' is passed explicitly: C_vector is a sparse matrix, and the
# 'pca' initialisation (the default in recent scikit-learn) rejects sparse
# input. random_state makes the embedding reproducible across runs.
isomap = TSNE(NUMBER_OF_COMPONENTS, init='random', random_state=42)
pc_matrix = isomap.fit_transform(C_vector)

# Latest release year per game, used for colouring. -1 marks a missing value
# or a value without a recognisable year (`default` prevents max() from
# raising on an empty match list).
release_year = gba_df['Year Released'].apply(
    lambda x: max(re.findall(r'2[0-9]{3}', x), default=-1) if isinstance(x, str) else -1
)

# Only games that have a gameplay description were vectorised, so the
# embedding rows line up with this subset. `RY` is aligned by index.
dec_df = gba_df[gba_df.Gameplay.notna()].assign(
    dim_1=pc_matrix[:, 0], dim_2=pc_matrix[:, 1], RY=release_year
)
# 'Snood' is excluded from the scatter plot.
dec_df = dec_df[dec_df.Titles != 'Snood']
# The source dimension in the title is read from the matrix instead of being
# hard-coded, so it stays correct when the vocabulary changes.
ex.scatter(dec_df,x='dim_1',y='dim_2',
           title='Gameboy Gameplay Description Projected From R^{} --> R^2'.format(C_vector.shape[1]),
           color='RY',hover_data=['Titles'])

**Observation**:  Looking at the spread of the gameplay descriptions projected onto R^2, we see that many more mini clusters appear. Visually, some games are close together because they belong to the same series, but notice that there is also a cluster containing 'Harry Potter Quidditch', 'NFL' and 'Soccer Game'; overall, sports games are closer together, and action/arcade games are closer together. In comparison to the titles, this information gives us very interesting connections between different games based on their gameplay. In my opinion, a very interesting and effective recommendation system can be created based on these 2 dimensions and KNN.


<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Game Plot Description</h3>



In [None]:
NUMBER_OF_COMPONENTS = 100

# Normalise the plot descriptions in two passes: stem every token, then
# re-tokenize the stemmed text and lemmatize whatever is not a stop word.
stemmed_plot = plot.apply(
    lambda doc: ' '.join(stemmer.stem(tok) for tok in word_tokenize(doc))
)
text_data = stemmed_plot.apply(
    lambda doc: ' '.join(
        lemmatizer.lemmatize(tok) for tok in word_tokenize(doc) if tok not in STOPWORDS
    )
)

# Unigram bag-of-words matrix (documents x vocabulary).
CV = CountVectorizer(stop_words='english', ngram_range=(1, 1))
cv = CV.fit_transform(text_data)
C_vector = cv

# Project the count matrix onto NUMBER_OF_COMPONENTS SVD components.
SVD = TruncatedSVD(NUMBER_OF_COMPONENTS)
pc_matrix = SVD.fit_transform(C_vector)

# Per-component and cumulative share of explained variance.
evr = SVD.explained_variance_ratio_
cumsum_evr = evr.cumsum()
total_var = 100 * evr.sum()

# Bars: individual ratios; line: running total.
trace1 = dict(name="individual explained variance", type="bar", y=evr)
trace2 = dict(name="cumulative explained variance", type="scatter", y=cumsum_evr)
data = [trace1, trace2]
layout = dict(
    xaxis=dict(title="Principal components"),
    yaxis=dict(title="Explained variance ratio"),
)
fig = go.Figure(data=data, layout=layout)
fig.update_layout(
    title='{:.2f}% of the Variance in the Game Plots Can Be Explained Using {} Words'.format(
        total_var, NUMBER_OF_COMPONENTS
    )
)
fig.show()

In [None]:
# Vocabulary of the fitted CountVectorizer, fetched once.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
_vocabulary_getter = getattr(CV, 'get_feature_names_out', None) or CV.get_feature_names
feature_names = np.asarray(_vocabulary_getter())

# Rank the vocabulary by its loading on the first SVD component (highest first).
first_component = SVD.components_[0]
ranking = first_component.argsort()[::-1]
best_fearures = [[feature_names[i], first_component[i]] for i in ranking]

worddf = pd.DataFrame({'Word': feature_names[ranking[:NUMBER_OF_COMPONENTS]]})
# NOTE(review): `evr` holds one ratio per SVD component, not per word; the
# k-th ratio is simply displayed next to the k-th ranked word - confirm this
# pairing is intended.
worddf['Explained Variance'] =  np.round(evr*100,2)
worddf['Explained Variance'] =worddf['Explained Variance'].apply(lambda x:str(x)+'%')
# Number of plot descriptions containing the word (substring match). The
# counter used to start at 1, which overstated every count by one.
worddf['Appeared_On_X_Tweets'] = [
    sum(word in document for document in text_data) for word in worddf.Word
]

fig = go.Figure()
fig.add_trace(
    go.Table(
        header=dict(
            # Closing tags fixed (</b>); the last label names what is counted.
            values=['<b>Word</b>',"<b>Accountable For X% of Variance</b>",'<b>Appeared In X Plot Descriptions</b>'],
            font=dict(size=19,family="Lato"),
            align="center"
        ),
        cells=dict(
            values=[worddf[k].tolist() for k in ['Word',"Explained Variance",'Appeared_On_X_Tweets']],
            align = "center")
    ),
)

fig.show()

In [None]:
NUMBER_OF_COMPONENTS = 2
# Despite its name, `isomap` is a t-SNE model (name kept for compatibility).
# init='random' is passed explicitly: C_vector is a sparse matrix, and the
# 'pca' initialisation (the default in recent scikit-learn) rejects sparse
# input. random_state makes the embedding reproducible across runs.
isomap = TSNE(NUMBER_OF_COMPONENTS, init='random', random_state=42)
pc_matrix = isomap.fit_transform(C_vector)

# Latest release year per game, used for colouring. -1 marks a missing value
# or a value without a recognisable year (`default` prevents max() from
# raising on an empty match list).
release_year = gba_df['Year Released'].apply(
    lambda x: max(re.findall(r'2[0-9]{3}', x), default=-1) if isinstance(x, str) else -1
)

# Only games that have a plot description were vectorised, so the embedding
# rows line up with this subset. `RY` is aligned by index.
dec_df = gba_df[gba_df.Plot.notna()].assign(
    dim_1=pc_matrix[:, 0], dim_2=pc_matrix[:, 1], RY=release_year
)
# The source dimension in the title is read from the matrix instead of being
# hard-coded, so it stays correct when the vocabulary changes.
ex.scatter(dec_df,x='dim_1',y='dim_2',
           title='Gameboy Plot Description Projected From R^{} --> R^2'.format(C_vector.shape[1]),
           color='RY',hover_data=['Titles'])

<h3 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Generating New Gameplay Formats</h3>



In [None]:
# Build the word-level vocabulary for the text generator.
# NOTE: `titles` is rebound here - from this cell on it holds the raw gameplay
# descriptions, not the cleaned game titles.
titles = gba_df.Gameplay.dropna()

# Every description joined into one stream, split on single spaces.
FTD = ' '.join(titles).split(' ')

# Distinct words in order of first appearance.
vocab = list(nltk.FreqDist(FTD))
vocab_size = len(vocab)

# Two-way lookup between a word and its integer id.
word_index = dict(zip(vocab, range(vocab_size)))
index_word = {idx: word for word, idx in word_index.items()}

In [None]:
# Cut the word stream into training pairs: a window of T consecutive words
# (S) and the single word that follows it (C).
S = []
C = []
stride = 25  # step between window starts; equal to T, so windows do not overlap
T = 25       # window length in words
for i in range(0, len(FTD) - T, stride):
    S.append(FTD[i: i + T])
    C.append(FTD[i + T])

# One-hot encodings: X is (windows, T, vocab), Y is (windows, vocab).
X = np.zeros((len(S), T, vocab_size), dtype='bool')
Y = np.zeros((len(S), vocab_size), dtype='bool')
# tqdm wraps the list itself so the progress bar knows the total.
for i, seq in enumerate(tqdm(S)):
    for t, word in enumerate(seq):
        X[i, t, word_index[word]] = 1
    # The target depends only on the window, so it is set once per window
    # (it was previously re-assigned on every word of the inner loop).
    Y[i, word_index[C[i]]] = 1

In [None]:
# Next-word model: one-hot windows of T words -> LSTM(64) -> dropout ->
# softmax over the whole vocabulary.
lstm_nn = Sequential([
    Input((T, vocab_size)),
    LSTM(64),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax'),
])

lstm_nn.compile(loss='categorical_crossentropy', optimizer='adam')
lstm_nn.summary()

In [None]:
# Train for 500 epochs in batches of 128. Keras' own logging is silenced and
# progress is reported through the tqdm callback instead.
history = lstm_nn.fit(
    X,
    Y,
    batch_size=128,
    epochs=500,
    verbose=0,
    callbacks=[TqdmCallback(verbose=0)],
)

In [None]:
# Training loss recorded by fit(), one point per epoch.
training_loss = history.history['loss']
plt.plot(training_loss, '.-')
plt.ylabel('loss', fontsize=14)
plt.show()

In [None]:
# Generate sample texts: seed the model with a random training window and let
# it sample the following words one at a time.
N_SAMPLES = 10     # number of texts to generate
N_NEW_WORDS = 10   # words sampled after each seed window

generated = []
for itr in tqdm(range(0, N_SAMPLES)):
    # randint's upper bound is exclusive, so len(X) makes every window
    # eligible (len(X)-1 could never pick the last one and raised when only
    # a single window existed).
    start = np.random.randint(0, len(X))
    input_buffer = X[start]
    generated_text = S[start].copy()

    for _ in range(N_NEW_WORDS):
        # verbose=0 keeps Keras from printing a progress line per prediction.
        yhat = lstm_nn.predict(input_buffer[None,:], verbose=0)[0]
        # Renormalise in float64 so rounding in the softmax output cannot
        # make np.random.choice reject the probabilities.
        probs = np.asarray(yhat, dtype='float64')
        probs = probs / probs.sum()
        # Sample the next word instead of taking the argmax.
        ix = np.random.choice(vocab_size, p=probs)

        generated_text.append(index_word[ix])

        # Slide the window: drop the oldest step, append the sampled word as
        # a new one-hot row. np.r_ builds a new array, so X is not modified.
        next_step = np.zeros((1, vocab_size))
        next_step[0, ix] = 1
        input_buffer = np.r_[input_buffer[1:,:], next_step]

    generated.append(' '.join(generated_text))

In [None]:
# Print every generated text with a 1-based running number.
for position, text in enumerate(generated, start=1):
    print(position, ') ', text)