In [8]:
import os

import numpy as np
from matplotlib import pyplot as plt
import matplotlib.dates as mdates

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import string

import nltk

from operator import itemgetter

import pandas as pd

import datetime

import seaborn as sns

In [2]:
data = pd.read_csv('appstore_games.csv', encoding = "ISO-8859-1")

In [3]:
description = data["Description"]
keywords = []

#remove capital letters and punctuation
for i in range(len(description)):
    s = description[i].lower().replace("\\n",' ').translate(str.maketrans('', '', string.punctuation)).split()
    keywords.append(s)


In [4]:
flat_keywords = [item for sublist in keywords for item in sublist]

#remove numbers and words containing numbers
flat_keywords = [x for x in flat_keywords if not any(c.isdigit() for c in x)]

keywords_set = set(flat_keywords)
flat_keywords[0:10]

['join', 'over', 'of', 'our', 'fans', 'and', 'download', 'one', 'of', 'our']

In [5]:
def most_common_by_POS(word_list, POS, output_len):
    """This function takes a list of non-unique words and a part of speach tag (e.i. noun "N" or adjective "J")
    and returns a list of tuples with the most common words and their number of appearences in the word_list."""
   
    keyword_set = set(word_list)
    keywords_str = " ".join(keywords_set)
    selected_POS = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(keywords_str)) if pos[0] == POS]
    
    selected_set = set(selected_POS)
    
    word_count = {}
    for i in word_list:
        if i in selected_set:
            word_count[i] = word_count.get(i, 0 ) + 1
    
    output_list = sorted(list(word_count.items()), key=itemgetter(1), reverse=True)
    
    if len(selected_POS) < output_len:
        return output_list
    else:
        return output_list[:output_len]
            
        
    

Here we clean the data to remove all stop words and words containing numbers. We also make all words lower case and remove punctuation. This creates a "bag of words" for each description.

In [73]:
cleaned_descriptions = [i.lower().replace("\\n",' ').translate 
                        (str.maketrans('', '', string.punctuation)) for i in description]
no_digits = []
for doc in cleaned_descriptions:
    no_digits.append(' '.join(word for word in doc.split() if not any(c.isdigit() for c in word)))

stop_words = set(nltk.corpus.stopwords.words('English'))

clean = []

for doc in no_digits:
    cleaned_doc = ''
    for word in doc.split():
        if word not in stop_words:
            cleaned_doc += ' ' + word
    clean.append(cleaned_doc)

bow = clean

['Sudoku',
 'Reversi',
 'Morocco',
 'Sudoku (Free)',
 'Senet Deluxe',
 'Sudoku - Classic number puzzle',
 'Gravitation',
 'Colony',
 'Carte',
 '"Barrels O\' Fun"',
 'Quaddraxx',
 'Lumen Lite',
 'BubblePop',
 'Marple',
 'Tetravex Lite',
 'Awele/Oware - Mancala HD',
 'Awele/Oware - Mancala HD',
 'Chess Game',
 'Catcha Mouse',
 'Cool Sudoku, Jigsaw, Killer',
 'Mind the Corners',
 'All You Can Eat',
 'Ane Rouge',
 'Neiscat',
 'Boomshine',
 'Gaia Lite',
 'Fieldrunners',
 'Lux Touch 3 - World Domination',
 '"Don\'t Square"',
 'Smart Rummy',
 'Expert Sudoku',
 'Edgewise',
 'Chess Genius',
 'Chess - tChess Pro',
 'Toobz',
 'Color Sudoku',
 'Chess - tChess Lite',
 'Checkers Online Lite',
 'TapDefense',
 'Chinese Chess (Xiangqi)',
 'Warfare Incorporated',
 'Strategery',
 '5x5 Shogi (MiniShogi) K55',
 'Deep Green Chess',
 'Jewel Lines',
 'Lux DLX 3 - Map Conquest Game',
 'ChartFight SP',
 'Robo Logic',
 'RoboLogic Lite',
 'Kakinoki Shogi (Japanese Chess)',
 'Kings Corners Free',
 'Flip Ninja',
 '

In [79]:
bow_dict = { name: [doc] for (name ,doc) in list(zip(data["Name"], bow))}
Name = list(bow_dict.keys())

bow_df = pd.DataFrame.from_dict(bow_dict).transpose()
bow_df.columns = ["Description"]
bow_df["Name"] = Name
#bow_df = bow_df.sort_index()
bow_df

ValueError: Length mismatch: Expected axis has 16847 elements, new values have 1 elements

Here we use sklearn to create a document term matrix in a pandas dataframe.

In [58]:
vec = CountVectorizer()
count_array = vec.fit_transform(bow_df["Description"])
doc_term = pd.DataFrame(count_array.toarray(), columns=vec.get_feature_names())

In [59]:
doc_term.head(10)

Unnamed: 0,aa,aaa,aaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaa,aab,aachen,aadu,aaduction,aah,aaiy,...,zxfcrich,zyklus,zynatook,zynga,zyrobotics,zyzzyvas,zzap,zzed,zzz,zzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
term_doc = doc_term.transpose()
term_doc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16837,16838,16839,16840,16841,16842,16843,16844,16845,16846
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaaaaa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaaaaaaaaaaaaaaaaaaaaaaa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
data['Name']

0                                Sudoku
1                               Reversi
2                               Morocco
3                         Sudoku (Free)
4                          Senet Deluxe
5        Sudoku - Classic number puzzle
6                           Gravitation
7                                Colony
8                                 Carte
9                      "Barrels O' Fun"
10                            Quaddraxx
11                           Lumen Lite
12                            BubblePop
13                               Marple
14                        Tetravex Lite
15             Awele/Oware - Mancala HD
16             Awele/Oware - Mancala HD
17                           Chess Game
18                         Catcha Mouse
19          Cool Sudoku, Jigsaw, Killer
20                     Mind the Corners
21                      All You Can Eat
22                            Ane Rouge
23                              Neiscat
24                            Boomshine


In [7]:
keywords_str = 'the cat enjoyed the warmer weather.'
selected_POS = [(word, pos) for (word, pos) in nltk.pos_tag(nltk.word_tokenize(keywords_str))]
selected_POS

[('the', 'DT'),
 ('cat', 'NN'),
 ('enjoyed', 'VBD'),
 ('the', 'DT'),
 ('warmer', 'NN'),
 ('weather', 'NN'),
 ('.', '.')]