In [None]:
# https://www.kaggle.com/terminus7/pokemon-challenge

import datetime
from functools import partial

import numpy as np
import pandas as pd
from IPython.display import display

## Load data

In [None]:
# combats.csv is read with the default integer index.
combats = pd.read_csv('combats.csv')
# pokemon.csv: the first column of the file becomes the DataFrame index.
pokemon = pd.read_csv('pokemon.csv', index_col=0)

In [None]:
# Attach a synthetic fight date to every combat so the date features shown
# later have something to work on.
startdate = datetime.datetime(2017, 1, 14)
# Seeded, local generator: Restart & Run All reproduces exactly the same dates.
date_rng = np.random.RandomState(42)
# One offset per combat, 1..364 days after startdate (upper bound is exclusive).
day_offsets = date_rng.randint(1, 365, size=combats.shape[0])
random_dates = pd.Timestamp(startdate) + pd.to_timedelta(day_offsets, unit='D')
combats['date'] = random_dates

In [None]:
# Column dtypes of combats, now including the 'date' column added above.
combats.dtypes

In [None]:
# First rows of the pokemon table.
pokemon.head()

In [None]:
# Last rows of the combats table.
combats.tail()

## DataFrame vs Series

In [None]:
# Indexing with a list of column labels returns a DataFrame.
pokemon[['HP', 'Attack']].head()

In [None]:
# Attribute access (or a single label in brackets) returns a Series.
pokemon.HP.head() # or pokemon['HP'].head()

In [None]:
# A one-element list of labels still returns a DataFrame, not a Series.
pokemon[['HP']].head()

## Basic statistics

In [None]:
# Summary statistics for the numeric columns.
pokemon.describe()

In [None]:
# Column-wise mean. numeric_only=True is required because the frame also holds
# string columns (e.g. Name, Type 1); without it pandas >= 2.0 raises TypeError.
pokemon.mean(numeric_only=True) # median, std, quantile ...

In [None]:
# 90th percentile of HP.
pokemon.HP.quantile(q=0.9)

In [None]:
# isnull() returns a boolean frame with the same shape as pokemon.
display(pokemon.isnull().head())
# Summing the booleans column-wise counts the missing values per column.
pokemon.isnull().sum()

In [None]:
# Rows whose Name is missing.
pokemon[pokemon.Name.isnull()]

## Access data

In [None]:
# loc looks a row up by index label (label 1); iloc by position (first row).
# NOTE(review): these are the same row only if the index starts at 1 — confirm against pokemon.csv.
display(pokemon.loc[1])
display(pokemon.iloc[0])

In [None]:
# Four ways to read a single cell: label/label, position/position,
# position then label, position then attribute.
# NOTE(review): iloc[0, 0] matches the others only if 'Name' is the first column — confirm.
display(pokemon.loc[1, 'Name'])
display(pokemon.iloc[0, 0])
display(pokemon.iloc[0]['Name'])
display(pokemon.iloc[0].Name)

## Select data

In [None]:
# Boolean Series: True where HP is above the 90th-percentile HP.
pokemons_with_big_hp = pokemon.HP > pokemon.HP.quantile(q=0.9)

display(pokemons_with_big_hp.head())

# Indexing the frame with the boolean Series keeps only the True rows.
display(pokemon[pokemons_with_big_hp].head())

In [None]:
# Element-wise AND of two masks: above the 90th percentile in both HP and Attack.
pokemons_with_big_hp_and_attack = (
    pokemon.HP.gt(pokemon.HP.quantile(q=0.9))
    & pokemon.Attack.gt(pokemon.Attack.quantile(q=0.9))
)

display(pokemons_with_big_hp_and_attack.head())

display(pokemon[pokemons_with_big_hp_and_attack].head())

In [None]:
# query() takes the filter as an expression string; fixed thresholds of 100 here.
pokemon.query('HP > 100 & Attack > 100').head()

## String data

In [None]:
# Reminder that some names are missing; the string filters below have to handle them.
pokemon[pokemon.Name.isnull()]

In [None]:
# na=False maps missing names to False so the result is usable as a boolean mask.
pokemon[pokemon.Name.str.endswith('aur', na=False)]

In [None]:
# Are pokemon with similar name endings similar to each other?

In [None]:
# Ten most frequent three-character name endings.
pokemon.Name.str.slice(-3).value_counts().head(10)

In [None]:
# Store the last three characters of each name as a new column.
pokemon['name_ending'] = pokemon.Name.str.slice(-3)

In [None]:
# split() with no arguments splits on whitespace; each element becomes a list of words.
pokemon.Name.str.split().head()

In [None]:
# Number of whitespace-separated words in each name, stored as a new column.
pokemon['n_words_name'] = pokemon.Name.str.split().str.len()

In [None]:
# expand=True spreads the words over separate columns instead of returning lists.
pokemon.Name.str.split(expand=True).head()

In [None]:
# partition() splits at the first separator (a space by default) into
# three parts: before, the separator itself, and after.
pokemon.Name.str.partition().head()

In [None]:
# Plain-Python counterparts of the .str methods used above.
string = 'x = a + b + c'
# From the left: split on every '+', split on the first '+' only, and
# partition (which also returns the separator).
print(string.split('+'), string.split('+', 1), string.partition('+'), sep='\n')

# From the right: the r-variants work from the last '+'.
print(string.rsplit('+', 1), string.rpartition('+'), sep='\n')

In [None]:
# How many names have one, two, three ... words.
pokemon.n_words_name.value_counts()

In [None]:
# Examples of three-word names.
pokemon[pokemon.n_words_name == 3].head(10)

## Custom functions

In [None]:
# Same 'aur' filter written with apply(): the null check comes first so
# endswith() is only ever called on real strings.
pokemon[pokemon.Name.apply(lambda name: pd.notnull(name) and name.endswith('aur'))]

In [None]:
#  actually maybe a bit better way is

def check_ending(word, postfix):
    """Return True when ``word`` ends with ``postfix``.

    Missing values (anything ``pd.isnull`` reports as null) yield False
    instead of raising, so the function can be applied to a column that
    contains NaN.
    """
    has_value = not pd.isnull(word)
    return has_value and word.endswith(postfix)


# The lambda only pins the suffix; null handling lives inside check_ending.
pokemon[pokemon.Name.apply(lambda x: check_ending(x, 'aur'))]

In [None]:
# partial (imported in the notebook's import cell) pre-binds the suffix, giving
# a one-argument callable that can be passed to apply() without a lambda.
check_ending_aur = partial(check_ending, postfix='aur')
pokemon[pokemon.Name.apply(check_ending_aur)]

In [None]:
# Vectorised column arithmetic: adds the two columns element-wise.
display((pokemon.HP + pokemon.Attack).head())

# The same sum computed one row at a time with apply(axis=1).
# NOTE: row-wise apply is typically much slower; prefer the vectorised form when one exists.
display(pokemon.apply(lambda x: x.HP + x.Attack, axis=1).head())

## Dates

In [None]:
# The .dt accessor exposes datetime components of the column.
combats['month'] = combats.date.dt.month # also: hour, dayofweek, ... (see the .dt accessor docs)
combats.head()

In [None]:
# pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated in
# pandas 1.0 and removed in 2.0.
current_date = pd.Timestamp.now()
# Subtracting a datetime column from a timestamp yields a timedelta column.
combats['time_since_fight'] = current_date - combats.date
print(current_date)
combats.head()

In [None]:
# Check the dtypes of the new 'month' and 'time_since_fight' columns.
combats.dtypes

In [None]:
# Whole-day component of each timedelta.
combats.time_since_fight.dt.days.head()

In [None]:
%%timeit
# Baseline timing: whole days via the .dt accessor.
combats.time_since_fight.dt.days

In [None]:
%%timeit
combats.time_since_fight.astype('timedelta64[D]')

# !!!!!!! much faster

## Sorting

In [None]:
# Sort by 'Sp. Atk' first and break ties with 'Sp. Def', both descending.
pokemon.sort_values(by=['Sp. Atk', 'Sp. Def'], ascending=False).head(10)

## Group By

In [None]:
# Four spellings of "mean HP per primary type".
# Attribute and bracket column selection both give a Series.
display(pokemon.groupby(['Type 1']).HP.mean())

display(pokemon.groupby(['Type 1'])['HP'].mean())

# agg() with a {column: function} dict gives a DataFrame; the function can be
# named by string ...
display(pokemon.groupby(['Type 1']).agg({'HP': 'mean'}))

# ... or passed as a callable.
display(pokemon.groupby(['Type 1']).agg({'HP': np.mean}))

In [None]:
# Mean HP per primary type, rounded to two decimals, lowest first.
(
    pokemon
    .groupby(['Type 1'])
    .agg({'HP': 'mean'})
    .round(2)
    .sort_values(by='HP')
)

In [None]:
# Not useful yet: head(1) keeps the first row of each group in the frame's
# current row order, and that order carries no meaning until we sort.
pokemon.groupby('Type 1').head(1)

In [None]:
# After sorting by Attack (descending), head(1) per group picks the
# highest-Attack pokemon of each primary type.
# NOTE: this rebinds `pokemon`, so every later cell sees the rows in sorted order.
pokemon = pokemon.sort_values(by='Attack', ascending=False)
pokemon.groupby('Type 1').head(1)

In [None]:
# Mean HP per (Type 1, Type 2) combination, rounded, highest first.
(
    pokemon
    .groupby(['Type 1', 'Type 2'])
    .agg({'HP': 'mean'})
    .round(2)
    .sort_values(by='HP', ascending=False)
)

In [None]:
# Per-column aggregation spec. Each value may be a function name, a callable,
# or a list of either; a list produces one output column per function.
agg_funcs = {
    'HP': 'mean',
    'Name': 'count',
    'Attack': ['min', 'max', 'median'],
    'Speed': ['std', lambda x: x.sum()],
    'Defense': lambda x: np.percentile(x, 90)  # 90th percentile within the group
}

# One row per (Type 1, Type 2) combination, values rounded to two decimals.
pokemon_stats = pokemon.groupby(['Type 1', 'Type 2']).agg(agg_funcs).round(2)

pokemon_stats.head()

## Index

In [None]:
# Inspect the column labels and the first five row labels of the grouped result.
print(pokemon_stats.columns)
print(pokemon_stats.index[:5])

In [None]:
# The sort key is given as a (column, aggregation) tuple.
pokemon_stats.sort_values(by=[('HP', 'mean')], ascending=False).head()

## Joins

In [None]:
# Boolean column: True when the first pokemon of the pair is the winner.
combats['Winner_first'] = combats.First_pokemon == combats.Winner
combats.head()

In [None]:
# Copy the index into a regular column so it can be named as a merge key.
pokemon['pokeindex'] = pokemon.index

In [None]:
# Compare the value ranges of the two key columns before joining on them.
display(pokemon.pokeindex.describe())

display(combats.First_pokemon.describe())

In [None]:
# Attach the first fighter's attributes to each combat (merge defaults to an inner join).
df = combats.merge(pokemon, left_on='First_pokemon', right_on='pokeindex')

In [None]:
# Attach the second fighter's attributes. Column names present on both sides
# get the suffixes: _1 for the first pokemon, _2 for the second.
df = df.merge(pokemon, left_on='Second_pokemon', right_on='pokeindex', suffixes=('_1', '_2'))

In [None]:
# First rows of the combined combat + both-fighters table.
df.head()

In [None]:
# Column labels after both merges, showing the _1 / _2 suffixes.
df.columns

In [None]:
# Share of fights won by the first pokemon, and number of fights, for each
# (first type, second type) pairing; best pairings for the first pokemon on top.
(
    df
    .groupby(['Type 1_1', 'Type 1_2'])
    .agg({'Winner_first': ['mean', 'count']})
    .sort_values(by=('Winner_first', 'mean'), ascending=False)
)

## Concatenation

In [None]:
# Independent copy holding only the two primary types and the outcome.
sample = df[['Type 1_1', 'Type 1_2', 'Winner_first']].copy()

In [None]:
# Mirror image of every fight: swap the two type columns and invert the outcome.
# Building a new frame avoids assigning into a slice of `sample`
# (SettingWithCopyWarning). `~` keeps Winner_first boolean, whereas `1 - bool`
# yields integers and the later concat would then mix bool and int64 columns.
sample_reverse = pd.DataFrame({
    'Type 1_1': sample['Type 1_2'],
    'Type 1_2': sample['Type 1_1'],
    'Winner_first': ~sample['Winner_first'],
})

In [None]:
# Stack the mirrored and original fights row-wise. Row labels are kept as-is,
# so each label appears twice in the result.
sample_symmetric = pd.concat([sample_reverse, sample], axis=0)

In [None]:
# Win rate and fight count per type pairing on the symmetric sample.
(
    sample_symmetric
    .groupby(['Type 1_1', 'Type 1_2'])
    .agg({'Winner_first': ['mean', 'count']})
    .sort_values(by=('Winner_first', 'mean'), ascending=False)
)

## Pivoting

In [None]:
# Win-rate matrix: rows are the first pokemon's type, columns the second's,
# and each cell is the mean of Winner_first.
sample_symmetric.pivot_table(
    index='Type 1_1',
    columns='Type 1_2',
    values='Winner_first',
    aggfunc='mean',
)

## Categorical data

### cut

In [None]:
# Bin HP into 5 equal-width intervals and store the interval as a new column.
pokemon['HP_category'] = pd.cut(pokemon.HP, bins=5)
pokemon.head()

In [None]:
# Mean Attack per HP bin. HP_category is categorical (from pd.cut), so state
# `observed` explicitly: observed=False keeps bins that contain no pokemon in
# the result, and avoids the FutureWarning about the changing default.
pokemon.groupby('HP_category', observed=False).Attack.mean()

### get dummies

In [None]:
# Number of pokemon per primary type.
pokemon['Type 1'].value_counts()

In [None]:
# One indicator column per 'Type 1' value, named type_<value>; they take the
# place of the original 'Type 1' column in the returned frame.
pokemon_type_cats = pd.get_dummies(pokemon, columns=['Type 1'], prefix='type')
pokemon_type_cats.head()

In [None]:
# Per generation: the share of pokemon of a few selected primary types (the
# mean of an indicator column is a share) and the number of named pokemon.
gen_stats = pokemon_type_cats.groupby('Generation').agg(
    dict(
        [(indicator, 'mean') for indicator in ('type_Ghost',
                                               'type_Grass',
                                               'type_Ground',
                                               'type_Ice',
                                               'type_Psychic')]
        + [('Name', 'count')]
    )
)
gen_stats

## Numpy connection

In [None]:
# .values exposes the frame's data as a NumPy array (to_numpy() is the newer spelling).
pokemon.values

## Saving

In [None]:
# Keep the index when saving: after the groupby it holds the Generation each
# row belongs to, and index=False would write the statistics without that key.
gen_stats.to_csv('generation_stats.csv', index=True)