# Dataframes and data visualization

## Setup Steps

This assumes you have a working Jupyter setup on your local machine. If you don't have these packages installed, go ahead and open the "Anaconda Navigator" application and install these packages: plotly, pandas, numpy, matplotlib. You may need to reload your VS Code window after installation for those packages to work.

This Kaggle notebook is a huge help, and a number of our cells come from it: https://www.kaggle.com/code/orhansertkaya/visualizing-pok-mon-stats-with-plotly/notebook



## Loading data into Pandas

If you are using a repl, copy the contents of the csv into a new file in your repl and name it `pokemon_data.csv`.

In [None]:
import pandas as pd

# Load the Pokemon dataset from the csv file into a dataframe
df = pd.read_csv(filepath_or_buffer='pokemon_data.csv')

Pandas can easily read from different file formats. Here are some examples of what it looks like to read from other files. Don't run this code: those files don't exist here, so it will raise an error.

In [None]:
# Example of reading excel data
# df_xlsx = pd.read_excel('pokemon_data.xlsx')
# df_xlsx.head()

# Example of reading txt data
# df_txt = pd.read_csv('pokemon_data.txt', delimiter='\t')
# df_txt.head()

## Understanding Data in Pandas

In [None]:
# Show the column labels (headers) of the dataframe. This is the only
# live statement in the cell; uncomment any example below to try it.
df.columns

# Read these columns from all rows
# print(df[['Name', 'Type 1', 'HP']])

# Read the first four rows by position (an iloc slice excludes its end index)
# print(df.iloc[0:4])

# Read every row, one at a time
# for index, row in df.iterrows():
#     print("\nNew Row")
#     print(index, row)

# # Read a single value by position. iloc is zero-based, so [10, 2] is
# # the 11th row and the 3rd column.
# print(df.iloc[10, 2])

# # Read a range of rows (positions 1 to 4; the end index 5 is excluded)
# print(df.iloc[1:5])

# # Read a specific column (position 1) from a range of rows
# print(df.iloc[1:5, 1])

# # Read a range of columns (positions 1 and 2) from a range of rows
# print(df.iloc[1:5, 1:3])

## Sampling Data


In [None]:
# Tally how many rows fall into each distinct value of the 'Generation' column
df['Generation'].value_counts()

In [None]:
# Number of legendary pokemon: the sum of the 'Legendary' column.
# NOTE(review): presumably the column holds True/False flags, so the sum
# counts the True rows - confirm against the csv.
Legendary = df.Legendary.sum()
print(f"Pokemon that are considered legendary: {Legendary}")

# Number of pokemon overall: count() tallies the non-null entries of the
# '#' column.
total_pokemon = df.loc[:, '#'].count()
print(f"Total pokemon: {total_pokemon}")

## Sorting/Describing Data

In [None]:
# Sort by type and then by health points (HP): 'Type 1' alphabetical
# ascending, HP numeric ascending (use False in the list for descending).
# sort_values returns a new dataframe instead of sorting in place, so the
# result has to be kept; otherwise the preview below would still show
# the original, unsorted order.
sorted_df = df.sort_values(['Type 1', 'HP'], ascending=[True, True])

sorted_df.head(100)

In [None]:
# For every primary type ('Type 1'), tally how often each secondary
# type ('Type 2') occurs within that group.
type_pairs = df.groupby('Type 1')['Type 2']
type_pairs.value_counts()

## Making changes to the data

In [None]:
# Create a new column whose value is calculated for every row: the sum of
# the six stat columns.
df['Total'] = (
    df['HP'] + df['Attack'] + df['Defense']
    + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']
)

# Check out the new column. Only the last expression of a cell is
# displayed automatically, so this intermediate preview is printed.
print(df.head())

# Create another new column
df['Sp. Power'] = df['Sp. Atk'] + df['Sp. Def']

# Drop the newly created column again; drop returns a new dataframe, so
# the result is assigned back to df.
df = df.drop(columns=['Sp. Power'])

# The 'Sp. Power' column should be gone now
df.head()

## Saving our Data (Exporting into Desired Format)

As we described earlier in the lesson, pandas can easily read data from different file formats. It can also __export__ to these formats just as easily.

In [None]:
# df.to_csv('modified.csv', index=False)

# df.to_excel('modified.xlsx', index=False)

# df.to_csv('modified.txt', index=False, sep='\t')

## Filtering Data

In [None]:
# Row filters on a dataframe are combined with & (element-wise AND), not
# Python's "and"; each comparison needs its own parentheses.
is_match = (
    (df['Type 1'] == 'Grass')
    & (df['Type 2'] == 'Poison')
    & (df['HP'] > 70)
)

# Keep the matching rows and renumber them from 0 (drop=True discards the
# old row numbers instead of keeping them as a column).
new_df = df.loc[is_match].reset_index(drop=True)

# Preview the result. Only the last expression of a cell is displayed
# automatically, so this one is printed.
print(new_df.head())

# Save the filtered data to a new csv. index=False keeps the row numbers
# out of the file; without it they are written as an extra unnamed column.
new_df.to_csv('filtered.csv', index=False)



## Conditional Changes

In [None]:

# Conditionally change columns. The edit is made on a copy so that the
# placeholder strings do not end up in df itself: the charting cells
# below group and filter df by 'Generation' and would otherwise pick up
# 'Test 1' as if it were a generation.
demo_df = df.copy()

# The replacement values are strings, so make the target columns
# general-purpose (object) first; writing strings into a numeric or
# boolean column is deprecated in recent pandas versions.
changed_columns = ['Generation', 'Legendary']
demo_df[changed_columns] = demo_df[changed_columns].astype(object)

# Every row with a Total above 500 gets the two placeholder values
demo_df.loc[demo_df['Total'] > 500, changed_columns] = ['Test 1', 'Test 2']

# Display the columns involved in the change
demo_df[['Name', 'Total'] + changed_columns]

## Aggregate Statistics (Groupby)


In [None]:
# Add a helper column that holds the constant 1 in every row
df['count'] = 1

# Count the rows of each ('Type 1', 'Type 2') combination, reading the
# tally from the helper column
df.groupby(['Type 1', 'Type 2'])['count'].count()






## Working with large amounts of data



In [None]:
# Read the csv five rows at a time (chunksize=5) and count the non-null
# values of every column per 'Type 1' within each chunk.
chunk_counts = [
    chunk.groupby('Type 1').count()
    for chunk in pd.read_csv('pokemon_data.csv', chunksize=5)
]

# Concatenate the partial results once, after the loop, instead of
# growing a dataframe on every iteration. The same type can appear in
# many chunks, so add the partial counts together to get one row per type.
new_df = pd.concat(chunk_counts).groupby(level=0).sum()


### Let's do some basic data visualization

If you want a quick refresher on different types of charts/graphs and when to use them, read [this](https://www.tableau.com/learn/whitepapers/which-type-chart-or-graph-right-for-you-ungated)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

### Line Chart

Let's demonstrate trends in stats like HP or Attack across different classifiers, such as the Pokemon's Generation.

In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Plot the mean HP, Attack and Defense of each generation, one line per stat.
stats_names = ['HP', 'Attack', 'Defense']
stats = df.groupby("Generation")[stats_names].mean().reset_index()

# One Scatter trace per stat, all sharing the generation values on the x axis
data = [
    go.Scatter(
        x=stats["Generation"],
        y=stats[stat],
        name=stat,
        line=dict(width=3),
    )
    for stat in stats_names
]

layout = go.Layout(
    title='Trend of stats by {}'.format("Generation"),
    xaxis=dict(title="Generation"),
    yaxis=dict(title='Values'),
)

trend = go.Figure(data=data, layout=layout)
iplot(trend, filename='trend')


# The stats_by function defined in the next cell wraps this logic, e.g.:
# stats_by('Generation', ['HP', 'Attack', 'Defense'])
# stats_by('Type 1', ['HP', 'Attack', 'Defense'])

In [None]:
# Reusable version of the line chart: the grouping column and the stats
# to plot are passed in as arguments.

def stats_by(classifier, stats_names):
    """Plot the mean of each stat in ``stats_names`` per ``classifier`` value.

    Reads the notebook-level dataframe ``df``, groups it by the
    ``classifier`` column, averages the ``stats_names`` columns and draws
    one line per stat with ``iplot``.

    Args:
        classifier: Name of the column to group by (used for the x axis).
        stats_names: List of column names to average and plot.
    """
    averages = df.groupby(classifier)[stats_names].mean().reset_index()

    lines = [
        go.Scatter(
            x=averages[classifier],
            y=averages[stat_name],
            name=stat_name,
            line=dict(width=3),
        )
        for stat_name in stats_names
    ]

    figure = go.Figure(
        data=lines,
        layout=go.Layout(
            title='Trend of stats by {}'.format(classifier),
            xaxis=dict(title=classifier),
            yaxis=dict(title='Values'),
        ),
    )
    iplot(figure, filename='trend')

stats_by('Type 1', ['Sp. Atk', 'Sp. Def', 'Speed'])

### Bar Chart

Now let's inspect the number of Pokemon in each type (i.e. Water or Fire).

In [None]:
# Colour (hex code) used for each Pokemon type in the charts below,
# keyed by type name.
colors = dict(
    Bug="#A6B91A",
    Dark="#705746",
    Dragon="#6F35FC",
    Electric="#F7D02C",
    Fairy="#D685AD",
    Fighting="#C22E28",
    Fire="#EE8130",
    Flying="#A98FF3",
    Ghost="#735797",
    Grass="#7AC74C",
    Ground="#E2BF65",
    Ice="#96D9D6",
    Normal="#A8A77A",
    Poison="#A33EA1",
    Psychic="#F95587",
    Rock="#B6A136",
    Steel="#B7B7CE",
    Water="#6390F0",
)

In [None]:
# Visualizing the number of pokemon per primary type across all generations.
# Grouping by 'Type 1' and counting the '#' column gives one count per type.
types = df.groupby('Type 1')['#'].count()
types_name = list(types.index)

# Look each bar's colour up by its type name rather than relying on the
# colors dict having the same order and length as the types in the data.
# A type without an entry in colors falls back to a neutral grey.
bar_colors = [colors.get(type_name, '#888888') for type_name in types_name]

data = go.Bar(
    x=types_name,
    y=types.values,
    marker=dict(color=bar_colors),
    # Short trace label instead of the stringified list of every type
    name='Number of Pokemon',
)

layout = go.Layout(
    title='Types',
    xaxis=dict(title='Type'),
    yaxis=dict(title='Number of Pokemon'),
)

fig = go.Figure(data=[data], layout=layout)
iplot(fig, filename='Types')

### Box plot

In [None]:
# One box trace per stat column, each drawn in its own colour
box_specs = [
    ('Speed', 'rgb(12, 12, 140)'),
    ('HP', 'rgb(12, 128, 128)'),
]
trace1, trace2 = [
    go.Box(y=df[column], name=column, marker=dict(color=shade))
    for column, shade in box_specs
]

# Draw both boxes side by side
data = [trace1, trace2]
iplot(data)

### Scatter plot

In [None]:
# Re-read the csv so the plot starts from the data as stored on disk
df = pd.read_csv('pokemon_data.csv')

# Create the "Total" strength column (again): the sum of the six stats
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']

# Marker colour for each generation that is plotted
generation_colors = {
    1: 'rgba(83, 37, 85, 1)',
    2: 'rgba(168, 0, 0, 1)',
    3: 'rgba(35, 117, 0, 1)',
    4: 'rgba(5, 84, 133, 1)',
    5: 'rgba(7, 181, 187, 1)',
}

# One scatter trace per generation: HP on the x axis, Total on the y axis,
# with the pokemon's name as hover text
data = []
for generation, color in generation_colors.items():
    pk = df[df['Generation'] == generation]
    data.append(
        go.Scatter(
            x=pk['HP'],
            y=pk['Total'],
            mode="markers",
            name=f"Generation {generation}",
            marker=dict(color=color),
            text=pk['Name'],
        )
    )

# The title and x axis label describe what is actually plotted (HP
# against Total for every pokemon of generations 1 to 5)
layout = dict(
    title='Total Power vs HP of Pokemons with gen1,gen2,gen3,gen4 and gen5',
    xaxis=dict(title='HP', ticklen=5, zeroline=False),
    yaxis=dict(title='Total_Power', ticklen=5, zeroline=False),
)
fig = dict(data=data, layout=layout)
iplot(fig)

In [None]:
# 'Total' values of the generation 1 and generation 2 pokemon
pk1 = df.loc[df['Generation'] == 1, 'Total']
pk2 = df.loc[df['Generation'] == 2, 'Total']

# One semi-transparent histogram trace per generation
trace1 = go.Histogram(
    x=pk1,
    name="Pokemon Total Power(Generation=1)",
    opacity=0.75,
    marker={'color': 'rgba(171, 50, 96, 0.6)'},
)
trace2 = go.Histogram(
    x=pk2,
    name="Pokemon Total Power(Generation=2)",
    opacity=0.75,
    marker={'color': 'rgba(12, 50, 196, 0.6)'},
)

layout = go.Layout(
    title='Pokemon Total Power by Generation 1',
    xaxis={'title': 'Pokemon Total Power'},
    yaxis={'title': 'Count'},
    barmode='overlay',
)

# Single histogram: only the generation 1 trace is drawn here
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

In [None]:
# Same layout as the single histogram, with a title covering both generations
layout = go.Layout(
    title='Pokemon Total Power by Generation 1 and Generation 2',
    xaxis={'title': 'Pokemon Total Power'},
    yaxis={'title': 'Count'},
    barmode='overlay',
)

# Double histogram: both traces are drawn on top of each other
both_generations = [trace1, trace2]
fig = go.Figure(data=both_generations, layout=layout)
iplot(fig)

In [None]:
import plotly.figure_factory as ff

# A distplot is a histogram fitted with a KDE.
# NOTE(review): plotly's create_distplot appears to need scipy for the
# KDE curve - install it if this cell raises an ImportError.
hp_values = [df['HP']]
hp_labels = ['HP']
hp_distplot = ff.create_distplot(hp_values, hp_labels, bin_size=5)
iplot(hp_distplot, filename='HP Distplot')

In [None]:
# Visualizing with a radar chart.
def polar_pokemon_stats(pk_name):
    """Build a radar-chart (Scatterpolar) trace of one pokemon's six stats.

    Looks the pokemon up by name in the notebook-level dataframe ``df``
    and uses the first matching row. The trace is coloured with the
    ``colors`` entry for the pokemon's 'Type 1'.

    Args:
        pk_name: Value of the 'Name' column identifying the pokemon.

    Returns:
        A ``go.Scatterpolar`` trace with one point per stat.

    Raises:
        IndexError: If no row in ``df`` has that name. The exception type
            is the one the lookup raised before; only the message is new.
    """
    matches = df[df.Name == pk_name]
    if matches.empty:
        raise IndexError(
            f"No pokemon named {pk_name!r} found in the dataframe"
        )
    pkmn = matches.iloc[0]

    # 'HP' is repeated at the end so the outline closes back on its
    # starting point.
    stat_axes = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'HP']

    return go.Scatterpolar(
        r=[pkmn[stat] for stat in stat_axes],
        theta=stat_axes,
        fill='toself',
        marker=dict(color=colors[pkmn['Type 1']]),
        name=pkmn['Name'],
    )

# Names of the two pokemon whose stats are compared
pkmn_1_name = 'Kyogre'
pkmn_2_name = 'Entei'

# One radar trace per pokemon
pkmn_1 = polar_pokemon_stats(pkmn_1_name)
pkmn_2 = polar_pokemon_stats(pkmn_2_name)

# Shared layout: a visible radial axis fixed to the range 0-250, a legend,
# and a title naming both pokemon
radial_axis = dict(visible=True, range=[0, 250])
layout = go.Layout(
    polar=dict(radialaxis=radial_axis),
    showlegend=True,
    title="{} vs. {}".format(pkmn_1_name, pkmn_2_name),
)

compare_2_pokemon = go.Figure(data=[pkmn_1, pkmn_2], layout=layout)
iplot(compare_2_pokemon, filename='Compare 2 Pokemon')