In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing


In [None]:
import matplotlib.pyplot as plt
# Read the Pokemon CSV (path is relative to the notebook) into the DataFrame
# `data`, which the following cells work on.
data = pd.read_csv("../input/Pokemon.csv") # Import dataset
data.head(5) # preview the first five rows

# **Matplotlib**

**This section covers three Matplotlib plot types**
* Line Plot
* Histogram
* Scatter Plot

Let's explore the HP, Attack and Defense columns with a Line Plot, a Scatter Plot and a Histogram.

In [None]:
# Draw HP and Attack as two lines on the same axes.
# Series.plot(kind='line') uses the DataFrame index for the x-axis and the
# column values for the y-axis, so the axes are labelled with what they
# actually show (not 'HP' vs 'Attack', which would describe a scatter plot).
data.HP.plot(kind='line', grid=True, label='HP', color='red', linestyle='-', alpha=0.5, linewidth=1)
data.Attack.plot(kind='line', grid=True, label='Attack', color='green', linestyle='-', alpha=0.5, linewidth=1)
plt.legend(loc='upper left')  # the legend tells the two lines apart
plt.xlabel('Row index')
plt.ylabel('Stat value')
plt.title('HP and Attack Line Plot')
plt.show()

**Scatter Plot**

In [None]:
# Scatter plot of Attack against HP; labels and title are set on the Axes
# object returned by the plotting call.
scatter_axes = data.plot.scatter(x='HP', y='Attack', color='red', alpha=0.7)
scatter_axes.set_xlabel("HP")
scatter_axes.set_ylabel('Attack')
scatter_axes.set_title("Hp Attack Scatter Plot")
plt.show()

**Histogram Plot**

In [None]:
# Histogram of the Defense column: 100 bins on an 8x5 inch figure.
data.Defense.plot.hist(bins=100, figsize=(8, 5))
plt.show()

**Dictionary**

In [None]:
# Build a small dictionary, then show its keys and its values.
dictionary = dict(Key1="Value1", Key2="Value2", Key3="Value3")
print(dictionary.keys())
print(dictionary.values())

In [None]:
print(dictionary["Key1"]) # look up a value by its key

dictionary["Key2"] = "none" # assigning to an existing key overwrites its value
print(dictionary["Key2"])

dictionary["Key4"] = "Value4" # assigning to a new key inserts the pair
print(dictionary["Key4"])

dictionary.clear() # removes every entry; the (now empty) dictionary still exists
print(dictionary)

# **Pandas**

**Series and Data Frames**

In [None]:
# Single brackets select one column as a Series; double brackets (a list of
# column names) select a one-column DataFrame. The printed types show this.
series = data["Attack"] # data["Attack"] = Series
data_frame = data[["Attack"]] # data[["Attack"]] = Data_Frame
print(type(series),"\n",type(data_frame))

**Filtering Dataset with Pandas**

In [None]:
# Preview the first five rows before filtering.
data.head(5)

In [None]:
# Boolean mask: True for rows whose Attack exceeds 170.
filtering = data["Attack"].gt(170)
# Keep only the rows where the mask is True.
data.loc[filtering]

In [None]:
# Two equivalent ways of combining boolean filters (element-wise AND).
# Only the last expression of a cell is displayed, so the output shown comes
# from the np.logical_and form; the first line's result is discarded.
data[(data["Attack"]>150) & (data["HP"]>100)]
data[np.logical_and(data["Attack"]>150, data["HP"]>100)] # Both are same thing

# **Loops**

In [None]:
list = [1,2,3,4,5]
for i in list:
    print(i)

In [None]:
for index,value in enumerate(list): # We use enumerate method for print both index and value
    print(index,":",value)

In [None]:
# dict.items() yields (key, value) pairs, unpacked here into two loop variables.
dictionary = {"Key1":"Value1","Key2":"Value2","Key3":"Value3"}
for key,value in dictionary.items():
    print(key,":",value)

In [None]:
# iterrows() yields (index label, row as a Series) pairs; the [0:3] slice limits
# the loop to the first three rows of the one-column frame.
for index,value in data[["Attack"]][0:3].iterrows():
    print(index, " : ",value)

# **Python Data Science Toolbox**

**User Defined Function**

* Sometimes functions can return more than one value. For example:

In [None]:
def udf():
    """Return three values at once by packing them into a tuple."""
    return 1, 2, 3


# Tuple unpacking: one name per returned element.
a, b, c = udf()
print(a, b, c)

**Scope**

In [None]:
x = 3 # Global variable
def scope():
    """Return the local x; the assignment here does not touch the global x."""
    x = 1 # Local variable
    return x
print(x)
print(scope())
print("")
# Global variables can be read from everywhere, but local variables are only accessible inside the function that defines them

# If there is no local variable with that name, the global variable is used. For example:
y = 3
def scope2():
    """Return three times the global y (no local y exists)."""
    z = 3*y # y variable is global variable
    return z
print(scope2())

# Extra information: Python lets you reuse built-in names (list, dict, sum, ...) as variable names,
# but doing so hides the built-in for the rest of the session, so avoid it.
# To see the names in the built-in scope:
import builtins
dir(builtins) 

**Nested Function**

* A nested function is a function defined inside another function.

In [None]:
def sqrt():
    """Return the square root of the sum produced by the nested helper."""
    def total():
        """Add the two fixed operands 10 and 6."""
        first, second = 10, 6
        return first + second

    summed = total()
    return summed ** 0.5  # raising to the power 0.5 takes the square root

print(sqrt())

**Default And Flexible Arguments**

* We can define a default value for the parameter that the function takes. For example:

In [None]:
# Default Arguments:
def f(x, y=2):
    """Return x + y; y falls back to 2 when the caller omits it."""
    total = x + y
    return total

print(f(1))     # y omitted, so the default 2 is used
print(f(1, 4))  # an explicit second argument overrides the default

In [None]:
# Flexible Arguments
# *args collects any number of positional arguments into a tuple. For example:
def f(*args):
    """Print every positional argument on its own line."""
    for position in range(len(args)):
        print(args[position])

f(1)
f(1, 2, 3)
print("")

# **kwargs collects any number of keyword arguments into a dictionary. For example:
def g(**kwargs):
    """Print every keyword argument as "name : value"."""
    for key in kwargs:
        print(key, ":", kwargs[key])

g(key1="value1", key2="value2", key3="value3")

**Lambda Function**

In [None]:
# A lambda expression defines a small function in a single line. For example:
f = lambda value: value ** 2  # one parameter; returns its square
print(f(3))

g = lambda left, right: left + right  # two parameters; returns their sum
print(g(1, 3))

**Anonymous Function**

* An anonymous (lambda) function combined with `map()` lets us send multiple values to a function that takes one value.

In [None]:
# map(function, values) applies the function to every element of the iterable `values`
y = list(map(lambda x:x**2,[1,2,3])) # map returns a lazy iterator, so list(...) is used to collect the results
print(y)
# NOTE: list(...) only works while the name `list` still refers to the built-in type.
# If a variable called `list` has been assigned anywhere in the session, this line
# raises "TypeError: 'list' object is not callable".

**Iterators**

* An iterable is an object that can be looped over. For example, lists and dictionaries are iterable objects.
* An iterator produces the next value each time `next()` is called on it.

In [None]:
x = [1,2,3,4,5,6]
it = iter(x) # iter() returns an iterator over the list
print(next(it)) # next() consumes and returns the following element
print(next(it))
print(*it) # unpacking consumes whatever elements are left
print("")

string = "asdfg"
it2 = iter(string) # strings are iterable too, one character at a time
print(next(it2)) # Print next iteration
print(*it2) # Print remaining iteration

# All objects that we can use with loops are iterable objects

**Zip**

* We use `zip` to pair up the elements of two lists position by position. For example:

In [None]:
# Zip
list1 = [1,2,3,4]
list2 = [5,6,7,8] # zip stops at the shorter input, so equal lengths keep every element
f = list(zip(list1,list2)) # zip returns an iterator; list(...) collects the pairs

# Unzip
unzip = zip(*f) # unpacking the pairs into zip regroups them by position
unlist1,unlist2 = list(unzip) # each regrouped sequence comes back as a tuple
print(unlist1)
print(unlist2)
# NOTE: list(...) only works while the name `list` still refers to the built-in type.
# If a variable called `list` has been assigned anywhere in the session, these calls
# raise "TypeError: 'list' object is not callable".

**List Comprehension**

* List comprehensions are used for creating new lists from other iterables. For example:

In [None]:
list1 = [1,2,3,4] # Lists are iterable object
f = [i**2 for i in list1] # square of every element
print(f)
print("")

# Conditional List Comprehension
list2 = [10,15,20,25]
# Chained conditional expression: keep multiples of 10, square 15, otherwise add 3.
g = [i if i%10==0 else i**2 if i==15 else i+3 for i in list2]
print(g)
# The same branching written as a loop (it prints each result instead of collecting them in a list):
# def g(*args):
#     for i in args:
#         if(i % 10 == 0):
#             print(i)
#         elif(i == 15):
#             print(i**2)
#         else:
#             print(i+3)
# g(10,15,20,25)

Let's try to use List Comprehension with Pandas

In [None]:
avg = sum(data.Attack)/len(data.Attack) # We found the average attack
# Label every row "High" when its Attack is above the average and "Low" otherwise;
# the labels are stored in a new column, so this cell modifies `data`.
data["Avg_Attack"] = ["High" if i > avg else "Low" for i in data.Attack]
print(avg)
data.head(5)

# **Cleaning Data**

* The data we obtain may not always be clean. In these cases, we need to pre-process the data

### Exploratory Data Analysis
value_counts(): Frequency counts
<br>outliers: the value that is considerably higher or lower from rest of the data
* Lets say value at 75% is Q3 and value at 25% is Q1. 
* Outliers are values smaller than Q1 - 1.5·IQR or bigger than Q3 + 1.5·IQR, where IQR = Q3 - Q1.
<br>We will use describe() method. Describe method includes:
* count: number of entries
* mean: average of entries
* std: standard deviation
* min: minimum entry
* 25%: first quartile
* 50%: median or second quartile
* 75%: third quartile
* max: maximum entry

<br> What is quantile?

* 1,4,5,6,8,9,11,12,13,14,15,16,17
* The median is the number that is in **middle** of the sequence. In this case it would be 11.

* The lower quartile is the median in between the smallest number and the median i.e. in between 1 and 11, which is 6.
* The upper quartile, you find the median between the median and the largest number i.e. between 11 and 17, which will be 14 according to the question above.

In [None]:
# For example lets look at the frequency of pokemon types
print(data['Type 1'].value_counts(dropna =False))  # dropna=False also counts NaN values, if there are any
# As it can be seen below there are 112 water pokemon or 70 grass pokemon

In [None]:
data.describe() # summary statistics (count, mean, std, min, quartiles, max); null entries are ignored

### Visual Exploratory Data Analysis
* Box plots: visualize basic statistics like outliers, min/max or quantiles

In [None]:
# For example: compare attack of pokemons that are legendary  or not
# One box is drawn per value of the 'Legendary' column.
# Black line at top is max
# Blue line at top is 75%
# Red line is median (50%)
# Blue line at bottom is 25%
# Black line at bottom is min
# There are no outliers
# NOTE(review): the colours and the outlier remark describe the output of the original run -- confirm against the rendered figure.
data.boxplot(column='Attack',by = 'Legendary')
plt.show()

### Tidy Data
We tidy data with melt().
Describing melt is confusing. Therefore lets make example to understand it.

In [None]:
# Firstly I create new data from pokemons data to explain melt more easily.
data_new = data.head(5) # I only take 5 rows into new data
# melt keeps 'Name' as the identifier and stacks the Attack and Defense columns
# into two columns: 'variable' (the old column name) and 'value'.
melted = pd.melt(frame = data_new,id_vars = 'Name',value_vars= ['Attack','Defense'])
melted

### Pivoting Data
Reverse of melting.

In [None]:
# Spread the 'variable' entries back into columns, one row per Name.
melted.pivot(index = 'Name', columns = 'variable',values='value')

### Concatenating Data
We can concatenate two dataframes.

In [None]:
data1 = data.head() # first five rows
data2 = data.tail() # last five rows
# axis=0 stacks the frames vertically; ignore_index=True renumbers the rows from 0.
conc_data_row = pd.concat([data1,data2],axis = 0,ignore_index = True)
conc_data_row

In [None]:
data1 = data['Attack'].head()
data2= data['Defense'].head()
conc_data_col = pd.concat([data1,data2],axis =1) # axis = 1 : places the two Series side by side as columns (axis = 0 would stack them as rows)
conc_data_col

### Data Types
There are 5 basic data types: object (string), boolean, integer, float and categorical.
<br> We can make conversion data types like from str to categorical or from int to float
<br> Why is category important: 
* make dataframe smaller in memory 
* can be utilized for analysis especially for sklearn(we will learn later)

In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# lets convert object(str) to categorical and int to float.
# Both assignments overwrite the existing column in `data`.
data["Type 1"] = data["Type 1"].astype('category')
data['Speed'] = data['Speed'].astype('float')
data.dtypes

### Missing Data And Testing With Assert
If we encounter with missing data, what we can do:
* leave as is
* drop them with dropna()
* fill missing value with fillna()
* fill missing values with test statistics like mean
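<br>Assert statement: a sanity check that raises `AssertionError` when its condition is false and does nothing when it is true; assertions can be switched off (for example with `python -O`) once testing is done.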

In [None]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
data.info()

In [None]:
# Lets check Type 2
data["Type 2"].value_counts(dropna =False) # dropna=False includes the NaN count in the result
# As you can see, there are 386 NAN value

In [None]:
# Lets drop nan values
data1=data   # also we will use data to fill missing value so I assign it to data1 variable
data1["Type 2"].dropna(inplace = True)  # inplace = True means we do not assign it to new variable. Changes automatically assigned to data
# So does it work ?

In [None]:
#  Lets check with assert statement
# Assert statement:
assert 1==1 # return nothing because it is true

In [None]:
assert  data['Type 2'].notnull().all() # returns nothing because we drop nan values

In [None]:
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the object returned by data["Type 2"] is chained assignment: it can act on a
# temporary copy and leave `data` unchanged.
data["Type 2"] = data["Type 2"].fillna('empty')

In [None]:
# Passes silently only when no NaN is left in the "Type 2" column of `data`.
assert  data['Type 2'].notnull().all() # returns nothing because we do not have nan values

In [None]:
# With assert statements we can check many other things. For example:
# assert data.columns[1] == 'Name'
# assert data.Speed.dtypes == float   # Speed was converted to float above; the old `np.int` alias no longer exists in NumPy

# **Pandas Foundation**

### Review of Pandas
As you may have noticed, I do not introduce every idea at once. We have learned some pandas basics; now we will go deeper.
* single column = series
* NaN = not a number
* dataframe.values = numpy

### Building Data Frames From Scratch
* We can build data frames from csv as we did earlier.
* Also we can build dataframe from dictionaries
    * zip() function: returns an iterator of tuples, where the i-th tuple contains the i-th element from each of the argument sequences or iterables (wrap it in `list()` or `dict()` to collect the result).
* Adding new column
* Broadcasting: Create new column and assign a value to entire column

In [None]:
# Build a DataFrame from a dictionary that maps column labels to value lists.
country = ["Spain", "France"]
population = ["11", "12"]
list_label = ["country", "population"]
list_col = [country, population]
# Pair every label with its column, then turn the pairs into a dictionary.
zipped = [(label, column) for label, column in zip(list_label, list_col)]
data_dict = {label: column for label, column in zipped}
df = pd.DataFrame(data=data_dict)
df

In [None]:
df["income"] = 0 # broadcasting: the scalar 0 is assigned to every row of the new column

### Visual Exploratory Data Analysis
* Plot
* Subplot
* Histogram:
    * bins: number of bins
    * range(tuple): min and max values of bins
    * density(boolean; called `normed` in old matplotlib versions): normalize or not
    * cumulative(boolean): compute cumulative distribution

In [None]:
# Keep only the three stat columns and draw them as lines on one set of axes.
stat_columns = ["Attack", "Defense", "Speed"]
data1 = data[stat_columns]
data1.plot()
plt.show()

In [None]:
# subplots=True draws each column on its own axes instead of sharing one.
data1.plot(subplots = True)
plt.show()

In [None]:
# Scatter plot of Defense against Attack.
data1.plot.scatter(x="Attack", y="Defense")
plt.show()

In [None]:
# Normalised histogram of Defense: 50 bins spanning 0 to 250.
# `density=True` is the current name of the normalisation switch; the older
# `normed` keyword has been removed from matplotlib and raises an error.
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True)
plt.show()

In [None]:
# histogram subplot with non cumulative and cumulative
fig, axes = plt.subplots(nrows=2, ncols=1)
# `density=True` replaces the removed `normed` keyword.
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True, ax=axes[0])
data1.plot(kind="hist", y="Defense", bins=50, range=(0, 250), density=True, ax=axes[1], cumulative=True)
plt.savefig('graph.png')  # save before show(), while the figure is still current
plt.show()  # render the figure (a bare `plt` would only display the module's repr)

In [None]:
# Preview the frame again (first five rows by default).
data.head()

### Indexing Pandas Time Series
* datetime = object
* parse_dates(boolean): Transform date to ISO 8601 (yyyy-mm-dd hh:mm:ss ) format

In [None]:
time_list = ["1992-03-08","1992-04-12"] # dates as plain strings
datetime_object = pd.to_datetime(time_list) # parse the strings into pandas datetime values
print(type(datetime_object)) # shows the type of the parsed result

In [None]:
# In order to practice lets take head of pokemon data and add it a time list
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)
# .copy() gives data2 its own data, so adding a column neither touches `data`
# nor triggers SettingWithCopyWarning. That removes the need to silence every
# warning for the rest of the notebook with warnings.filterwarnings("ignore").
data2 = data.head().copy()
data2["date"] = datetime_object
# lets make date as index
data2 = data2.set_index("date")
data2

In [None]:
print(data2.loc["1993-03-16"]) # select by a single date label
print(data2.loc["1992-03-10":"1993-03-16"]) # label slice between two dates (both ends included)

### Resampling Pandas Time Series
* Resampling: statistical method over different time intervals
    * Needs a string (or offset object) to specify the frequency, like "M" = month end or "A" = year end (pandas 2.2+ spells these "ME" and "YE")
* Downsampling: reduce date time rows to slower frequency like from daily to weekly
* Upsampling: increase date time rows to faster frequency like from daily to hourly
* Interpolate: Interpolate values according to different methods like 'linear', 'time' or 'index'
    * https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.interpolate.html

In [None]:
# Yearly mean of the numeric (and boolean) columns.
# - Text/categorical columns are dropped first: current pandas raises on mean()
#   of such columns instead of silently skipping them.
# - pd.offsets.YearEnd() is the offset behind the "A" alias; the object form
#   works both before and after pandas renamed that alias.
data2.select_dtypes(include=["number", "bool"]).resample(pd.offsets.YearEnd()).mean()

In [None]:
# Monthly mean of the numeric (and boolean) columns; months without rows give NaN.
# - Text/categorical columns are dropped first: current pandas raises on mean()
#   of such columns instead of silently skipping them.
# - pd.offsets.MonthEnd() is the offset behind the "M" alias; the object form
#   works both before and after pandas renamed that alias.
data2.select_dtypes(include=["number", "bool"]).resample(pd.offsets.MonthEnd()).mean()

In [None]:
# With real data (not hand-made like data2) the empty months can be filled by interpolation.
# Here we take the first value of every month and interpolate linearly between them.
# Only numeric columns are kept: linear interpolation is undefined for text,
# categorical or boolean-with-gaps columns and current pandas rejects them.
# pd.offsets.MonthEnd() is the offset behind the "M" alias.
data2.select_dtypes(include="number").resample(pd.offsets.MonthEnd()).first().interpolate("linear")

In [None]:
# Monthly means, with the empty months filled by linear interpolation.
# Text/categorical columns are dropped first because current pandas raises on
# mean() of them; pd.offsets.MonthEnd() is the offset behind the "M" alias.
data2.select_dtypes(include=["number", "bool"]).resample(pd.offsets.MonthEnd()).mean().interpolate("linear")

# Manipulating Data Frames With Pandas

### Indexing Data Frames
* Indexing using square brackets
* Using column attribute and row label
* Using loc accessor
* Selecting only some columns

In [None]:
# Reload the dataset, using the "#" column as the index while reading.
data = pd.read_csv('../input/Pokemon.csv', index_col="#")
data.head()

In [None]:
# HP of the row labelled 1, selected with a single .loc lookup (row label, column name).
data.loc[1, "HP"]

In [None]:
# .loc with a row label and a list of column names; the list form keeps the column label in the result.
data.loc[1,["HP"]]

In [None]:
# A list of column names selects just those columns as a DataFrame.
data[["HP","Attack"]]

### Slicing Data Frame
* Difference between selecting columns
    * Series and data frames
* Slicing and indexing series
* Reverse slicing 
* From something to end

In [None]:
print(type(data["HP"])) # single brackets -> Series
print(type(data[["HP"]])) # double brackets -> DataFrame

In [None]:
# Rows labelled 1 through 5 (label slices include the end) and two named columns.
data.loc[1:5,["HP","Defense"]]

In [None]:
# From something to end: rows labelled 1 through 10, columns from "Speed" to the last column
data.loc[1:10,"Speed":]

### Filtering Data Frames
* Creating boolean series
* Combining filters
* Filtering a column based on other columns

In [None]:
# HP values of the rows whose Speed is below 15, selected in one .loc call
# (boolean row mask, column name).
data.loc[data["Speed"] < 15, "HP"]

### Transforming Data
* Plain python functions
* Lambda function: to apply arbitrary python function to every element
* Defining column using other columns

In [None]:
# Plain python functions
def div(n):
    """Return half of n."""
    half = n / 2
    return half

# apply() calls div once per HP value and returns the results as a new Series.
data["HP"].apply(div)

In [None]:
# Same halving as above, written inline as a lambda.
data["HP"].apply(lambda n : n/2)

In [None]:
# Define a new column from existing ones: element-wise sum of Attack and Defense.
data["total_power"] = data.Attack + data.Defense
data.head()

### Index Object And Labeled Data
index: sequence of label

In [None]:
print(data.index.name) # current name of the index
# Let's change it
data.index.name = "index_name"
print(data.index.name) # now prints the new name

In [None]:
# Work on a copy so that `data` keeps its own index.
data3 = data.copy()
# Number the rows starting from 100. The range is sized from the frame itself;
# a hard-coded range(100, 900) raises a length-mismatch error unless the
# dataset has exactly 800 rows.
data3.index = range(100, 100 + len(data3))
data3.head()

In [None]:
# We can make one of the columns the index. This was done at the beginning of the
# "Manipulating Data Frames With Pandas" section, like this:
# data = data.set_index("#")
# Alternatively, the index can be assigned directly:
# data.index = data["#"]

### Hierarchical Indexing
* Setting indexing

In [None]:
# lets read data frame one more time to start from beginning
# (this replaces `data`, discarding the columns and index changes made above)
data = pd.read_csv('../input/Pokemon.csv')
data.head()
# As you can see there is index. However we want to set one or more column to be index

In [None]:
# Passing a list of columns to set_index builds a two-level (hierarchical) index:
# "Type 1" is the outer level, "Type 2" the inner level.
data1 = data.set_index(["Type 1","Type 2"])
data1.head(20)

### Pivoting Data Frames
* pivoting: reshape tool

In [None]:
# Small hand-made frame (4 rows) used by the pivot / stack / melt / groupby examples below.
dic = {"treatment":["A","A","B","B"],"gender":["F","M","F","M"],"response":[10,45,5,9],"age":[15,4,72,65]}
df = pd.DataFrame(dic)
df

In [None]:
# One row per treatment, one column per gender, cells filled with the response values.
df.pivot(index="treatment",columns = "gender",values="response")

### Stacking and Unstacking DataFrame
* deal with multi label indexes
* level: position of unstacked index
* swaplevel: change inner and outer level index position

In [None]:
# Two-level index: treatment (level 0) and gender (level 1).
df1 = df.set_index(["treatment","gender"])
df1

In [None]:
# `level` selects which index level is moved into the columns; level=0 is treatment here
df1.unstack(level=0)

In [None]:
# Exchange the two index levels: gender becomes the outer level, treatment the inner one.
df2 = df1.swaplevel(0,1)
df2

### Melting Data Frames
* Reverse of pivoting

In [None]:
# Show the original (wide) frame before melting it.
df

In [None]:
# Keep treatment as the identifier and stack the age and response columns into variable/value pairs.
pd.melt(df,id_vars = "treatment",value_vars = ["age","response"])

### Categoricals and Groupby

In [None]:
# Show the frame again before grouping it.
df

In [None]:
# Mean of every numeric column per treatment group.
# numeric_only=True skips the text column `gender`; without it current pandas
# raises a TypeError when it tries to average strings.
df.groupby("treatment").mean(numeric_only=True)

In [None]:
# Mean of the selected columns per treatment group.
# Several columns must be selected with a list (double brackets); the bare
# `["age","response"]` form is a tuple key, which pandas no longer accepts.
df.groupby("treatment")[["age", "response"]].mean()