In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

Our aim is to explore the foundations of pandas and how to manipulate DataFrames with it.

In [None]:
# Read the dataset from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv'
data = pd.read_csv(csv_path)
data.head(n=5)  # preview the first 5 rows

Let's build data frames from scratch
* We can build data frames from csv as we did above.
* Also we can build dataframe from dictionaries
* We can use the zip() function, which returns tuples where the i-th tuple contains the i-th element from each of the argument sequences or iterables; wrapping it in list() gives a list of tuples.
* Adding new column
* Broadcasting: Create new column and assign a value to entire column

In [None]:
# Two value lists, one list of column labels, and one list holding the columns.
team = ["Fenerbahce", "Galatasaray"]
team_value = ["150M", "180M"]
list_label = ["team", "team_value"]
list_col = [team, team_value]

# Pair every label with its column, then turn the pairs into a dict
# that pandas can read as {column name: column values}.
zipped = [(label, column) for label, column in zip(list_label, list_col)]
data_dict = {label: column for label, column in zipped}
df = pd.DataFrame.from_dict(data_dict)
df

We can add new columns.

In [None]:
# Append a column; the list supplies one value per existing row.
df = df.assign(Player_Number=["25", "23"])
df

In [None]:
# Broadcasting: a single scalar is repeated for every row of the new column.
df = df.assign(Expenses=100000000)
df

Let's visualise our dataset.

In [None]:
# Draw both columns as lines on one shared set of axes.
data1 = data[["suicides_no", "population"]]
data1.plot.line()
plt.show()
# Hard to read: both columns share a single y-axis
# (presumably their magnitudes differ a lot - compare with the subplots version).

In [None]:
# One panel per column, so each column gets its own y-axis.
data1.plot.line(subplots=True)
plt.show()

In [None]:
# Scatter plot: population on the x-axis against suicides_no on the y-axis.
data1.plot.scatter(x="population", y="suicides_no")
plt.show()

In [None]:
# Histogram of suicides_no: 10 bins restricted to the 0-500 range.
data1.plot(kind="hist", y="suicides_no", range=(0, 500), bins=10)
# Render explicitly, like the other plotting cells, so the cell shows only
# the figure rather than also echoing the returned Axes object.
plt.show()

In [None]:
# Two stacked panels: a plain histogram on top, its cumulative version below.
fig, axes = plt.subplots(nrows=2, ncols=1)
hist_options = dict(kind="hist", y="suicides_no", bins=50, range=(0, 500))
for panel, is_cumulative in zip(axes, (False, True)):
    data1.plot(ax=panel, cumulative=is_cumulative, **hist_options)
# Write the figure to disk before showing it.
fig.savefig('graph.png')
plt.show()

STATISTICAL EXPLORATORY DATA ANALYSIS

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

INDEXING PANDAS TIME SERIES

In [None]:
# Dates written as plain strings.
time_list = ["1985-01-01", "2016-12-31"]
second_entry = time_list[1]
print(type(second_entry))  # each entry is still a str at this point

# Convert the whole list at once; the result is a pandas datetime index
# rather than a list of strings.
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))

In [None]:
# close warning
import warnings
# NOTE(review): this blanket filter silences every warning for the rest of the
# notebook. It is no longer needed by this cell (see below) and is kept only so
# that later cells behave as before - consider removing or narrowing it.
warnings.filterwarnings("ignore")

# To practise time-series indexing, take the first rows of the dataset and
# attach one date to each of them.
date_list = ["1987-01-01","1987-02-01","1987-03-01","1987-04-01","1987-05-01"]
datetime_object = pd.to_datetime(date_list)

# assign() returns a new frame, so the slice returned by head() is never
# written to (the original column assignment on that slice is what triggered
# the chained-assignment warning). One row is taken per date so the lengths
# always match, then the new column becomes the index.
data2 = (
    data.head(len(date_list))
    .assign(date=datetime_object)
    .set_index("date")
)
data2

In [None]:
# Look rows up by date label: first a single date, then an inclusive date range.
first_date = "1987-01-01"
last_date = "1987-03-01"
print(data2.loc[first_date])
print(data2.loc[first_date:last_date])

RESAMPLING PANDAS TIME SERIES
* Resampling: statistical method over different time intervals
* Needs string to specify frequency like "M" = month or "A" = year
* Downsampling: reduce date time rows to slower frequency like from daily to weekly
* Upsampling: increase date time rows to faster frequency like from daily to hourly
* Interpolate: fill in missing values according to different methods such as 'linear', 'time' or 'index'

In [None]:
# We will use data2 that we created in the previous part.
# Yearly mean ("A" = year). Only numeric columns can be averaged, so they are
# selected first: recent pandas versions raise a TypeError when mean() meets
# text columns such as country or sex instead of silently dropping them.
# NOTE(review): newer pandas releases deprecate the "A" alias - confirm
# against the installed version.
data2.select_dtypes(include="number").resample("A").mean()

In [None]:
# Monthly mean ("M" = month). Numeric columns are selected first because
# recent pandas versions raise a TypeError when mean() meets text columns
# instead of silently dropping them.
data2.select_dtypes(include="number").resample("M").mean()

In [None]:
# With real data (unlike the hand-made dates in data2) resampling can leave
# periods without any rows; interpolate() is one way to fill such gaps.
# Here: keep the first row of every month, then fill missing values linearly.
data2.resample("M").first().interpolate(method="linear")

In [None]:
# Alternatively, interpolate the monthly means. Numeric columns are selected
# first because recent pandas versions raise a TypeError when mean() meets
# text columns instead of silently dropping them.
data2.select_dtypes(include="number").resample("M").mean().interpolate("linear")

Let's manipulate DATA FRAMES WITH PANDAS

In [None]:
# Reload the dataset and make the "year" column the row index.
data = pd.read_csv(
    '/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv'
).set_index("year")
data.head()

In [None]:
# Indexing with square brackets: first pick the "sex" column, then the
# entries whose index label (the year) is 1987.
data["sex"][1987]

In [None]:
# Same selection, reaching the column through attribute access (data.sex)
# and then the row label 1987.
data.sex[1987]

In [None]:
# Same selection with the .loc accessor: row label first, columns second.
# The column is passed as a list here rather than as a bare label.
data.loc[1987,["sex"]]

In [None]:
# Select a subset of columns by passing a list of column names.
data[["sex","age"]]

SLICING DATA FRAME

In [None]:
# Single brackets give a Series, double brackets give a DataFrame:
# print the type of each selection to see the difference.
for selection in (data["sex"], data[["sex"]]):
    print(type(selection))

In [None]:
# Slice the DataFrame by label: all rows, and every column from "sex"
# through "suicides_no" (label slices include both end points).
data.loc[:,"sex":"suicides_no"]

In [None]:
# Same column slice, with the rows taken in reverse order (step of -1).
data.loc[::-1,"sex":"suicides_no"]

FILTERING DATA FRAMES

In [None]:
# Boolean mask: True for rows whose suicides_no is greater than 100.
boolean = data["suicides_no"].gt(100)
# Keep only the rows where the mask is True.
data.loc[boolean]

In [None]:
# Two independent masks, combined element-wise with & (logical AND).
first_filter = data["suicides_no"].gt(200)
second_filter = data["population"].gt(1000000)
data.loc[first_filter & second_filter]

TRANSFORMING DATA

In [None]:
# Plain python functions
def div(n):
    """Return half of ``n`` (true division by 2)."""
    half = n / 2
    return half
# Call div once for every value of the suicides_no column.
data["suicides_no"].apply(div)

In [None]:
# The same transformation written inline as an anonymous (lambda) function.
data["suicides_no"].apply(lambda value: value / 2)

In [None]:
# Derive a new column from two existing ones: population divided by suicides_no.
# NOTE(review): any row with suicides_no == 0 gives inf here - confirm that is acceptable.
data["Ratio"] = data["population"].div(data["suicides_no"])
data.head()

In [None]:
# Show the current name of the index.
print(data.index.name)
# Give the index a new name, modifying the existing index object in place.
data.index.rename("index_name", inplace=True)
data.head()

HIERARCHICAL INDEXING

In [None]:
# Read the file again to start from an unmodified DataFrame.
csv_path = '/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv'
data = pd.read_csv(csv_path)
data.head()
# The frame has the default integer index; next, one or more columns are promoted to be the index.

In [None]:
# Build a two-level (hierarchical) index: "country" is the outer level
# and "sex" is the inner level.
index_columns = ["country", "sex"]
data1 = data.set_index(index_columns)
data1.head(100)
# Rows can then be looked up by an (outer, inner) label pair,
# e.g. data1.loc[(<country>, <sex>)]