In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool

# The input files live in the "../input/" directory.
# Running this cell prints the names of the files found there.

from subprocess import check_output

input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
# Read ../input/heart.csv into the DataFrame `data` used by the cells below.
data = pd.read_csv("../input/heart.csv")

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Pairwise correlation between the columns of `data`.
data.corr()

In [None]:
# Correlation heatmap: one annotated square per pair of columns.
f, ax = plt.subplots(figsize=(18, 18))
corr_matrix = data.corr()
sns.heatmap(
    corr_matrix,
    annot=True,       # write the coefficient inside each square
    fmt='.1f',        # one decimal place
    linewidths=.5,
    ax=ax,
)
plt.show()

In [None]:
# Line Plot
# color = line color, label = legend entry, linewidth = width of line,
# alpha = opacity, grid = draw grid, linestyle = style of line
# The label is what the legend shows, so it must name the column actually drawn
# (the labels previously said 'thalach' and 'ca' while plotting age and chol).
data.age.plot(kind = 'line', color = 'g',label = 'age',linewidth=1,alpha = 0.9,grid = True,linestyle = ':')
data.chol.plot(color = 'r',label = 'chol',linewidth=1, alpha = 0.9,grid = True,linestyle = '-.')
plt.legend(loc='upper right')     # legend = puts the labels into the plot
plt.xlabel('x axis')              # xlabel / ylabel = axis captions
plt.ylabel('y axis')
plt.title('Line Plot')            # title = title of plot
plt.show()

In [None]:
# Scatter Plot
# x = age, y = chol
data.plot(kind='scatter', x='age', y='chol',alpha = 0.5,color = 'red')
plt.xlabel('age')              # axis captions
plt.ylabel('chol')
plt.title('age chol Scatter Plot')            # title = title of plot
# Render the figure explicitly, like the other plotting cells, instead of
# leaving the Text object returned by plt.title as the cell output.
plt.show()

In [None]:
# Histogram of the chol column
# bins = number of bars in the figure
data["chol"].plot.hist(bins=100, figsize=(12, 12))
plt.show()

In [None]:
# plt.clf() clears the current figure so the next plot starts from scratch.
data["chol"].plot.hist(bins=50)
plt.clf()
# Nothing is drawn here because the figure was cleared by clf().


In [None]:
# Column labels of the frame.
data.columns

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Frequency of each exang value; dropna=False also counts missing values.
exang_counts = data['exang'].value_counts(dropna=False)
print(exang_counts)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

In [None]:
# Box plot of chol, drawn separately for each value of sex.
data.boxplot(column='chol',by = 'sex')

In [None]:
# Keep just the first five rows for the melt / pivot examples.
data_new = data.head(5)
data_new

In [None]:
# Wide -> long: oldpeak stays as the identifier, while age and chol are
# stacked into "variable" / "value" pairs.
melted = data_new.melt(id_vars='oldpeak', value_vars=['age', 'chol'])
melted

In [None]:
# Long -> wide: the reverse of melt, with one row per oldpeak and one column per variable.
# NOTE(review): pivot raises ValueError when an (oldpeak, variable) pair occurs more than
# once - confirm the oldpeak values in data_new are distinct.
melted.pivot(index = 'oldpeak', columns = 'variable',values='value')

In [None]:
# Stack the first and last five rows on top of each other.
data1, data2 = data.head(), data.tail()
conc_data_row = pd.concat(
    [data1, data2],
    axis="index",        # same as axis=0: append rows
    ignore_index=True,   # renumber the result 0..n-1
)
conc_data_row

In [None]:
# Put two columns side by side.
data1, data2 = data['age'].head(), data['chol'].head()
conc_data_col = pd.concat([data1, data2], axis="columns")  # same as axis=1: join as columns
conc_data_col

In [None]:
# dtype of every column.
data.dtypes

In [None]:
# Print the frame summary again (columns, non-null counts, dtypes).
data.info()

In [None]:
# Draw three columns as lines on one shared set of axes.
data1 = data[["oldpeak", "cp", "ca"]]
data1.plot()

In [None]:
# Same columns, but each one on its own subplot.
data1.plot.line(subplots=True)
plt.show()

In [None]:
# Normalised histogram of thalach.
# density=True replaces the `normed` keyword, which matplotlib has removed;
# passing `normed` fails on current releases.
data.plot(kind = "hist",y = "thalach",bins = 50,range= (0,250),density = True)

In [None]:
# Two stacked histograms of thalach: non-cumulative on top, cumulative below.
fig, axes = plt.subplots(nrows=2,ncols=1)
# density=True replaces the `normed` keyword, which matplotlib has removed.
data.plot(kind = "hist",y = "thalach",bins = 50,range= (0,250),density = True,ax = axes[0])
data.plot(kind = "hist",y = "thalach",bins = 50,range= (0,250),density = True,ax = axes[1],cumulative = True)
# Save first, then show: the figure is written to disk while it is still current.
plt.savefig('graph.png')
# A bare `plt` only echoed the module object; show() is what renders the figure.
plt.show()

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session. It is kept
# because other cells may rely on it, but this cell no longer needs it.
warnings.filterwarnings("ignore")
# To practise date indexing, take the first five rows of the data and attach a date to each.
# .copy() makes data2 an independent frame, so adding a column below is an
# unambiguous write and does not trigger SettingWithCopyWarning.
data2 = data.head().copy()
date_list = ["1992-01-10","1992-02-10","1992-03-10","1993-03-15","1993-03-16"]
datetime_object = pd.to_datetime(date_list)
data2["date"] = datetime_object
# make the dates the index
data2 = data2.set_index("date")
data2

In [None]:
# Look up a single date, then an inclusive range of dates, on the date index.
single_day = data2.loc["1993-03-16"]
print(single_day)
date_span = data2.loc["1992-03-10":"1993-03-16"]
print(date_span)

In [None]:
# Uses data2, the date-indexed frame built in an earlier cell.
# Resample by year ("A") and take the mean of each year.
# NOTE(review): recent pandas releases deprecate the "A" alias in favour of "YE" -
# confirm against the pandas version this notebook targets.
data2.resample("A").mean()


In [None]:
# Resample by month ("M") and take the mean of each month.
# NOTE(review): recent pandas releases deprecate the "M" alias in favour of "ME" -
# confirm against the pandas version this notebook targets.
data2.resample("M").mean()
# The result contains many NaN rows because data2 does not have a value in every month.

In [None]:
# With real data (not a hand-made frame like data2) the gaps can be filled by interpolation.
# Take the first value of each month, then fill the empty months linearly.
data2.resample("M").first().interpolate("linear")


In [None]:
data = pd.read_csv('../input/heart.csv')
# The indexing cells below address rows by 1-based labels (e.g. data.loc[1:10]).
# set_index("#") only works if the CSV really has a "#" column; when it does not,
# build the equivalent 1-based index named "#" instead of failing with KeyError.
if "#" in data.columns:
    data = data.set_index("#")
else:
    data.index = pd.RangeIndex(start=1, stop=len(data) + 1, name="#")
data.head()

In [None]:
# Indexing with square brackets: select the "chol" column, then the entry with key 1.
data["chol"][1]

In [None]:
# Same lookup, reaching the column through attribute access and then key 1.
data.chol[1]

In [None]:
# Using the loc accessor: row label 1, with the columns given as the list ["chol"].
data.loc[1,["chol"]]

In [None]:
# Selecting a subset of columns by passing a list of names.
data[["age","chol"]]

In [None]:
# Single brackets give a Series; double brackets (a list of names) give a DataFrame.
chol_series = data["chol"]
chol_frame = data[["chol"]]
print(type(chol_series))   # series
print(type(chol_frame))    # data frame

In [None]:
# Slicing rows and columns by label with loc
data.loc[1:10,"age":"chol"]   # label slices include both ends: row 10 and column "chol" are included

In [None]:
# Reverse slicing: rows from label 10 back to label 1 (step -1), columns "age" through "chol".
data.loc[10:1:-1,"age":"chol"] 

In [None]:
# Open-ended column slice: every column from "ca" to the last one.
data.loc[1:10,"ca":] 

In [None]:
# Build a boolean mask (True where age is above 65) and use it to select rows.
boolean = data["age"] > 65
data.loc[boolean]

In [None]:
# Combine two masks with & so that only rows satisfying both conditions remain.
first_filter = data["age"] > 65
second_filter = data["chol"] > 300
both_filters = first_filter & second_filter
data[both_filters]

In [None]:
# Filter one column by a condition on another:
# the ages of the rows whose cholesterol is below 150.
low_chol = data.chol < 150
data.age[low_chol]

In [None]:
# A plain Python function that can be applied to a column
def div(n):
    """Return half of ``n`` using true division."""
    half = n / 2
    return half
# Apply div to every value of the chol column.
data["chol"].apply(div)

In [None]:
# The same halving written inline as a lambda
data["chol"].apply(lambda n: n / 2)

In [None]:
# Create a new column as the element-wise sum of two existing columns.
data["denemelikdeger"] = data["cp"] + data["oldpeak"]
data.head()

In [None]:
# Print the current name of the index.
print(data.index.name)
# Change it by assigning to the name attribute of data's index.
data.index.name = "index_name"
data.head()

In [None]:
# Overwrite index
# An index is replaced wholesale: the new one must supply a label for every row.
# Work on a copy (data3) so that `data` keeps its own index.
data3 = data.copy()
# Let the labels start at 100 - not a remarkable change, just an example.
# The range is sized from the frame itself; the hard-coded end value 403 only
# fitted a frame with exactly 303 rows and raised a length-mismatch error otherwise.
data3.index = range(100, 100 + len(data3))
data3.head()

In [None]:
# Hierarchical index: "sex" becomes the outer level and "restecg" the inner level.
data = pd.read_csv("../input/heart.csv")
index_levels = ["sex", "restecg"]
data4 = data.set_index(index_levels)
data4.head(100)

In [None]:
# Small hand-made frame: four rows, two string columns and two integer columns.
dic = {
    "treatment": ["A", "A", "B", "B"],
    "gender": ["F", "M", "F", "M"],
    "response": [10, 45, 5, 9],
    "age": [15, 4, 72, 65],
}
df = pd.DataFrame(data=dic)
df

In [None]:
# Pivoting: one row per treatment, one column per gender, cells filled with response.
df.pivot(index="treatment",columns = "gender",values="response")

In [None]:
# Two-level index: treatment is the outer level (0), gender the inner level (1).
df1 = df.set_index(["treatment","gender"])
df1
# The following cells unstack this frame.

In [None]:
# `level` picks which index level is moved into the columns;
# level=0 is treatment, the outer level of df1.
df1.unstack(level=0)

In [None]:
# level=1 moves gender, the inner index level of df1, into the columns.
df1.unstack(level=1)

In [None]:
# Swap the positions of the inner and outer index levels.
df2 = df1.swaplevel(0,1)
df2

In [None]:
# Melt df into long form: treatment stays as the identifier, while age and
# response are stacked into "variable" / "value" pairs.
df.melt(id_vars="treatment", value_vars=["age", "response"])

In [None]:
# Show df again; the groupby examples that follow operate on it.
df

In [None]:
# Mean of the numeric features within each treatment group.
# mean is an aggregation / reduction method.
# numeric_only=True leaves out the string column "gender"; without it, recent
# pandas versions raise TypeError instead of silently dropping that column.
df.groupby("treatment").mean(numeric_only=True)
# other reductions include sum, std, max and min

In [None]:
# Aggregate a single feature: the maximum age within each treatment group.
df.groupby("treatment")["age"].max()

In [None]:
# Aggregate several features at once: the minimum age and response per treatment group.
df.groupby("treatment")[["age","response"]].min() 

In [None]:
df.info()
# info() lists the dtype of every column; here gender is reported as object.
# Columns used for grouping can be converted to the categorical dtype,
# because categorical data uses less memory and speeds up operations like groupby.
# The conversion is left commented out below:
#df["gender"] = df["gender"].astype("category")
#df["treatment"] = df["treatment"].astype("category")
#df.info()