In [None]:
# What is Pandas and How does it work ?
# Pandas is an open source Python library that allows users to explore, manipulate and visualise data in an extremely efficient manner. It is literally Microsoft Excel in Python.

!conda install pandas
!python3 -m pip install pandas


In [1]:
import pandas as pd
import numpy as np

# Dataframes and Series
# Series are like columns while Dataframes are your full blown tables in Pandas.

In [35]:
l = [100, 200, 300]
index = [1, 2, 3] # [0,1,2]
pd.Series(data=l, index=index)

dictionary = {'a': 100, 'b': 200, 'c': 300}
data = pd.Series(data=dictionary)

data.keys()
data.values

# accessing series element
data[-1]
data['b']
data.loc['b'] # harus sesuai key atau index
data.iloc[1]
data.iloc[1:3]

b    200
c    300
dtype: int64

In [24]:
# creating dataframe
# from list
data = [['thomas', 100], ['nicholas', 200], ['danson', 300]] 
df = pd.DataFrame(data, columns = ['Name', 'Age'])
df
# from dictionary
data = {'Name':['thomas', 'nicholas', 'danson', 'jack'], 'Age':[100, 200, 300, 400]} 
df = pd.DataFrame(data)
df

df.index
df.values
df.axes[1]

# Usually, we don’t create our own dataframes. Instead, we read explore, manipulate and visualise data in Pandas by importing data to a dataframe.
# Pandas can read from multiple formats, but the usual one is csv.
# Here’s the official list of file types pandas can read from.


Index(['Name', 'Age'], dtype='object')

In [12]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://miro.medium.com/max/2200/1*dOEhkrAfRXdskQPhrHrnvg.png")


In [115]:
df = pd.read_csv('./train.csv')
df
# df.info()
df.nunique() # explain here
# df[[‘Survived’,’Pclass’,’Sex’,’Age’,’SibSp’,’Parch’,’Fare’,’Cabin’,’Embarked’]]

df.drop(['PassengerId', 'Ticket'], axis = 1, inplace=True) # The inplace=True parameter tells Pandas to auto assign what you intend to do to the original variable itself, in this case it is df.
df

# df.loc[1:2]
# df.iloc[0]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C148,C


In [51]:
# Join dataframe from series
city_revenues = pd.Series([4200, 8000, 6500], index=['Armsterdam', 'Toronto', 'Tokyo'])
city_revenues
city_employee_count = pd.Series({ 'Armsterdam': 5, 'Tokyo': 8 })
city_employee_count
city_data = pd.DataFrame({ 'revenue': city_revenues, 'employee_count': city_employee_count })
city_data


Unnamed: 0,revenue,employee_count
Armsterdam,4200,5.0
Tokyo,6500,8.0
Toronto,8000,


In [95]:
# querying & conditional
df[df['Sex'] == 'male']
df[['Survived', 'Pclass', 'Sex']][df['Sex'] == 'male']
df[['Survived', 'Pclass', 'Sex']][df['Sex'] == 'male']['Survived'].count()
df[['Survived', 'Pclass', 'Sex']][(df['Sex'] == 'male') & (df['Survived'] == 1) & (df['Age'] > 50)].value_counts()
# df[['Survived', 'Pclass', 'Sex', 'Cabin']][(df['Sex'] == 'female') & (df['Cabin'].notnull())]
# aggregate
# df.describe()
# df.max()
# df.median()
# df.count()
# help(df.std)
# df.std() # The Standard Deviation is a measure for differences in a dataset. In statistics differences or deviations are called variances or variations.

# data cleaning
# Your dataset can often include dirty data like:
# null values
# empty values
# incorrect timestamp
# many many more
df.isnull().sum()
df[df['Age'].isnull()]
df.dropna(inplace=True)
df.dropna(axis=1, inplace=True) # drop columns
df[‘Age’].fillna(df[‘Age’].mean())
df[‘Age’].fillna(df[‘Age’].mean())


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,0,3,male,,0,0,8.4583,,Q
17,1,2,male,,0,0,13.0000,,S
19,1,3,female,,0,0,7.2250,,C
26,0,3,male,,0,0,7.2250,,C
28,1,3,female,,0,0,7.8792,,Q
...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,,C
863,0,3,female,,8,2,69.5500,,S
868,0,3,male,,0,0,9.5000,,S
878,0,3,male,,0,0,7.8958,,S


In [117]:
# group by
df.groupby('Pclass').mean()
# or
df.groupby('Pclass')['Age'].mean()
df.groupby(['Pclass','Sex']).mean()
#concat

first_5 = df.head()
last_5 = df[178:]
combined = pd.concat([first_5,last_5], axis = 0)

# merge

data = [['Braund, Mr. Owen Harris', 80, 177.0], ['Heikkinen, Miss. Laina', 78, 180.0], ['Montvila, Rev. Juozas', 87, 165.0]] 
df2 = pd.DataFrame(data, columns = ['Name', 'weight', 'height'])
# df3 = pd.merge(df,df2, how='right', on='Name')
df3 = pd.merge(df, df2, how='right', on='Name')

def pclass_name(x):
    if x == 1:
        x = '1st Class'
    if x == 2:
        x = '2nd Class'
    if x == 3:
        x = '3rd Class'
    return x
df3['Pclass'] = df3['Pclass'].apply(lambda x: pclass_name(x))

df3

# apply function

# custom columns
# exports

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,weight,height
0,0,3rd Class,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,80,177.0
1,1,3rd Class,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,78,180.0
2,0,2nd Class,"Montvila, Rev. Juozas",male,27.0,0,0,13.0,,S,87,165.0
